summaryrefslogtreecommitdiff
path: root/drivers/block/drbd
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/block/drbd')
-rw-r--r--drivers/block/drbd/Kconfig73
-rw-r--r--drivers/block/drbd/Makefile8
-rw-r--r--drivers/block/drbd/drbd_actlog.c1219
-rw-r--r--drivers/block/drbd/drbd_bitmap.c1648
-rw-r--r--drivers/block/drbd/drbd_debugfs.c958
-rw-r--r--drivers/block/drbd/drbd_debugfs.h39
-rw-r--r--drivers/block/drbd/drbd_int.h2328
-rw-r--r--drivers/block/drbd/drbd_interval.c179
-rw-r--r--drivers/block/drbd/drbd_interval.h42
-rw-r--r--drivers/block/drbd/drbd_main.c3844
-rw-r--r--drivers/block/drbd/drbd_nl.c3674
-rw-r--r--drivers/block/drbd/drbd_nla.c54
-rw-r--r--drivers/block/drbd/drbd_nla.h8
-rw-r--r--drivers/block/drbd/drbd_proc.c368
-rw-r--r--drivers/block/drbd/drbd_protocol.h307
-rw-r--r--drivers/block/drbd/drbd_receiver.c5661
-rw-r--r--drivers/block/drbd/drbd_req.c1638
-rw-r--r--drivers/block/drbd/drbd_req.h351
-rw-r--r--drivers/block/drbd/drbd_state.c1918
-rw-r--r--drivers/block/drbd/drbd_state.h166
-rw-r--r--drivers/block/drbd/drbd_strings.c118
-rw-r--r--drivers/block/drbd/drbd_strings.h9
-rw-r--r--drivers/block/drbd/drbd_vli.h351
-rw-r--r--drivers/block/drbd/drbd_worker.c2156
24 files changed, 27117 insertions, 0 deletions
diff --git a/drivers/block/drbd/Kconfig b/drivers/block/drbd/Kconfig
new file mode 100644
index 000000000..7845bd6ee
--- /dev/null
+++ b/drivers/block/drbd/Kconfig
@@ -0,0 +1,73 @@
+#
+# DRBD device driver configuration
+#
+
+comment "DRBD disabled because PROC_FS or INET not selected"
+ depends on PROC_FS='n' || INET='n'
+
+config BLK_DEV_DRBD
+ tristate "DRBD Distributed Replicated Block Device support"
+ depends on PROC_FS && INET
+ select LRU_CACHE
+ select LIBCRC32C
+ default n
+ help
+
+ NOTE: In order to authenticate connections you have to select
+ CRYPTO_HMAC and a hash function as well.
+
+ DRBD is a shared-nothing, synchronously replicated block device. It
+ is designed to serve as a building block for high availability
+ clusters and in this context, is a "drop-in" replacement for shared
+ storage. Simplistically, you could see it as a network RAID 1.
+
+ Each minor device has a role, which can be 'primary' or 'secondary'.
+ On the node with the primary device the application is supposed to
+ run and to access the device (/dev/drbdX). Every write is sent to
+ the local 'lower level block device' and, across the network, to the
+ node with the device in 'secondary' state. The secondary device
+ simply writes the data to its lower level block device.
+
+ DRBD can also be used in dual-Primary mode (device writable on both
+ nodes), which means it can exhibit shared disk semantics in a
+ shared-nothing cluster. Needless to say, on top of dual-Primary
+ DRBD utilizing a cluster file system is necessary to maintain for
+ cache coherency.
+
+ For automatic failover you need a cluster manager (e.g. heartbeat).
+ See also: http://www.drbd.org/, http://www.linux-ha.org
+
+ If unsure, say N.
+
+config DRBD_FAULT_INJECTION
+ bool "DRBD fault injection"
+ depends on BLK_DEV_DRBD
+ help
+
+ Say Y here if you want to simulate IO errors, in order to test DRBD's
+ behavior.
+
+ The actual simulation of IO errors is done by writing 3 values to
+ /sys/module/drbd/parameters/
+
+ enable_faults: bitmask of...
+ 1 meta data write
+ 2 read
+ 4 resync data write
+ 8 read
+ 16 data write
+ 32 data read
+ 64 read ahead
+ 128 kmalloc of bitmap
+ 256 allocation of peer_requests
+ 512 insert data corruption on receiving side
+
+ fault_devs: bitmask of minor numbers
+ fault_rate: frequency in percent
+
+ Example: Simulate data write errors on /dev/drbd0 with a probability of 5%.
+ echo 16 > /sys/module/drbd/parameters/enable_faults
+ echo 1 > /sys/module/drbd/parameters/fault_devs
+ echo 5 > /sys/module/drbd/parameters/fault_rate
+
+ If unsure, say N.
diff --git a/drivers/block/drbd/Makefile b/drivers/block/drbd/Makefile
new file mode 100644
index 000000000..4464e353c
--- /dev/null
+++ b/drivers/block/drbd/Makefile
@@ -0,0 +1,8 @@
+drbd-y := drbd_bitmap.o drbd_proc.o
+drbd-y += drbd_worker.o drbd_receiver.o drbd_req.o drbd_actlog.o
+drbd-y += drbd_main.o drbd_strings.o drbd_nl.o
+drbd-y += drbd_interval.o drbd_state.o
+drbd-y += drbd_nla.o
+drbd-$(CONFIG_DEBUG_FS) += drbd_debugfs.o
+
+obj-$(CONFIG_BLK_DEV_DRBD) += drbd.o
diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c
new file mode 100644
index 000000000..1318e3217
--- /dev/null
+++ b/drivers/block/drbd/drbd_actlog.c
@@ -0,0 +1,1219 @@
+/*
+ drbd_actlog.c
+
+ This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+ Copyright (C) 2003-2008, LINBIT Information Technologies GmbH.
+ Copyright (C) 2003-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+ Copyright (C) 2003-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+ drbd is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2, or (at your option)
+ any later version.
+
+ drbd is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with drbd; see the file COPYING. If not, write to
+ the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+
+ */
+
+#include <linux/slab.h>
+#include <linux/crc32c.h>
+#include <linux/drbd.h>
+#include <linux/drbd_limits.h>
+#include <linux/dynamic_debug.h>
+#include "drbd_int.h"
+
+
+enum al_transaction_types {
+ AL_TR_UPDATE = 0,
+ AL_TR_INITIALIZED = 0xffff
+};
+/* all fields on disc in big endian */
+struct __packed al_transaction_on_disk {
+ /* don't we all like magic */
+ __be32 magic;
+
+ /* to identify the most recent transaction block
+ * in the on disk ring buffer */
+ __be32 tr_number;
+
+ /* checksum on the full 4k block, with this field set to 0. */
+ __be32 crc32c;
+
+ /* type of transaction, special transaction types like:
+ * purge-all, set-all-idle, set-all-active, ... to-be-defined
+ * see also enum al_transaction_types */
+ __be16 transaction_type;
+
+ /* we currently allow only a few thousand extents,
+ * so 16bit will be enough for the slot number. */
+
+ /* how many updates in this transaction */
+ __be16 n_updates;
+
+ /* maximum slot number, "al-extents" in drbd.conf speak.
+ * Having this in each transaction should make reconfiguration
+ * of that parameter easier. */
+ __be16 context_size;
+
+ /* slot number the context starts with */
+ __be16 context_start_slot_nr;
+
+ /* Some reserved bytes. Expected usage is a 64bit counter of
+ * sectors-written since device creation, and other data generation tag
+ * supporting usage */
+ __be32 __reserved[4];
+
+ /* --- 36 byte used --- */
+
+ /* Reserve space for up to AL_UPDATES_PER_TRANSACTION changes
+ * in one transaction, then use the remaining byte in the 4k block for
+ * context information. "Flexible" number of updates per transaction
+ * does not help, as we have to account for the case when all update
+ * slots are used anyways, so it would only complicate code without
+ * additional benefit.
+ */
+ __be16 update_slot_nr[AL_UPDATES_PER_TRANSACTION];
+
+ /* but the extent number is 32bit, which at an extent size of 4 MiB
+ * allows to cover device sizes of up to 2**54 Byte (16 PiB) */
+ __be32 update_extent_nr[AL_UPDATES_PER_TRANSACTION];
+
+ /* --- 420 bytes used (36 + 64*6) --- */
+
+ /* 4096 - 420 = 3676 = 919 * 4 */
+ __be32 context[AL_CONTEXT_PER_TRANSACTION];
+};
+
+void *drbd_md_get_buffer(struct drbd_device *device, const char *intent)
+{
+ int r;
+
+ wait_event(device->misc_wait,
+ (r = atomic_cmpxchg(&device->md_io.in_use, 0, 1)) == 0 ||
+ device->state.disk <= D_FAILED);
+
+ if (r)
+ return NULL;
+
+ device->md_io.current_use = intent;
+ device->md_io.start_jif = jiffies;
+ device->md_io.submit_jif = device->md_io.start_jif - 1;
+ return page_address(device->md_io.page);
+}
+
+void drbd_md_put_buffer(struct drbd_device *device)
+{
+ if (atomic_dec_and_test(&device->md_io.in_use))
+ wake_up(&device->misc_wait);
+}
+
+void wait_until_done_or_force_detached(struct drbd_device *device, struct drbd_backing_dev *bdev,
+ unsigned int *done)
+{
+ long dt;
+
+ rcu_read_lock();
+ dt = rcu_dereference(bdev->disk_conf)->disk_timeout;
+ rcu_read_unlock();
+ dt = dt * HZ / 10;
+ if (dt == 0)
+ dt = MAX_SCHEDULE_TIMEOUT;
+
+ dt = wait_event_timeout(device->misc_wait,
+ *done || test_bit(FORCE_DETACH, &device->flags), dt);
+ if (dt == 0) {
+ drbd_err(device, "meta-data IO operation timed out\n");
+ drbd_chk_io_error(device, 1, DRBD_FORCE_DETACH);
+ }
+}
+
+static int _drbd_md_sync_page_io(struct drbd_device *device,
+ struct drbd_backing_dev *bdev,
+ sector_t sector, int rw)
+{
+ struct bio *bio;
+ /* we do all our meta data IO in aligned 4k blocks. */
+ const int size = 4096;
+ int err;
+
+ device->md_io.done = 0;
+ device->md_io.error = -ENODEV;
+
+ if ((rw & WRITE) && !test_bit(MD_NO_FUA, &device->flags))
+ rw |= REQ_FUA | REQ_FLUSH;
+ rw |= REQ_SYNC | REQ_NOIDLE;
+
+ bio = bio_alloc_drbd(GFP_NOIO);
+ bio->bi_bdev = bdev->md_bdev;
+ bio->bi_iter.bi_sector = sector;
+ err = -EIO;
+ if (bio_add_page(bio, device->md_io.page, size, 0) != size)
+ goto out;
+ bio->bi_private = device;
+ bio->bi_end_io = drbd_md_endio;
+ bio->bi_rw = rw;
+
+ if (!(rw & WRITE) && device->state.disk == D_DISKLESS && device->ldev == NULL)
+ /* special case, drbd_md_read() during drbd_adm_attach(): no get_ldev */
+ ;
+ else if (!get_ldev_if_state(device, D_ATTACHING)) {
+ /* Corresponding put_ldev in drbd_md_endio() */
+ drbd_err(device, "ASSERT FAILED: get_ldev_if_state() == 1 in _drbd_md_sync_page_io()\n");
+ err = -ENODEV;
+ goto out;
+ }
+
+ bio_get(bio); /* one bio_put() is in the completion handler */
+ atomic_inc(&device->md_io.in_use); /* drbd_md_put_buffer() is in the completion handler */
+ device->md_io.submit_jif = jiffies;
+ if (drbd_insert_fault(device, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD))
+ bio_endio(bio, -EIO);
+ else
+ submit_bio(rw, bio);
+ wait_until_done_or_force_detached(device, bdev, &device->md_io.done);
+ if (bio_flagged(bio, BIO_UPTODATE))
+ err = device->md_io.error;
+
+ out:
+ bio_put(bio);
+ return err;
+}
+
+int drbd_md_sync_page_io(struct drbd_device *device, struct drbd_backing_dev *bdev,
+ sector_t sector, int rw)
+{
+ int err;
+ D_ASSERT(device, atomic_read(&device->md_io.in_use) == 1);
+
+ BUG_ON(!bdev->md_bdev);
+
+ dynamic_drbd_dbg(device, "meta_data io: %s [%d]:%s(,%llus,%s) %pS\n",
+ current->comm, current->pid, __func__,
+ (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ",
+ (void*)_RET_IP_ );
+
+ if (sector < drbd_md_first_sector(bdev) ||
+ sector + 7 > drbd_md_last_sector(bdev))
+ drbd_alert(device, "%s [%d]:%s(,%llus,%s) out of range md access!\n",
+ current->comm, current->pid, __func__,
+ (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ");
+
+ err = _drbd_md_sync_page_io(device, bdev, sector, rw);
+ if (err) {
+ drbd_err(device, "drbd_md_sync_page_io(,%llus,%s) failed with error %d\n",
+ (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ", err);
+ }
+ return err;
+}
+
+static struct bm_extent *find_active_resync_extent(struct drbd_device *device, unsigned int enr)
+{
+ struct lc_element *tmp;
+ tmp = lc_find(device->resync, enr/AL_EXT_PER_BM_SECT);
+ if (unlikely(tmp != NULL)) {
+ struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce);
+ if (test_bit(BME_NO_WRITES, &bm_ext->flags))
+ return bm_ext;
+ }
+ return NULL;
+}
+
+static struct lc_element *_al_get(struct drbd_device *device, unsigned int enr, bool nonblock)
+{
+ struct lc_element *al_ext;
+ struct bm_extent *bm_ext;
+ int wake;
+
+ spin_lock_irq(&device->al_lock);
+ bm_ext = find_active_resync_extent(device, enr);
+ if (bm_ext) {
+ wake = !test_and_set_bit(BME_PRIORITY, &bm_ext->flags);
+ spin_unlock_irq(&device->al_lock);
+ if (wake)
+ wake_up(&device->al_wait);
+ return NULL;
+ }
+ if (nonblock)
+ al_ext = lc_try_get(device->act_log, enr);
+ else
+ al_ext = lc_get(device->act_log, enr);
+ spin_unlock_irq(&device->al_lock);
+ return al_ext;
+}
+
+bool drbd_al_begin_io_fastpath(struct drbd_device *device, struct drbd_interval *i)
+{
+ /* for bios crossing activity log extent boundaries,
+ * we may need to activate two extents in one go */
+ unsigned first = i->sector >> (AL_EXTENT_SHIFT-9);
+ unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9);
+
+ D_ASSERT(device, (unsigned)(last - first) <= 1);
+ D_ASSERT(device, atomic_read(&device->local_cnt) > 0);
+
+ /* FIXME figure out a fast path for bios crossing AL extent boundaries */
+ if (first != last)
+ return false;
+
+ return _al_get(device, first, true);
+}
+
+bool drbd_al_begin_io_prepare(struct drbd_device *device, struct drbd_interval *i)
+{
+ /* for bios crossing activity log extent boundaries,
+ * we may need to activate two extents in one go */
+ unsigned first = i->sector >> (AL_EXTENT_SHIFT-9);
+ unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9);
+ unsigned enr;
+ bool need_transaction = false;
+
+ D_ASSERT(device, first <= last);
+ D_ASSERT(device, atomic_read(&device->local_cnt) > 0);
+
+ for (enr = first; enr <= last; enr++) {
+ struct lc_element *al_ext;
+ wait_event(device->al_wait,
+ (al_ext = _al_get(device, enr, false)) != NULL);
+ if (al_ext->lc_number != enr)
+ need_transaction = true;
+ }
+ return need_transaction;
+}
+
+static int al_write_transaction(struct drbd_device *device);
+
+void drbd_al_begin_io_commit(struct drbd_device *device)
+{
+ bool locked = false;
+
+ /* Serialize multiple transactions.
+ * This uses test_and_set_bit, memory barrier is implicit.
+ */
+ wait_event(device->al_wait,
+ device->act_log->pending_changes == 0 ||
+ (locked = lc_try_lock_for_transaction(device->act_log)));
+
+ if (locked) {
+ /* Double check: it may have been committed by someone else,
+ * while we have been waiting for the lock. */
+ if (device->act_log->pending_changes) {
+ bool write_al_updates;
+
+ rcu_read_lock();
+ write_al_updates = rcu_dereference(device->ldev->disk_conf)->al_updates;
+ rcu_read_unlock();
+
+ if (write_al_updates)
+ al_write_transaction(device);
+ spin_lock_irq(&device->al_lock);
+ /* FIXME
+ if (err)
+ we need an "lc_cancel" here;
+ */
+ lc_committed(device->act_log);
+ spin_unlock_irq(&device->al_lock);
+ }
+ lc_unlock(device->act_log);
+ wake_up(&device->al_wait);
+ }
+}
+
+/*
+ * @delegate: delegate activity log I/O to the worker thread
+ */
+void drbd_al_begin_io(struct drbd_device *device, struct drbd_interval *i)
+{
+ if (drbd_al_begin_io_prepare(device, i))
+ drbd_al_begin_io_commit(device);
+}
+
+int drbd_al_begin_io_nonblock(struct drbd_device *device, struct drbd_interval *i)
+{
+ struct lru_cache *al = device->act_log;
+ /* for bios crossing activity log extent boundaries,
+ * we may need to activate two extents in one go */
+ unsigned first = i->sector >> (AL_EXTENT_SHIFT-9);
+ unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9);
+ unsigned nr_al_extents;
+ unsigned available_update_slots;
+ unsigned enr;
+
+ D_ASSERT(device, first <= last);
+
+ nr_al_extents = 1 + last - first; /* worst case: all touched extends are cold. */
+ available_update_slots = min(al->nr_elements - al->used,
+ al->max_pending_changes - al->pending_changes);
+
+ /* We want all necessary updates for a given request within the same transaction
+ * We could first check how many updates are *actually* needed,
+ * and use that instead of the worst-case nr_al_extents */
+ if (available_update_slots < nr_al_extents) {
+ /* Too many activity log extents are currently "hot".
+ *
+ * If we have accumulated pending changes already,
+ * we made progress.
+ *
+ * If we cannot get even a single pending change through,
+ * stop the fast path until we made some progress,
+ * or requests to "cold" extents could be starved. */
+ if (!al->pending_changes)
+ __set_bit(__LC_STARVING, &device->act_log->flags);
+ return -ENOBUFS;
+ }
+
+ /* Is resync active in this area? */
+ for (enr = first; enr <= last; enr++) {
+ struct lc_element *tmp;
+ tmp = lc_find(device->resync, enr/AL_EXT_PER_BM_SECT);
+ if (unlikely(tmp != NULL)) {
+ struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce);
+ if (test_bit(BME_NO_WRITES, &bm_ext->flags)) {
+ if (!test_and_set_bit(BME_PRIORITY, &bm_ext->flags))
+ return -EBUSY;
+ return -EWOULDBLOCK;
+ }
+ }
+ }
+
+ /* Checkout the refcounts.
+ * Given that we checked for available elements and update slots above,
+ * this has to be successful. */
+ for (enr = first; enr <= last; enr++) {
+ struct lc_element *al_ext;
+ al_ext = lc_get_cumulative(device->act_log, enr);
+ if (!al_ext)
+ drbd_info(device, "LOGIC BUG for enr=%u\n", enr);
+ }
+ return 0;
+}
+
+void drbd_al_complete_io(struct drbd_device *device, struct drbd_interval *i)
+{
+ /* for bios crossing activity log extent boundaries,
+ * we may need to activate two extents in one go */
+ unsigned first = i->sector >> (AL_EXTENT_SHIFT-9);
+ unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9);
+ unsigned enr;
+ struct lc_element *extent;
+ unsigned long flags;
+
+ D_ASSERT(device, first <= last);
+ spin_lock_irqsave(&device->al_lock, flags);
+
+ for (enr = first; enr <= last; enr++) {
+ extent = lc_find(device->act_log, enr);
+ if (!extent) {
+ drbd_err(device, "al_complete_io() called on inactive extent %u\n", enr);
+ continue;
+ }
+ lc_put(device->act_log, extent);
+ }
+ spin_unlock_irqrestore(&device->al_lock, flags);
+ wake_up(&device->al_wait);
+}
+
+#if (PAGE_SHIFT + 3) < (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT)
+/* Currently BM_BLOCK_SHIFT, BM_EXT_SHIFT and AL_EXTENT_SHIFT
+ * are still coupled, or assume too much about their relation.
+ * Code below will not work if this is violated.
+ * Will be cleaned up with some followup patch.
+ */
+# error FIXME
+#endif
+
+static unsigned int al_extent_to_bm_page(unsigned int al_enr)
+{
+ return al_enr >>
+ /* bit to page */
+ ((PAGE_SHIFT + 3) -
+ /* al extent number to bit */
+ (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT));
+}
+
+static sector_t al_tr_number_to_on_disk_sector(struct drbd_device *device)
+{
+ const unsigned int stripes = device->ldev->md.al_stripes;
+ const unsigned int stripe_size_4kB = device->ldev->md.al_stripe_size_4k;
+
+ /* transaction number, modulo on-disk ring buffer wrap around */
+ unsigned int t = device->al_tr_number % (device->ldev->md.al_size_4k);
+
+ /* ... to aligned 4k on disk block */
+ t = ((t % stripes) * stripe_size_4kB) + t/stripes;
+
+ /* ... to 512 byte sector in activity log */
+ t *= 8;
+
+ /* ... plus offset to the on disk position */
+ return device->ldev->md.md_offset + device->ldev->md.al_offset + t;
+}
+
+int al_write_transaction(struct drbd_device *device)
+{
+ struct al_transaction_on_disk *buffer;
+ struct lc_element *e;
+ sector_t sector;
+ int i, mx;
+ unsigned extent_nr;
+ unsigned crc = 0;
+ int err = 0;
+
+ if (!get_ldev(device)) {
+ drbd_err(device, "disk is %s, cannot start al transaction\n",
+ drbd_disk_str(device->state.disk));
+ return -EIO;
+ }
+
+ /* The bitmap write may have failed, causing a state change. */
+ if (device->state.disk < D_INCONSISTENT) {
+ drbd_err(device,
+ "disk is %s, cannot write al transaction\n",
+ drbd_disk_str(device->state.disk));
+ put_ldev(device);
+ return -EIO;
+ }
+
+ /* protects md_io_buffer, al_tr_cycle, ... */
+ buffer = drbd_md_get_buffer(device, __func__);
+ if (!buffer) {
+ drbd_err(device, "disk failed while waiting for md_io buffer\n");
+ put_ldev(device);
+ return -ENODEV;
+ }
+
+ memset(buffer, 0, sizeof(*buffer));
+ buffer->magic = cpu_to_be32(DRBD_AL_MAGIC);
+ buffer->tr_number = cpu_to_be32(device->al_tr_number);
+
+ i = 0;
+
+ /* Even though no one can start to change this list
+ * once we set the LC_LOCKED -- from drbd_al_begin_io(),
+ * lc_try_lock_for_transaction() --, someone may still
+ * be in the process of changing it. */
+ spin_lock_irq(&device->al_lock);
+ list_for_each_entry(e, &device->act_log->to_be_changed, list) {
+ if (i == AL_UPDATES_PER_TRANSACTION) {
+ i++;
+ break;
+ }
+ buffer->update_slot_nr[i] = cpu_to_be16(e->lc_index);
+ buffer->update_extent_nr[i] = cpu_to_be32(e->lc_new_number);
+ if (e->lc_number != LC_FREE)
+ drbd_bm_mark_for_writeout(device,
+ al_extent_to_bm_page(e->lc_number));
+ i++;
+ }
+ spin_unlock_irq(&device->al_lock);
+ BUG_ON(i > AL_UPDATES_PER_TRANSACTION);
+
+ buffer->n_updates = cpu_to_be16(i);
+ for ( ; i < AL_UPDATES_PER_TRANSACTION; i++) {
+ buffer->update_slot_nr[i] = cpu_to_be16(-1);
+ buffer->update_extent_nr[i] = cpu_to_be32(LC_FREE);
+ }
+
+ buffer->context_size = cpu_to_be16(device->act_log->nr_elements);
+ buffer->context_start_slot_nr = cpu_to_be16(device->al_tr_cycle);
+
+ mx = min_t(int, AL_CONTEXT_PER_TRANSACTION,
+ device->act_log->nr_elements - device->al_tr_cycle);
+ for (i = 0; i < mx; i++) {
+ unsigned idx = device->al_tr_cycle + i;
+ extent_nr = lc_element_by_index(device->act_log, idx)->lc_number;
+ buffer->context[i] = cpu_to_be32(extent_nr);
+ }
+ for (; i < AL_CONTEXT_PER_TRANSACTION; i++)
+ buffer->context[i] = cpu_to_be32(LC_FREE);
+
+ device->al_tr_cycle += AL_CONTEXT_PER_TRANSACTION;
+ if (device->al_tr_cycle >= device->act_log->nr_elements)
+ device->al_tr_cycle = 0;
+
+ sector = al_tr_number_to_on_disk_sector(device);
+
+ crc = crc32c(0, buffer, 4096);
+ buffer->crc32c = cpu_to_be32(crc);
+
+ if (drbd_bm_write_hinted(device))
+ err = -EIO;
+ else {
+ bool write_al_updates;
+ rcu_read_lock();
+ write_al_updates = rcu_dereference(device->ldev->disk_conf)->al_updates;
+ rcu_read_unlock();
+ if (write_al_updates) {
+ if (drbd_md_sync_page_io(device, device->ldev, sector, WRITE)) {
+ err = -EIO;
+ drbd_chk_io_error(device, 1, DRBD_META_IO_ERROR);
+ } else {
+ device->al_tr_number++;
+ device->al_writ_cnt++;
+ }
+ }
+ }
+
+ drbd_md_put_buffer(device);
+ put_ldev(device);
+
+ return err;
+}
+
+static int _try_lc_del(struct drbd_device *device, struct lc_element *al_ext)
+{
+ int rv;
+
+ spin_lock_irq(&device->al_lock);
+ rv = (al_ext->refcnt == 0);
+ if (likely(rv))
+ lc_del(device->act_log, al_ext);
+ spin_unlock_irq(&device->al_lock);
+
+ return rv;
+}
+
+/**
+ * drbd_al_shrink() - Removes all active extents form the activity log
+ * @device: DRBD device.
+ *
+ * Removes all active extents form the activity log, waiting until
+ * the reference count of each entry dropped to 0 first, of course.
+ *
+ * You need to lock device->act_log with lc_try_lock() / lc_unlock()
+ */
+void drbd_al_shrink(struct drbd_device *device)
+{
+ struct lc_element *al_ext;
+ int i;
+
+ D_ASSERT(device, test_bit(__LC_LOCKED, &device->act_log->flags));
+
+ for (i = 0; i < device->act_log->nr_elements; i++) {
+ al_ext = lc_element_by_index(device->act_log, i);
+ if (al_ext->lc_number == LC_FREE)
+ continue;
+ wait_event(device->al_wait, _try_lc_del(device, al_ext));
+ }
+
+ wake_up(&device->al_wait);
+}
+
+int drbd_initialize_al(struct drbd_device *device, void *buffer)
+{
+ struct al_transaction_on_disk *al = buffer;
+ struct drbd_md *md = &device->ldev->md;
+ sector_t al_base = md->md_offset + md->al_offset;
+ int al_size_4k = md->al_stripes * md->al_stripe_size_4k;
+ int i;
+
+ memset(al, 0, 4096);
+ al->magic = cpu_to_be32(DRBD_AL_MAGIC);
+ al->transaction_type = cpu_to_be16(AL_TR_INITIALIZED);
+ al->crc32c = cpu_to_be32(crc32c(0, al, 4096));
+
+ for (i = 0; i < al_size_4k; i++) {
+ int err = drbd_md_sync_page_io(device, device->ldev, al_base + i * 8, WRITE);
+ if (err)
+ return err;
+ }
+ return 0;
+}
+
+static const char *drbd_change_sync_fname[] = {
+ [RECORD_RS_FAILED] = "drbd_rs_failed_io",
+ [SET_IN_SYNC] = "drbd_set_in_sync",
+ [SET_OUT_OF_SYNC] = "drbd_set_out_of_sync"
+};
+
+/* ATTENTION. The AL's extents are 4MB each, while the extents in the
+ * resync LRU-cache are 16MB each.
+ * The caller of this function has to hold an get_ldev() reference.
+ *
+ * Adjusts the caching members ->rs_left (success) or ->rs_failed (!success),
+ * potentially pulling in (and recounting the corresponding bits)
+ * this resync extent into the resync extent lru cache.
+ *
+ * Returns whether all bits have been cleared for this resync extent,
+ * precisely: (rs_left <= rs_failed)
+ *
+ * TODO will be obsoleted once we have a caching lru of the on disk bitmap
+ */
+static bool update_rs_extent(struct drbd_device *device,
+ unsigned int enr, int count,
+ enum update_sync_bits_mode mode)
+{
+ struct lc_element *e;
+
+ D_ASSERT(device, atomic_read(&device->local_cnt));
+
+ /* When setting out-of-sync bits,
+ * we don't need it cached (lc_find).
+ * But if it is present in the cache,
+ * we should update the cached bit count.
+ * Otherwise, that extent should be in the resync extent lru cache
+ * already -- or we want to pull it in if necessary -- (lc_get),
+ * then update and check rs_left and rs_failed. */
+ if (mode == SET_OUT_OF_SYNC)
+ e = lc_find(device->resync, enr);
+ else
+ e = lc_get(device->resync, enr);
+ if (e) {
+ struct bm_extent *ext = lc_entry(e, struct bm_extent, lce);
+ if (ext->lce.lc_number == enr) {
+ if (mode == SET_IN_SYNC)
+ ext->rs_left -= count;
+ else if (mode == SET_OUT_OF_SYNC)
+ ext->rs_left += count;
+ else
+ ext->rs_failed += count;
+ if (ext->rs_left < ext->rs_failed) {
+ drbd_warn(device, "BAD! enr=%u rs_left=%d "
+ "rs_failed=%d count=%d cstate=%s\n",
+ ext->lce.lc_number, ext->rs_left,
+ ext->rs_failed, count,
+ drbd_conn_str(device->state.conn));
+
+ /* We don't expect to be able to clear more bits
+ * than have been set when we originally counted
+ * the set bits to cache that value in ext->rs_left.
+ * Whatever the reason (disconnect during resync,
+ * delayed local completion of an application write),
+ * try to fix it up by recounting here. */
+ ext->rs_left = drbd_bm_e_weight(device, enr);
+ }
+ } else {
+ /* Normally this element should be in the cache,
+ * since drbd_rs_begin_io() pulled it already in.
+ *
+ * But maybe an application write finished, and we set
+ * something outside the resync lru_cache in sync.
+ */
+ int rs_left = drbd_bm_e_weight(device, enr);
+ if (ext->flags != 0) {
+ drbd_warn(device, "changing resync lce: %d[%u;%02lx]"
+ " -> %d[%u;00]\n",
+ ext->lce.lc_number, ext->rs_left,
+ ext->flags, enr, rs_left);
+ ext->flags = 0;
+ }
+ if (ext->rs_failed) {
+ drbd_warn(device, "Kicking resync_lru element enr=%u "
+ "out with rs_failed=%d\n",
+ ext->lce.lc_number, ext->rs_failed);
+ }
+ ext->rs_left = rs_left;
+ ext->rs_failed = (mode == RECORD_RS_FAILED) ? count : 0;
+ /* we don't keep a persistent log of the resync lru,
+ * we can commit any change right away. */
+ lc_committed(device->resync);
+ }
+ if (mode != SET_OUT_OF_SYNC)
+ lc_put(device->resync, &ext->lce);
+ /* no race, we are within the al_lock! */
+
+ if (ext->rs_left <= ext->rs_failed) {
+ ext->rs_failed = 0;
+ return true;
+ }
+ } else if (mode != SET_OUT_OF_SYNC) {
+ /* be quiet if lc_find() did not find it. */
+ drbd_err(device, "lc_get() failed! locked=%d/%d flags=%lu\n",
+ device->resync_locked,
+ device->resync->nr_elements,
+ device->resync->flags);
+ }
+ return false;
+}
+
+void drbd_advance_rs_marks(struct drbd_device *device, unsigned long still_to_go)
+{
+ unsigned long now = jiffies;
+ unsigned long last = device->rs_mark_time[device->rs_last_mark];
+ int next = (device->rs_last_mark + 1) % DRBD_SYNC_MARKS;
+ if (time_after_eq(now, last + DRBD_SYNC_MARK_STEP)) {
+ if (device->rs_mark_left[device->rs_last_mark] != still_to_go &&
+ device->state.conn != C_PAUSED_SYNC_T &&
+ device->state.conn != C_PAUSED_SYNC_S) {
+ device->rs_mark_time[next] = now;
+ device->rs_mark_left[next] = still_to_go;
+ device->rs_last_mark = next;
+ }
+ }
+}
+
+/* It is called lazy update, so don't do write-out too often. */
+static bool lazy_bitmap_update_due(struct drbd_device *device)
+{
+ return time_after(jiffies, device->rs_last_bcast + 2*HZ);
+}
+
+static void maybe_schedule_on_disk_bitmap_update(struct drbd_device *device, bool rs_done)
+{
+ if (rs_done)
+ set_bit(RS_DONE, &device->flags);
+ /* and also set RS_PROGRESS below */
+ else if (!lazy_bitmap_update_due(device))
+ return;
+
+ drbd_device_post_work(device, RS_PROGRESS);
+}
+
+static int update_sync_bits(struct drbd_device *device,
+ unsigned long sbnr, unsigned long ebnr,
+ enum update_sync_bits_mode mode)
+{
+ /*
+ * We keep a count of set bits per resync-extent in the ->rs_left
+ * caching member, so we need to loop and work within the resync extent
+ * alignment. Typically this loop will execute exactly once.
+ */
+ unsigned long flags;
+ unsigned long count = 0;
+ unsigned int cleared = 0;
+ while (sbnr <= ebnr) {
+ /* set temporary boundary bit number to last bit number within
+ * the resync extent of the current start bit number,
+ * but cap at provided end bit number */
+ unsigned long tbnr = min(ebnr, sbnr | BM_BLOCKS_PER_BM_EXT_MASK);
+ unsigned long c;
+
+ if (mode == RECORD_RS_FAILED)
+ /* Only called from drbd_rs_failed_io(), bits
+ * supposedly still set. Recount, maybe some
+ * of the bits have been successfully cleared
+ * by application IO meanwhile.
+ */
+ c = drbd_bm_count_bits(device, sbnr, tbnr);
+ else if (mode == SET_IN_SYNC)
+ c = drbd_bm_clear_bits(device, sbnr, tbnr);
+ else /* if (mode == SET_OUT_OF_SYNC) */
+ c = drbd_bm_set_bits(device, sbnr, tbnr);
+
+ if (c) {
+ spin_lock_irqsave(&device->al_lock, flags);
+ cleared += update_rs_extent(device, BM_BIT_TO_EXT(sbnr), c, mode);
+ spin_unlock_irqrestore(&device->al_lock, flags);
+ count += c;
+ }
+ sbnr = tbnr + 1;
+ }
+ if (count) {
+ if (mode == SET_IN_SYNC) {
+ unsigned long still_to_go = drbd_bm_total_weight(device);
+ bool rs_is_done = (still_to_go <= device->rs_failed);
+ drbd_advance_rs_marks(device, still_to_go);
+ if (cleared || rs_is_done)
+ maybe_schedule_on_disk_bitmap_update(device, rs_is_done);
+ } else if (mode == RECORD_RS_FAILED)
+ device->rs_failed += count;
+ wake_up(&device->al_wait);
+ }
+ return count;
+}
+
+/* clear the bit corresponding to the piece of storage in question:
+ * size byte of data starting from sector. Only clear a bits of the affected
+ * one ore more _aligned_ BM_BLOCK_SIZE blocks.
+ *
+ * called by worker on C_SYNC_TARGET and receiver on SyncSource.
+ *
+ */
+int __drbd_change_sync(struct drbd_device *device, sector_t sector, int size,
+ enum update_sync_bits_mode mode)
+{
+ /* Is called from worker and receiver context _only_ */
+ unsigned long sbnr, ebnr, lbnr;
+ unsigned long count = 0;
+ sector_t esector, nr_sectors;
+
+ /* This would be an empty REQ_FLUSH, be silent. */
+ if ((mode == SET_OUT_OF_SYNC) && size == 0)
+ return 0;
+
+ if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_DISCARD_SIZE) {
+ drbd_err(device, "%s: sector=%llus size=%d nonsense!\n",
+ drbd_change_sync_fname[mode],
+ (unsigned long long)sector, size);
+ return 0;
+ }
+
+ if (!get_ldev(device))
+ return 0; /* no disk, no metadata, no bitmap to manipulate bits in */
+
+ nr_sectors = drbd_get_capacity(device->this_bdev);
+ esector = sector + (size >> 9) - 1;
+
+ if (!expect(sector < nr_sectors))
+ goto out;
+ if (!expect(esector < nr_sectors))
+ esector = nr_sectors - 1;
+
+ lbnr = BM_SECT_TO_BIT(nr_sectors-1);
+
+ if (mode == SET_IN_SYNC) {
+ /* Round up start sector, round down end sector. We make sure
+ * we only clear full, aligned, BM_BLOCK_SIZE blocks. */
+ if (unlikely(esector < BM_SECT_PER_BIT-1))
+ goto out;
+ if (unlikely(esector == (nr_sectors-1)))
+ ebnr = lbnr;
+ else
+ ebnr = BM_SECT_TO_BIT(esector - (BM_SECT_PER_BIT-1));
+ sbnr = BM_SECT_TO_BIT(sector + BM_SECT_PER_BIT-1);
+ } else {
+ /* We set it out of sync, or record resync failure.
+ * Should not round anything here. */
+ sbnr = BM_SECT_TO_BIT(sector);
+ ebnr = BM_SECT_TO_BIT(esector);
+ }
+
+ count = update_sync_bits(device, sbnr, ebnr, mode);
+out:
+ put_ldev(device);
+ return count;
+}
+
+static
+struct bm_extent *_bme_get(struct drbd_device *device, unsigned int enr)
+{
+ struct lc_element *e;
+ struct bm_extent *bm_ext;
+ int wakeup = 0;
+ unsigned long rs_flags;
+
+ spin_lock_irq(&device->al_lock);
+ if (device->resync_locked > device->resync->nr_elements/2) {
+ spin_unlock_irq(&device->al_lock);
+ return NULL;
+ }
+ e = lc_get(device->resync, enr);
+ bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
+ if (bm_ext) {
+ if (bm_ext->lce.lc_number != enr) {
+ bm_ext->rs_left = drbd_bm_e_weight(device, enr);
+ bm_ext->rs_failed = 0;
+ lc_committed(device->resync);
+ wakeup = 1;
+ }
+ if (bm_ext->lce.refcnt == 1)
+ device->resync_locked++;
+ set_bit(BME_NO_WRITES, &bm_ext->flags);
+ }
+ rs_flags = device->resync->flags;
+ spin_unlock_irq(&device->al_lock);
+ if (wakeup)
+ wake_up(&device->al_wait);
+
+ if (!bm_ext) {
+ if (rs_flags & LC_STARVING)
+ drbd_warn(device, "Have to wait for element"
+ " (resync LRU too small?)\n");
+ BUG_ON(rs_flags & LC_LOCKED);
+ }
+
+ return bm_ext;
+}
+
+static int _is_in_al(struct drbd_device *device, unsigned int enr)
+{
+ int rv;
+
+ spin_lock_irq(&device->al_lock);
+ rv = lc_is_used(device->act_log, enr);
+ spin_unlock_irq(&device->al_lock);
+
+ return rv;
+}
+
+/**
+ * drbd_rs_begin_io() - Gets an extent in the resync LRU cache and sets it to BME_LOCKED
+ * @device: DRBD device.
+ * @sector: The sector number.
+ *
+ * This functions sleeps on al_wait. Returns 0 on success, -EINTR if interrupted.
+ */
+int drbd_rs_begin_io(struct drbd_device *device, sector_t sector)
+{
+ unsigned int enr = BM_SECT_TO_EXT(sector);
+ struct bm_extent *bm_ext;
+ int i, sig;
+ bool sa;
+
+retry:
+ sig = wait_event_interruptible(device->al_wait,
+ (bm_ext = _bme_get(device, enr)));
+ if (sig)
+ return -EINTR;
+
+ if (test_bit(BME_LOCKED, &bm_ext->flags))
+ return 0;
+
+ /* step aside only while we are above c-min-rate; unless disabled. */
+ sa = drbd_rs_c_min_rate_throttle(device);
+
+ for (i = 0; i < AL_EXT_PER_BM_SECT; i++) {
+ sig = wait_event_interruptible(device->al_wait,
+ !_is_in_al(device, enr * AL_EXT_PER_BM_SECT + i) ||
+ (sa && test_bit(BME_PRIORITY, &bm_ext->flags)));
+
+ if (sig || (sa && test_bit(BME_PRIORITY, &bm_ext->flags))) {
+ spin_lock_irq(&device->al_lock);
+ if (lc_put(device->resync, &bm_ext->lce) == 0) {
+ bm_ext->flags = 0; /* clears BME_NO_WRITES and eventually BME_PRIORITY */
+ device->resync_locked--;
+ wake_up(&device->al_wait);
+ }
+ spin_unlock_irq(&device->al_lock);
+ if (sig)
+ return -EINTR;
+ if (schedule_timeout_interruptible(HZ/10))
+ return -EINTR;
+ goto retry;
+ }
+ }
+ set_bit(BME_LOCKED, &bm_ext->flags);
+ return 0;
+}
+
+/**
+ * drbd_try_rs_begin_io() - Gets an extent in the resync LRU cache, does not sleep
+ * @device: DRBD device.
+ * @sector: The sector number.
+ *
+ * Gets an extent in the resync LRU cache, sets it to BME_NO_WRITES, then
+ * tries to set it to BME_LOCKED. Returns 0 upon success, and -EAGAIN
+ * if there is still application IO going on in this area.
+ */
+int drbd_try_rs_begin_io(struct drbd_device *device, sector_t sector)
+{
+ unsigned int enr = BM_SECT_TO_EXT(sector);
+ const unsigned int al_enr = enr*AL_EXT_PER_BM_SECT;
+ struct lc_element *e;
+ struct bm_extent *bm_ext;
+ int i;
+ bool throttle = drbd_rs_should_slow_down(device, sector, true);
+
+ /* If we need to throttle, a half-locked (only marked BME_NO_WRITES,
+ * not yet BME_LOCKED) extent needs to be kicked out explicitly if we
+ * need to throttle. There is at most one such half-locked extent,
+ * which is remembered in resync_wenr. */
+
+ if (throttle && device->resync_wenr != enr)
+ return -EAGAIN;
+
+ spin_lock_irq(&device->al_lock);
+ if (device->resync_wenr != LC_FREE && device->resync_wenr != enr) {
+ /* in case you have very heavy scattered io, it may
+ * stall the syncer undefined if we give up the ref count
+ * when we try again and requeue.
+ *
+ * if we don't give up the refcount, but the next time
+ * we are scheduled this extent has been "synced" by new
+ * application writes, we'd miss the lc_put on the
+ * extent we keep the refcount on.
+ * so we remembered which extent we had to try again, and
+ * if the next requested one is something else, we do
+ * the lc_put here...
+ * we also have to wake_up
+ */
+ e = lc_find(device->resync, device->resync_wenr);
+ bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
+ if (bm_ext) {
+ D_ASSERT(device, !test_bit(BME_LOCKED, &bm_ext->flags));
+ D_ASSERT(device, test_bit(BME_NO_WRITES, &bm_ext->flags));
+ clear_bit(BME_NO_WRITES, &bm_ext->flags);
+ device->resync_wenr = LC_FREE;
+ if (lc_put(device->resync, &bm_ext->lce) == 0) {
+ bm_ext->flags = 0;
+ device->resync_locked--;
+ }
+ wake_up(&device->al_wait);
+ } else {
+ drbd_alert(device, "LOGIC BUG\n");
+ }
+ }
+ /* TRY. */
+ e = lc_try_get(device->resync, enr);
+ bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
+ if (bm_ext) {
+ if (test_bit(BME_LOCKED, &bm_ext->flags))
+ goto proceed;
+ if (!test_and_set_bit(BME_NO_WRITES, &bm_ext->flags)) {
+ device->resync_locked++;
+ } else {
+ /* we did set the BME_NO_WRITES,
+ * but then could not set BME_LOCKED,
+ * so we tried again.
+ * drop the extra reference. */
+ bm_ext->lce.refcnt--;
+ D_ASSERT(device, bm_ext->lce.refcnt > 0);
+ }
+ goto check_al;
+ } else {
+ /* do we rather want to try later? */
+ if (device->resync_locked > device->resync->nr_elements-3)
+ goto try_again;
+ /* Do or do not. There is no try. -- Yoda */
+ e = lc_get(device->resync, enr);
+ bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
+ if (!bm_ext) {
+ const unsigned long rs_flags = device->resync->flags;
+ if (rs_flags & LC_STARVING)
+ drbd_warn(device, "Have to wait for element"
+ " (resync LRU too small?)\n");
+ BUG_ON(rs_flags & LC_LOCKED);
+ goto try_again;
+ }
+ if (bm_ext->lce.lc_number != enr) {
+ bm_ext->rs_left = drbd_bm_e_weight(device, enr);
+ bm_ext->rs_failed = 0;
+ lc_committed(device->resync);
+ wake_up(&device->al_wait);
+ D_ASSERT(device, test_bit(BME_LOCKED, &bm_ext->flags) == 0);
+ }
+ set_bit(BME_NO_WRITES, &bm_ext->flags);
+ D_ASSERT(device, bm_ext->lce.refcnt == 1);
+ device->resync_locked++;
+ goto check_al;
+ }
+check_al:
+ for (i = 0; i < AL_EXT_PER_BM_SECT; i++) {
+ if (lc_is_used(device->act_log, al_enr+i))
+ goto try_again;
+ }
+ set_bit(BME_LOCKED, &bm_ext->flags);
+proceed:
+ device->resync_wenr = LC_FREE;
+ spin_unlock_irq(&device->al_lock);
+ return 0;
+
+try_again:
+ if (bm_ext) {
+ if (throttle) {
+ D_ASSERT(device, !test_bit(BME_LOCKED, &bm_ext->flags));
+ D_ASSERT(device, test_bit(BME_NO_WRITES, &bm_ext->flags));
+ clear_bit(BME_NO_WRITES, &bm_ext->flags);
+ device->resync_wenr = LC_FREE;
+ if (lc_put(device->resync, &bm_ext->lce) == 0) {
+ bm_ext->flags = 0;
+ device->resync_locked--;
+ }
+ wake_up(&device->al_wait);
+ } else
+ device->resync_wenr = enr;
+ }
+ spin_unlock_irq(&device->al_lock);
+ return -EAGAIN;
+}
+
+void drbd_rs_complete_io(struct drbd_device *device, sector_t sector)
+{
+ unsigned int enr = BM_SECT_TO_EXT(sector);
+ struct lc_element *e;
+ struct bm_extent *bm_ext;
+ unsigned long flags;
+
+ spin_lock_irqsave(&device->al_lock, flags);
+ e = lc_find(device->resync, enr);
+ bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
+ if (!bm_ext) {
+ spin_unlock_irqrestore(&device->al_lock, flags);
+ if (__ratelimit(&drbd_ratelimit_state))
+ drbd_err(device, "drbd_rs_complete_io() called, but extent not found\n");
+ return;
+ }
+
+ if (bm_ext->lce.refcnt == 0) {
+ spin_unlock_irqrestore(&device->al_lock, flags);
+ drbd_err(device, "drbd_rs_complete_io(,%llu [=%u]) called, "
+ "but refcnt is 0!?\n",
+ (unsigned long long)sector, enr);
+ return;
+ }
+
+ if (lc_put(device->resync, &bm_ext->lce) == 0) {
+ bm_ext->flags = 0; /* clear BME_LOCKED, BME_NO_WRITES and BME_PRIORITY */
+ device->resync_locked--;
+ wake_up(&device->al_wait);
+ }
+
+ spin_unlock_irqrestore(&device->al_lock, flags);
+}
+
+/**
+ * drbd_rs_cancel_all() - Removes all extents from the resync LRU (even BME_LOCKED)
+ * @device: DRBD device.
+ */
+void drbd_rs_cancel_all(struct drbd_device *device)
+{
+ spin_lock_irq(&device->al_lock);
+
+ if (get_ldev_if_state(device, D_FAILED)) { /* Makes sure ->resync is there. */
+ lc_reset(device->resync);
+ put_ldev(device);
+ }
+ device->resync_locked = 0;
+ device->resync_wenr = LC_FREE;
+ spin_unlock_irq(&device->al_lock);
+ wake_up(&device->al_wait);
+}
+
+/**
+ * drbd_rs_del_all() - Gracefully remove all extents from the resync LRU
+ * @device: DRBD device.
+ *
+ * Returns 0 upon success, -EAGAIN if at least one reference count was
+ * not zero.
+ */
+int drbd_rs_del_all(struct drbd_device *device)
+{
+ struct lc_element *e;
+ struct bm_extent *bm_ext;
+ int i;
+
+ spin_lock_irq(&device->al_lock);
+
+ if (get_ldev_if_state(device, D_FAILED)) {
+ /* ok, ->resync is there. */
+ for (i = 0; i < device->resync->nr_elements; i++) {
+ e = lc_element_by_index(device->resync, i);
+ bm_ext = lc_entry(e, struct bm_extent, lce);
+ if (bm_ext->lce.lc_number == LC_FREE)
+ continue;
+ if (bm_ext->lce.lc_number == device->resync_wenr) {
+ drbd_info(device, "dropping %u in drbd_rs_del_all, apparently"
+ " got 'synced' by application io\n",
+ device->resync_wenr);
+ D_ASSERT(device, !test_bit(BME_LOCKED, &bm_ext->flags));
+ D_ASSERT(device, test_bit(BME_NO_WRITES, &bm_ext->flags));
+ clear_bit(BME_NO_WRITES, &bm_ext->flags);
+ device->resync_wenr = LC_FREE;
+ lc_put(device->resync, &bm_ext->lce);
+ }
+ if (bm_ext->lce.refcnt != 0) {
+ drbd_info(device, "Retrying drbd_rs_del_all() later. "
+ "refcnt=%d\n", bm_ext->lce.refcnt);
+ put_ldev(device);
+ spin_unlock_irq(&device->al_lock);
+ return -EAGAIN;
+ }
+ D_ASSERT(device, !test_bit(BME_LOCKED, &bm_ext->flags));
+ D_ASSERT(device, !test_bit(BME_NO_WRITES, &bm_ext->flags));
+ lc_del(device->resync, &bm_ext->lce);
+ }
+ D_ASSERT(device, device->resync->used == 0);
+ put_ldev(device);
+ }
+ spin_unlock_irq(&device->al_lock);
+ wake_up(&device->al_wait);
+
+ return 0;
+}
diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c
new file mode 100644
index 000000000..434c77dcc
--- /dev/null
+++ b/drivers/block/drbd/drbd_bitmap.c
@@ -0,0 +1,1648 @@
+/*
+ drbd_bitmap.c
+
+ This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+ Copyright (C) 2004-2008, LINBIT Information Technologies GmbH.
+ Copyright (C) 2004-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+ Copyright (C) 2004-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+ drbd is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2, or (at your option)
+ any later version.
+
+ drbd is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with drbd; see the file COPYING. If not, write to
+ the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/bitops.h>
+#include <linux/vmalloc.h>
+#include <linux/string.h>
+#include <linux/drbd.h>
+#include <linux/slab.h>
+#include <asm/kmap_types.h>
+
+#include "drbd_int.h"
+
+
+/* OPAQUE outside this file!
+ * interface defined in drbd_int.h
+
+ * convention:
+ * function name drbd_bm_... => used elsewhere, "public".
+ * function name bm_... => internal to implementation, "private".
+ */
+
+
+/*
+ * LIMITATIONS:
+ * We want to support >= peta byte of backend storage, while for now still using
+ * a granularity of one bit per 4KiB of storage.
+ * 1 << 50 bytes backend storage (1 PiB)
+ * 1 << (50 - 12) bits needed
+ * 38 --> we need u64 to index and count bits
+ * 1 << (38 - 3) bitmap bytes needed
+ * 35 --> we still need u64 to index and count bytes
+ * (that's 32 GiB of bitmap for 1 PiB storage)
+ * 1 << (35 - 2) 32bit longs needed
+ * 33 --> we'd even need u64 to index and count 32bit long words.
+ * 1 << (35 - 3) 64bit longs needed
+ * 32 --> we could get away with a 32bit unsigned int to index and count
+ * 64bit long words, but I rather stay with unsigned long for now.
+ * We probably should neither count nor point to bytes or long words
+ * directly, but either by bitnumber, or by page index and offset.
+ * 1 << (35 - 12)
+ * 22 --> we need that much 4KiB pages of bitmap.
+ * 1 << (22 + 3) --> on a 64bit arch,
+ * we need 32 MiB to store the array of page pointers.
+ *
+ * Because I'm lazy, and because the resulting patch was too large, too ugly
+ * and still incomplete, on 32bit we still "only" support 16 TiB (minus some),
+ * (1 << 32) bits * 4k storage.
+ *
+
+ * bitmap storage and IO:
+ * Bitmap is stored little endian on disk, and is kept little endian in
+ * core memory. Currently we still hold the full bitmap in core as long
+ * as we are "attached" to a local disk, which at 32 GiB for 1PiB storage
+ * seems excessive.
+ *
+ * We plan to reduce the amount of in-core bitmap pages by paging them in
+ * and out against their on-disk location as necessary, but need to make
+ * sure we don't cause too much meta data IO, and must not deadlock in
+ * tight memory situations. This needs some more work.
+ */
+
+/*
+ * NOTE
+ * Access to the *bm_pages is protected by bm_lock.
+ * It is safe to read the other members within the lock.
+ *
+ * drbd_bm_set_bits is called from bio_endio callbacks,
+ * We may be called with irq already disabled,
+ * so we need spin_lock_irqsave().
+ * And we need the kmap_atomic.
+ */
+struct drbd_bitmap {
+ struct page **bm_pages;
+ spinlock_t bm_lock;
+
+ /* see LIMITATIONS: above */
+
+ unsigned long bm_set; /* nr of set bits; THINK maybe atomic_t? */
+ unsigned long bm_bits;
+ size_t bm_words;
+ size_t bm_number_of_pages;
+ sector_t bm_dev_capacity;
+ struct mutex bm_change; /* serializes resize operations */
+
+ wait_queue_head_t bm_io_wait; /* used to serialize IO of single pages */
+
+ enum bm_flag bm_flags;
+
+ /* debugging aid, in case we are still racy somewhere */
+ char *bm_why;
+ struct task_struct *bm_task;
+};
+
+#define bm_print_lock_info(m) __bm_print_lock_info(m, __func__)
+static void __bm_print_lock_info(struct drbd_device *device, const char *func)
+{
+ struct drbd_bitmap *b = device->bitmap;
+ if (!__ratelimit(&drbd_ratelimit_state))
+ return;
+ drbd_err(device, "FIXME %s[%d] in %s, bitmap locked for '%s' by %s[%d]\n",
+ current->comm, task_pid_nr(current),
+ func, b->bm_why ?: "?",
+ b->bm_task->comm, task_pid_nr(b->bm_task));
+}
+
+void drbd_bm_lock(struct drbd_device *device, char *why, enum bm_flag flags)
+{
+ struct drbd_bitmap *b = device->bitmap;
+ int trylock_failed;
+
+ if (!b) {
+ drbd_err(device, "FIXME no bitmap in drbd_bm_lock!?\n");
+ return;
+ }
+
+ trylock_failed = !mutex_trylock(&b->bm_change);
+
+ if (trylock_failed) {
+ drbd_warn(device, "%s[%d] going to '%s' but bitmap already locked for '%s' by %s[%d]\n",
+ current->comm, task_pid_nr(current),
+ why, b->bm_why ?: "?",
+ b->bm_task->comm, task_pid_nr(b->bm_task));
+ mutex_lock(&b->bm_change);
+ }
+ if (BM_LOCKED_MASK & b->bm_flags)
+ drbd_err(device, "FIXME bitmap already locked in bm_lock\n");
+ b->bm_flags |= flags & BM_LOCKED_MASK;
+
+ b->bm_why = why;
+ b->bm_task = current;
+}
+
+void drbd_bm_unlock(struct drbd_device *device)
+{
+ struct drbd_bitmap *b = device->bitmap;
+ if (!b) {
+ drbd_err(device, "FIXME no bitmap in drbd_bm_unlock!?\n");
+ return;
+ }
+
+ if (!(BM_LOCKED_MASK & device->bitmap->bm_flags))
+ drbd_err(device, "FIXME bitmap not locked in bm_unlock\n");
+
+ b->bm_flags &= ~BM_LOCKED_MASK;
+ b->bm_why = NULL;
+ b->bm_task = NULL;
+ mutex_unlock(&b->bm_change);
+}
+
+/* we store some "meta" info about our pages in page->private */
+/* at a granularity of 4k storage per bitmap bit:
+ * one peta byte storage: 1<<50 byte, 1<<38 * 4k storage blocks
+ * 1<<38 bits,
+ * 1<<23 4k bitmap pages.
+ * Use 24 bits as page index, covers 2 peta byte storage
+ * at a granularity of 4k per bit.
+ * Used to report the failed page idx on io error from the endio handlers.
+ */
+#define BM_PAGE_IDX_MASK ((1UL<<24)-1)
+/* this page is currently read in, or written back */
+#define BM_PAGE_IO_LOCK 31
+/* if there has been an IO error for this page */
+#define BM_PAGE_IO_ERROR 30
+/* this is to be able to intelligently skip disk IO,
+ * set if bits have been set since last IO. */
+#define BM_PAGE_NEED_WRITEOUT 29
+/* to mark for lazy writeout once syncer cleared all clearable bits,
+ * we if bits have been cleared since last IO. */
+#define BM_PAGE_LAZY_WRITEOUT 28
+/* pages marked with this "HINT" will be considered for writeout
+ * on activity log transactions */
+#define BM_PAGE_HINT_WRITEOUT 27
+
+/* store_page_idx uses non-atomic assignment. It is only used directly after
+ * allocating the page. All other bm_set_page_* and bm_clear_page_* need to
+ * use atomic bit manipulation, as set_out_of_sync (and therefore bitmap
+ * changes) may happen from various contexts, and wait_on_bit/wake_up_bit
+ * requires it all to be atomic as well. */
+static void bm_store_page_idx(struct page *page, unsigned long idx)
+{
+ BUG_ON(0 != (idx & ~BM_PAGE_IDX_MASK));
+ set_page_private(page, idx);
+}
+
+static unsigned long bm_page_to_idx(struct page *page)
+{
+ return page_private(page) & BM_PAGE_IDX_MASK;
+}
+
+/* As is very unlikely that the same page is under IO from more than one
+ * context, we can get away with a bit per page and one wait queue per bitmap.
+ */
+static void bm_page_lock_io(struct drbd_device *device, int page_nr)
+{
+ struct drbd_bitmap *b = device->bitmap;
+ void *addr = &page_private(b->bm_pages[page_nr]);
+ wait_event(b->bm_io_wait, !test_and_set_bit(BM_PAGE_IO_LOCK, addr));
+}
+
+static void bm_page_unlock_io(struct drbd_device *device, int page_nr)
+{
+ struct drbd_bitmap *b = device->bitmap;
+ void *addr = &page_private(b->bm_pages[page_nr]);
+ clear_bit_unlock(BM_PAGE_IO_LOCK, addr);
+ wake_up(&device->bitmap->bm_io_wait);
+}
+
+/* set _before_ submit_io, so it may be reset due to being changed
+ * while this page is in flight... will get submitted later again */
+static void bm_set_page_unchanged(struct page *page)
+{
+ /* use cmpxchg? */
+ clear_bit(BM_PAGE_NEED_WRITEOUT, &page_private(page));
+ clear_bit(BM_PAGE_LAZY_WRITEOUT, &page_private(page));
+}
+
+static void bm_set_page_need_writeout(struct page *page)
+{
+ set_bit(BM_PAGE_NEED_WRITEOUT, &page_private(page));
+}
+
+/**
+ * drbd_bm_mark_for_writeout() - mark a page with a "hint" to be considered for writeout
+ * @device: DRBD device.
+ * @page_nr: the bitmap page to mark with the "hint" flag
+ *
+ * From within an activity log transaction, we mark a few pages with these
+ * hints, then call drbd_bm_write_hinted(), which will only write out changed
+ * pages which are flagged with this mark.
+ */
+void drbd_bm_mark_for_writeout(struct drbd_device *device, int page_nr)
+{
+ struct page *page;
+ if (page_nr >= device->bitmap->bm_number_of_pages) {
+ drbd_warn(device, "BAD: page_nr: %u, number_of_pages: %u\n",
+ page_nr, (int)device->bitmap->bm_number_of_pages);
+ return;
+ }
+ page = device->bitmap->bm_pages[page_nr];
+ set_bit(BM_PAGE_HINT_WRITEOUT, &page_private(page));
+}
+
+static int bm_test_page_unchanged(struct page *page)
+{
+ volatile const unsigned long *addr = &page_private(page);
+ return (*addr & ((1UL<<BM_PAGE_NEED_WRITEOUT)|(1UL<<BM_PAGE_LAZY_WRITEOUT))) == 0;
+}
+
+static void bm_set_page_io_err(struct page *page)
+{
+ set_bit(BM_PAGE_IO_ERROR, &page_private(page));
+}
+
+static void bm_clear_page_io_err(struct page *page)
+{
+ clear_bit(BM_PAGE_IO_ERROR, &page_private(page));
+}
+
+static void bm_set_page_lazy_writeout(struct page *page)
+{
+ set_bit(BM_PAGE_LAZY_WRITEOUT, &page_private(page));
+}
+
+static int bm_test_page_lazy_writeout(struct page *page)
+{
+ return test_bit(BM_PAGE_LAZY_WRITEOUT, &page_private(page));
+}
+
+/* on a 32bit box, this would allow for exactly (2<<38) bits. */
+static unsigned int bm_word_to_page_idx(struct drbd_bitmap *b, unsigned long long_nr)
+{
+ /* page_nr = (word*sizeof(long)) >> PAGE_SHIFT; */
+ unsigned int page_nr = long_nr >> (PAGE_SHIFT - LN2_BPL + 3);
+ BUG_ON(page_nr >= b->bm_number_of_pages);
+ return page_nr;
+}
+
+static unsigned int bm_bit_to_page_idx(struct drbd_bitmap *b, u64 bitnr)
+{
+ /* page_nr = (bitnr/8) >> PAGE_SHIFT; */
+ unsigned int page_nr = bitnr >> (PAGE_SHIFT + 3);
+ BUG_ON(page_nr >= b->bm_number_of_pages);
+ return page_nr;
+}
+
+static unsigned long *__bm_map_pidx(struct drbd_bitmap *b, unsigned int idx)
+{
+ struct page *page = b->bm_pages[idx];
+ return (unsigned long *) kmap_atomic(page);
+}
+
+static unsigned long *bm_map_pidx(struct drbd_bitmap *b, unsigned int idx)
+{
+ return __bm_map_pidx(b, idx);
+}
+
+static void __bm_unmap(unsigned long *p_addr)
+{
+ kunmap_atomic(p_addr);
+};
+
+static void bm_unmap(unsigned long *p_addr)
+{
+ return __bm_unmap(p_addr);
+}
+
+/* long word offset of _bitmap_ sector */
+#define S2W(s) ((s)<<(BM_EXT_SHIFT-BM_BLOCK_SHIFT-LN2_BPL))
+/* word offset from start of bitmap to word number _in_page_
+ * modulo longs per page
+#define MLPP(X) ((X) % (PAGE_SIZE/sizeof(long))
+ hm, well, Philipp thinks gcc might not optimize the % into & (... - 1)
+ so do it explicitly:
+ */
+#define MLPP(X) ((X) & ((PAGE_SIZE/sizeof(long))-1))
+
+/* Long words per page */
+#define LWPP (PAGE_SIZE/sizeof(long))
+
+/*
+ * actually most functions herein should take a struct drbd_bitmap*, not a
+ * struct drbd_device*, but for the debug macros I like to have the device around
+ * to be able to report device specific.
+ */
+
+
+static void bm_free_pages(struct page **pages, unsigned long number)
+{
+ unsigned long i;
+ if (!pages)
+ return;
+
+ for (i = 0; i < number; i++) {
+ if (!pages[i]) {
+ pr_alert("bm_free_pages tried to free a NULL pointer; i=%lu n=%lu\n",
+ i, number);
+ continue;
+ }
+ __free_page(pages[i]);
+ pages[i] = NULL;
+ }
+}
+
+static void bm_vk_free(void *ptr, int v)
+{
+ if (v)
+ vfree(ptr);
+ else
+ kfree(ptr);
+}
+
+/*
+ * "have" and "want" are NUMBER OF PAGES.
+ */
+static struct page **bm_realloc_pages(struct drbd_bitmap *b, unsigned long want)
+{
+ struct page **old_pages = b->bm_pages;
+ struct page **new_pages, *page;
+ unsigned int i, bytes, vmalloced = 0;
+ unsigned long have = b->bm_number_of_pages;
+
+ BUG_ON(have == 0 && old_pages != NULL);
+ BUG_ON(have != 0 && old_pages == NULL);
+
+ if (have == want)
+ return old_pages;
+
+ /* Trying kmalloc first, falling back to vmalloc.
+ * GFP_NOIO, as this is called while drbd IO is "suspended",
+ * and during resize or attach on diskless Primary,
+ * we must not block on IO to ourselves.
+ * Context is receiver thread or dmsetup. */
+ bytes = sizeof(struct page *)*want;
+ new_pages = kzalloc(bytes, GFP_NOIO | __GFP_NOWARN);
+ if (!new_pages) {
+ new_pages = __vmalloc(bytes,
+ GFP_NOIO | __GFP_HIGHMEM | __GFP_ZERO,
+ PAGE_KERNEL);
+ if (!new_pages)
+ return NULL;
+ vmalloced = 1;
+ }
+
+ if (want >= have) {
+ for (i = 0; i < have; i++)
+ new_pages[i] = old_pages[i];
+ for (; i < want; i++) {
+ page = alloc_page(GFP_NOIO | __GFP_HIGHMEM);
+ if (!page) {
+ bm_free_pages(new_pages + have, i - have);
+ bm_vk_free(new_pages, vmalloced);
+ return NULL;
+ }
+ /* we want to know which page it is
+ * from the endio handlers */
+ bm_store_page_idx(page, i);
+ new_pages[i] = page;
+ }
+ } else {
+ for (i = 0; i < want; i++)
+ new_pages[i] = old_pages[i];
+ /* NOT HERE, we are outside the spinlock!
+ bm_free_pages(old_pages + want, have - want);
+ */
+ }
+
+ if (vmalloced)
+ b->bm_flags |= BM_P_VMALLOCED;
+ else
+ b->bm_flags &= ~BM_P_VMALLOCED;
+
+ return new_pages;
+}
+
+/*
+ * called on driver init only. TODO call when a device is created.
+ * allocates the drbd_bitmap, and stores it in device->bitmap.
+ */
+int drbd_bm_init(struct drbd_device *device)
+{
+ struct drbd_bitmap *b = device->bitmap;
+ WARN_ON(b != NULL);
+ b = kzalloc(sizeof(struct drbd_bitmap), GFP_KERNEL);
+ if (!b)
+ return -ENOMEM;
+ spin_lock_init(&b->bm_lock);
+ mutex_init(&b->bm_change);
+ init_waitqueue_head(&b->bm_io_wait);
+
+ device->bitmap = b;
+
+ return 0;
+}
+
+sector_t drbd_bm_capacity(struct drbd_device *device)
+{
+ if (!expect(device->bitmap))
+ return 0;
+ return device->bitmap->bm_dev_capacity;
+}
+
+/* called on driver unload. TODO: call when a device is destroyed.
+ */
+void drbd_bm_cleanup(struct drbd_device *device)
+{
+ if (!expect(device->bitmap))
+ return;
+ bm_free_pages(device->bitmap->bm_pages, device->bitmap->bm_number_of_pages);
+ bm_vk_free(device->bitmap->bm_pages, (BM_P_VMALLOCED & device->bitmap->bm_flags));
+ kfree(device->bitmap);
+ device->bitmap = NULL;
+}
+
+/*
+ * since (b->bm_bits % BITS_PER_LONG) != 0,
+ * this masks out the remaining bits.
+ * Returns the number of bits cleared.
+ */
+#define BITS_PER_PAGE (1UL << (PAGE_SHIFT + 3))
+#define BITS_PER_PAGE_MASK (BITS_PER_PAGE - 1)
+#define BITS_PER_LONG_MASK (BITS_PER_LONG - 1)
+static int bm_clear_surplus(struct drbd_bitmap *b)
+{
+ unsigned long mask;
+ unsigned long *p_addr, *bm;
+ int tmp;
+ int cleared = 0;
+
+ /* number of bits modulo bits per page */
+ tmp = (b->bm_bits & BITS_PER_PAGE_MASK);
+ /* mask the used bits of the word containing the last bit */
+ mask = (1UL << (tmp & BITS_PER_LONG_MASK)) -1;
+ /* bitmap is always stored little endian,
+ * on disk and in core memory alike */
+ mask = cpu_to_lel(mask);
+
+ p_addr = bm_map_pidx(b, b->bm_number_of_pages - 1);
+ bm = p_addr + (tmp/BITS_PER_LONG);
+ if (mask) {
+ /* If mask != 0, we are not exactly aligned, so bm now points
+ * to the long containing the last bit.
+ * If mask == 0, bm already points to the word immediately
+ * after the last (long word aligned) bit. */
+ cleared = hweight_long(*bm & ~mask);
+ *bm &= mask;
+ bm++;
+ }
+
+ if (BITS_PER_LONG == 32 && ((bm - p_addr) & 1) == 1) {
+ /* on a 32bit arch, we may need to zero out
+ * a padding long to align with a 64bit remote */
+ cleared += hweight_long(*bm);
+ *bm = 0;
+ }
+ bm_unmap(p_addr);
+ return cleared;
+}
+
+static void bm_set_surplus(struct drbd_bitmap *b)
+{
+ unsigned long mask;
+ unsigned long *p_addr, *bm;
+ int tmp;
+
+ /* number of bits modulo bits per page */
+ tmp = (b->bm_bits & BITS_PER_PAGE_MASK);
+ /* mask the used bits of the word containing the last bit */
+ mask = (1UL << (tmp & BITS_PER_LONG_MASK)) -1;
+ /* bitmap is always stored little endian,
+ * on disk and in core memory alike */
+ mask = cpu_to_lel(mask);
+
+ p_addr = bm_map_pidx(b, b->bm_number_of_pages - 1);
+ bm = p_addr + (tmp/BITS_PER_LONG);
+ if (mask) {
+ /* If mask != 0, we are not exactly aligned, so bm now points
+ * to the long containing the last bit.
+ * If mask == 0, bm already points to the word immediately
+ * after the last (long word aligned) bit. */
+ *bm |= ~mask;
+ bm++;
+ }
+
+ if (BITS_PER_LONG == 32 && ((bm - p_addr) & 1) == 1) {
+ /* on a 32bit arch, we may need to zero out
+ * a padding long to align with a 64bit remote */
+ *bm = ~0UL;
+ }
+ bm_unmap(p_addr);
+}
+
+/* you better not modify the bitmap while this is running,
+ * or its results will be stale */
+static unsigned long bm_count_bits(struct drbd_bitmap *b)
+{
+ unsigned long *p_addr;
+ unsigned long bits = 0;
+ unsigned long mask = (1UL << (b->bm_bits & BITS_PER_LONG_MASK)) -1;
+ int idx, i, last_word;
+
+ /* all but last page */
+ for (idx = 0; idx < b->bm_number_of_pages - 1; idx++) {
+ p_addr = __bm_map_pidx(b, idx);
+ for (i = 0; i < LWPP; i++)
+ bits += hweight_long(p_addr[i]);
+ __bm_unmap(p_addr);
+ cond_resched();
+ }
+ /* last (or only) page */
+ last_word = ((b->bm_bits - 1) & BITS_PER_PAGE_MASK) >> LN2_BPL;
+ p_addr = __bm_map_pidx(b, idx);
+ for (i = 0; i < last_word; i++)
+ bits += hweight_long(p_addr[i]);
+ p_addr[last_word] &= cpu_to_lel(mask);
+ bits += hweight_long(p_addr[last_word]);
+ /* 32bit arch, may have an unused padding long */
+ if (BITS_PER_LONG == 32 && (last_word & 1) == 0)
+ p_addr[last_word+1] = 0;
+ __bm_unmap(p_addr);
+ return bits;
+}
+
+/* offset and len in long words.*/
+static void bm_memset(struct drbd_bitmap *b, size_t offset, int c, size_t len)
+{
+ unsigned long *p_addr, *bm;
+ unsigned int idx;
+ size_t do_now, end;
+
+ end = offset + len;
+
+ if (end > b->bm_words) {
+ pr_alert("bm_memset end > bm_words\n");
+ return;
+ }
+
+ while (offset < end) {
+ do_now = min_t(size_t, ALIGN(offset + 1, LWPP), end) - offset;
+ idx = bm_word_to_page_idx(b, offset);
+ p_addr = bm_map_pidx(b, idx);
+ bm = p_addr + MLPP(offset);
+ if (bm+do_now > p_addr + LWPP) {
+ pr_alert("BUG BUG BUG! p_addr:%p bm:%p do_now:%d\n",
+ p_addr, bm, (int)do_now);
+ } else
+ memset(bm, c, do_now * sizeof(long));
+ bm_unmap(p_addr);
+ bm_set_page_need_writeout(b->bm_pages[idx]);
+ offset += do_now;
+ }
+}
+
+/* For the layout, see comment above drbd_md_set_sector_offsets(). */
+static u64 drbd_md_on_disk_bits(struct drbd_backing_dev *ldev)
+{
+ u64 bitmap_sectors;
+ if (ldev->md.al_offset == 8)
+ bitmap_sectors = ldev->md.md_size_sect - ldev->md.bm_offset;
+ else
+ bitmap_sectors = ldev->md.al_offset - ldev->md.bm_offset;
+ return bitmap_sectors << (9 + 3);
+}
+
+/*
+ * make sure the bitmap has enough room for the attached storage,
+ * if necessary, resize.
+ * called whenever we may have changed the device size.
+ * returns -ENOMEM if we could not allocate enough memory, 0 on success.
+ * In case this is actually a resize, we copy the old bitmap into the new one.
+ * Otherwise, the bitmap is initialized to all bits set.
+ */
+int drbd_bm_resize(struct drbd_device *device, sector_t capacity, int set_new_bits)
+{
+ struct drbd_bitmap *b = device->bitmap;
+ unsigned long bits, words, owords, obits;
+ unsigned long want, have, onpages; /* number of pages */
+ struct page **npages, **opages = NULL;
+ int err = 0, growing;
+ int opages_vmalloced;
+
+ if (!expect(b))
+ return -ENOMEM;
+
+ drbd_bm_lock(device, "resize", BM_LOCKED_MASK);
+
+ drbd_info(device, "drbd_bm_resize called with capacity == %llu\n",
+ (unsigned long long)capacity);
+
+ if (capacity == b->bm_dev_capacity)
+ goto out;
+
+ opages_vmalloced = (BM_P_VMALLOCED & b->bm_flags);
+
+ if (capacity == 0) {
+ spin_lock_irq(&b->bm_lock);
+ opages = b->bm_pages;
+ onpages = b->bm_number_of_pages;
+ owords = b->bm_words;
+ b->bm_pages = NULL;
+ b->bm_number_of_pages =
+ b->bm_set =
+ b->bm_bits =
+ b->bm_words =
+ b->bm_dev_capacity = 0;
+ spin_unlock_irq(&b->bm_lock);
+ bm_free_pages(opages, onpages);
+ bm_vk_free(opages, opages_vmalloced);
+ goto out;
+ }
+ bits = BM_SECT_TO_BIT(ALIGN(capacity, BM_SECT_PER_BIT));
+
+ /* if we would use
+ words = ALIGN(bits,BITS_PER_LONG) >> LN2_BPL;
+ a 32bit host could present the wrong number of words
+ to a 64bit host.
+ */
+ words = ALIGN(bits, 64) >> LN2_BPL;
+
+ if (get_ldev(device)) {
+ u64 bits_on_disk = drbd_md_on_disk_bits(device->ldev);
+ put_ldev(device);
+ if (bits > bits_on_disk) {
+ drbd_info(device, "bits = %lu\n", bits);
+ drbd_info(device, "bits_on_disk = %llu\n", bits_on_disk);
+ err = -ENOSPC;
+ goto out;
+ }
+ }
+
+ want = ALIGN(words*sizeof(long), PAGE_SIZE) >> PAGE_SHIFT;
+ have = b->bm_number_of_pages;
+ if (want == have) {
+ D_ASSERT(device, b->bm_pages != NULL);
+ npages = b->bm_pages;
+ } else {
+ if (drbd_insert_fault(device, DRBD_FAULT_BM_ALLOC))
+ npages = NULL;
+ else
+ npages = bm_realloc_pages(b, want);
+ }
+
+ if (!npages) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ spin_lock_irq(&b->bm_lock);
+ opages = b->bm_pages;
+ owords = b->bm_words;
+ obits = b->bm_bits;
+
+ growing = bits > obits;
+ if (opages && growing && set_new_bits)
+ bm_set_surplus(b);
+
+ b->bm_pages = npages;
+ b->bm_number_of_pages = want;
+ b->bm_bits = bits;
+ b->bm_words = words;
+ b->bm_dev_capacity = capacity;
+
+ if (growing) {
+ if (set_new_bits) {
+ bm_memset(b, owords, 0xff, words-owords);
+ b->bm_set += bits - obits;
+ } else
+ bm_memset(b, owords, 0x00, words-owords);
+
+ }
+
+ if (want < have) {
+ /* implicit: (opages != NULL) && (opages != npages) */
+ bm_free_pages(opages + want, have - want);
+ }
+
+ (void)bm_clear_surplus(b);
+
+ spin_unlock_irq(&b->bm_lock);
+ if (opages != npages)
+ bm_vk_free(opages, opages_vmalloced);
+ if (!growing)
+ b->bm_set = bm_count_bits(b);
+ drbd_info(device, "resync bitmap: bits=%lu words=%lu pages=%lu\n", bits, words, want);
+
+ out:
+ drbd_bm_unlock(device);
+ return err;
+}
+
+/* inherently racy:
+ * if not protected by other means, return value may be out of date when
+ * leaving this function...
+ * we still need to lock it, since it is important that this returns
+ * bm_set == 0 precisely.
+ *
+ * maybe bm_set should be atomic_t ?
+ */
+unsigned long _drbd_bm_total_weight(struct drbd_device *device)
+{
+ struct drbd_bitmap *b = device->bitmap;
+ unsigned long s;
+ unsigned long flags;
+
+ if (!expect(b))
+ return 0;
+ if (!expect(b->bm_pages))
+ return 0;
+
+ spin_lock_irqsave(&b->bm_lock, flags);
+ s = b->bm_set;
+ spin_unlock_irqrestore(&b->bm_lock, flags);
+
+ return s;
+}
+
+unsigned long drbd_bm_total_weight(struct drbd_device *device)
+{
+ unsigned long s;
+ /* if I don't have a disk, I don't know about out-of-sync status */
+ if (!get_ldev_if_state(device, D_NEGOTIATING))
+ return 0;
+ s = _drbd_bm_total_weight(device);
+ put_ldev(device);
+ return s;
+}
+
+size_t drbd_bm_words(struct drbd_device *device)
+{
+ struct drbd_bitmap *b = device->bitmap;
+ if (!expect(b))
+ return 0;
+ if (!expect(b->bm_pages))
+ return 0;
+
+ return b->bm_words;
+}
+
+unsigned long drbd_bm_bits(struct drbd_device *device)
+{
+ struct drbd_bitmap *b = device->bitmap;
+ if (!expect(b))
+ return 0;
+
+ return b->bm_bits;
+}
+
+/* merge number words from buffer into the bitmap starting at offset.
+ * buffer[i] is expected to be little endian unsigned long.
+ * bitmap must be locked by drbd_bm_lock.
+ * currently only used from receive_bitmap.
+ */
+void drbd_bm_merge_lel(struct drbd_device *device, size_t offset, size_t number,
+ unsigned long *buffer)
+{
+ struct drbd_bitmap *b = device->bitmap;
+ unsigned long *p_addr, *bm;
+ unsigned long word, bits;
+ unsigned int idx;
+ size_t end, do_now;
+
+ end = offset + number;
+
+ if (!expect(b))
+ return;
+ if (!expect(b->bm_pages))
+ return;
+ if (number == 0)
+ return;
+ WARN_ON(offset >= b->bm_words);
+ WARN_ON(end > b->bm_words);
+
+ spin_lock_irq(&b->bm_lock);
+ while (offset < end) {
+ do_now = min_t(size_t, ALIGN(offset+1, LWPP), end) - offset;
+ idx = bm_word_to_page_idx(b, offset);
+ p_addr = bm_map_pidx(b, idx);
+ bm = p_addr + MLPP(offset);
+ offset += do_now;
+ while (do_now--) {
+ bits = hweight_long(*bm);
+ word = *bm | *buffer++;
+ *bm++ = word;
+ b->bm_set += hweight_long(word) - bits;
+ }
+ bm_unmap(p_addr);
+ bm_set_page_need_writeout(b->bm_pages[idx]);
+ }
+ /* with 32bit <-> 64bit cross-platform connect
+ * this is only correct for current usage,
+ * where we _know_ that we are 64 bit aligned,
+ * and know that this function is used in this way, too...
+ */
+ if (end == b->bm_words)
+ b->bm_set -= bm_clear_surplus(b);
+ spin_unlock_irq(&b->bm_lock);
+}
+
+/* copy number words from the bitmap starting at offset into the buffer.
+ * buffer[i] will be little endian unsigned long.
+ */
+void drbd_bm_get_lel(struct drbd_device *device, size_t offset, size_t number,
+ unsigned long *buffer)
+{
+ struct drbd_bitmap *b = device->bitmap;
+ unsigned long *p_addr, *bm;
+ size_t end, do_now;
+
+ end = offset + number;
+
+ if (!expect(b))
+ return;
+ if (!expect(b->bm_pages))
+ return;
+
+ spin_lock_irq(&b->bm_lock);
+ if ((offset >= b->bm_words) ||
+ (end > b->bm_words) ||
+ (number <= 0))
+ drbd_err(device, "offset=%lu number=%lu bm_words=%lu\n",
+ (unsigned long) offset,
+ (unsigned long) number,
+ (unsigned long) b->bm_words);
+ else {
+ while (offset < end) {
+ do_now = min_t(size_t, ALIGN(offset+1, LWPP), end) - offset;
+ p_addr = bm_map_pidx(b, bm_word_to_page_idx(b, offset));
+ bm = p_addr + MLPP(offset);
+ offset += do_now;
+ while (do_now--)
+ *buffer++ = *bm++;
+ bm_unmap(p_addr);
+ }
+ }
+ spin_unlock_irq(&b->bm_lock);
+}
+
+/* set all bits in the bitmap */
+void drbd_bm_set_all(struct drbd_device *device)
+{
+ struct drbd_bitmap *b = device->bitmap;
+ if (!expect(b))
+ return;
+ if (!expect(b->bm_pages))
+ return;
+
+ spin_lock_irq(&b->bm_lock);
+ bm_memset(b, 0, 0xff, b->bm_words);
+ (void)bm_clear_surplus(b);
+ b->bm_set = b->bm_bits;
+ spin_unlock_irq(&b->bm_lock);
+}
+
+/* clear all bits in the bitmap */
+void drbd_bm_clear_all(struct drbd_device *device)
+{
+ struct drbd_bitmap *b = device->bitmap;
+ if (!expect(b))
+ return;
+ if (!expect(b->bm_pages))
+ return;
+
+ spin_lock_irq(&b->bm_lock);
+ bm_memset(b, 0, 0, b->bm_words);
+ b->bm_set = 0;
+ spin_unlock_irq(&b->bm_lock);
+}
+
+static void drbd_bm_aio_ctx_destroy(struct kref *kref)
+{
+ struct drbd_bm_aio_ctx *ctx = container_of(kref, struct drbd_bm_aio_ctx, kref);
+ unsigned long flags;
+
+ spin_lock_irqsave(&ctx->device->resource->req_lock, flags);
+ list_del(&ctx->list);
+ spin_unlock_irqrestore(&ctx->device->resource->req_lock, flags);
+ put_ldev(ctx->device);
+ kfree(ctx);
+}
+
+/* bv_page may be a copy, or may be the original */
+static void drbd_bm_endio(struct bio *bio, int error)
+{
+ struct drbd_bm_aio_ctx *ctx = bio->bi_private;
+ struct drbd_device *device = ctx->device;
+ struct drbd_bitmap *b = device->bitmap;
+ unsigned int idx = bm_page_to_idx(bio->bi_io_vec[0].bv_page);
+ int uptodate = bio_flagged(bio, BIO_UPTODATE);
+
+
+ /* strange behavior of some lower level drivers...
+ * fail the request by clearing the uptodate flag,
+ * but do not return any error?!
+ * do we want to WARN() on this? */
+ if (!error && !uptodate)
+ error = -EIO;
+
+ if ((ctx->flags & BM_AIO_COPY_PAGES) == 0 &&
+ !bm_test_page_unchanged(b->bm_pages[idx]))
+ drbd_warn(device, "bitmap page idx %u changed during IO!\n", idx);
+
+ if (error) {
+ /* ctx error will hold the completed-last non-zero error code,
+ * in case error codes differ. */
+ ctx->error = error;
+ bm_set_page_io_err(b->bm_pages[idx]);
+ /* Not identical to on disk version of it.
+ * Is BM_PAGE_IO_ERROR enough? */
+ if (__ratelimit(&drbd_ratelimit_state))
+ drbd_err(device, "IO ERROR %d on bitmap page idx %u\n",
+ error, idx);
+ } else {
+ bm_clear_page_io_err(b->bm_pages[idx]);
+ dynamic_drbd_dbg(device, "bitmap page idx %u completed\n", idx);
+ }
+
+ bm_page_unlock_io(device, idx);
+
+ if (ctx->flags & BM_AIO_COPY_PAGES)
+ mempool_free(bio->bi_io_vec[0].bv_page, drbd_md_io_page_pool);
+
+ bio_put(bio);
+
+ if (atomic_dec_and_test(&ctx->in_flight)) {
+ ctx->done = 1;
+ wake_up(&device->misc_wait);
+ kref_put(&ctx->kref, &drbd_bm_aio_ctx_destroy);
+ }
+}
+
+static void bm_page_io_async(struct drbd_bm_aio_ctx *ctx, int page_nr) __must_hold(local)
+{
+ struct bio *bio = bio_alloc_drbd(GFP_NOIO);
+ struct drbd_device *device = ctx->device;
+ struct drbd_bitmap *b = device->bitmap;
+ struct page *page;
+ unsigned int len;
+ unsigned int rw = (ctx->flags & BM_AIO_READ) ? READ : WRITE;
+
+ sector_t on_disk_sector =
+ device->ldev->md.md_offset + device->ldev->md.bm_offset;
+ on_disk_sector += ((sector_t)page_nr) << (PAGE_SHIFT-9);
+
+ /* this might happen with very small
+ * flexible external meta data device,
+ * or with PAGE_SIZE > 4k */
+ len = min_t(unsigned int, PAGE_SIZE,
+ (drbd_md_last_sector(device->ldev) - on_disk_sector + 1)<<9);
+
+ /* serialize IO on this page */
+ bm_page_lock_io(device, page_nr);
+ /* before memcpy and submit,
+ * so it can be redirtied any time */
+ bm_set_page_unchanged(b->bm_pages[page_nr]);
+
+ if (ctx->flags & BM_AIO_COPY_PAGES) {
+ page = mempool_alloc(drbd_md_io_page_pool, __GFP_HIGHMEM|__GFP_WAIT);
+ copy_highpage(page, b->bm_pages[page_nr]);
+ bm_store_page_idx(page, page_nr);
+ } else
+ page = b->bm_pages[page_nr];
+ bio->bi_bdev = device->ldev->md_bdev;
+ bio->bi_iter.bi_sector = on_disk_sector;
+ /* bio_add_page of a single page to an empty bio will always succeed,
+ * according to api. Do we want to assert that? */
+ bio_add_page(bio, page, len, 0);
+ bio->bi_private = ctx;
+ bio->bi_end_io = drbd_bm_endio;
+
+ if (drbd_insert_fault(device, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD)) {
+ bio->bi_rw |= rw;
+ bio_endio(bio, -EIO);
+ } else {
+ submit_bio(rw, bio);
+ /* this should not count as user activity and cause the
+ * resync to throttle -- see drbd_rs_should_slow_down(). */
+ atomic_add(len >> 9, &device->rs_sect_ev);
+ }
+}
+
+/*
+ * bm_rw: read/write the whole bitmap from/to its on disk location.
+ */
+static int bm_rw(struct drbd_device *device, const unsigned int flags, unsigned lazy_writeout_upper_idx) __must_hold(local)
+{
+ struct drbd_bm_aio_ctx *ctx;
+ struct drbd_bitmap *b = device->bitmap;
+ int num_pages, i, count = 0;
+ unsigned long now;
+ char ppb[10];
+ int err = 0;
+
+ /*
+ * We are protected against bitmap disappearing/resizing by holding an
+ * ldev reference (caller must have called get_ldev()).
+ * For read/write, we are protected against changes to the bitmap by
+ * the bitmap lock (see drbd_bitmap_io).
+ * For lazy writeout, we don't care for ongoing changes to the bitmap,
+ * as we submit copies of pages anyways.
+ */
+
+ ctx = kmalloc(sizeof(struct drbd_bm_aio_ctx), GFP_NOIO);
+ if (!ctx)
+ return -ENOMEM;
+
+ *ctx = (struct drbd_bm_aio_ctx) {
+ .device = device,
+ .start_jif = jiffies,
+ .in_flight = ATOMIC_INIT(1),
+ .done = 0,
+ .flags = flags,
+ .error = 0,
+ .kref = { ATOMIC_INIT(2) },
+ };
+
+ if (!get_ldev_if_state(device, D_ATTACHING)) { /* put is in drbd_bm_aio_ctx_destroy() */
+ drbd_err(device, "ASSERT FAILED: get_ldev_if_state() == 1 in bm_rw()\n");
+ kfree(ctx);
+ return -ENODEV;
+ }
+ /* Here D_ATTACHING is sufficient since drbd_bm_read() is called only from
+ drbd_adm_attach(), after device->ldev was assigned. */
+
+ if (0 == (ctx->flags & ~BM_AIO_READ))
+ WARN_ON(!(BM_LOCKED_MASK & b->bm_flags));
+
+ spin_lock_irq(&device->resource->req_lock);
+ list_add_tail(&ctx->list, &device->pending_bitmap_io);
+ spin_unlock_irq(&device->resource->req_lock);
+
+ num_pages = b->bm_number_of_pages;
+
+ now = jiffies;
+
+ /* let the layers below us try to merge these bios... */
+ for (i = 0; i < num_pages; i++) {
+ /* ignore completely unchanged pages */
+ if (lazy_writeout_upper_idx && i == lazy_writeout_upper_idx)
+ break;
+ if (!(flags & BM_AIO_READ)) {
+ if ((flags & BM_AIO_WRITE_HINTED) &&
+ !test_and_clear_bit(BM_PAGE_HINT_WRITEOUT,
+ &page_private(b->bm_pages[i])))
+ continue;
+
+ if (!(flags & BM_AIO_WRITE_ALL_PAGES) &&
+ bm_test_page_unchanged(b->bm_pages[i])) {
+ dynamic_drbd_dbg(device, "skipped bm write for idx %u\n", i);
+ continue;
+ }
+ /* during lazy writeout,
+ * ignore those pages not marked for lazy writeout. */
+ if (lazy_writeout_upper_idx &&
+ !bm_test_page_lazy_writeout(b->bm_pages[i])) {
+ dynamic_drbd_dbg(device, "skipped bm lazy write for idx %u\n", i);
+ continue;
+ }
+ }
+ atomic_inc(&ctx->in_flight);
+ bm_page_io_async(ctx, i);
+ ++count;
+ cond_resched();
+ }
+
+ /*
+ * We initialize ctx->in_flight to one to make sure drbd_bm_endio
+ * will not set ctx->done early, and decrement / test it here. If there
+ * are still some bios in flight, we need to wait for them here.
+ * If all IO is done already (or nothing had been submitted), there is
+ * no need to wait. Still, we need to put the kref associated with the
+ * "in_flight reached zero, all done" event.
+ */
+ if (!atomic_dec_and_test(&ctx->in_flight))
+ wait_until_done_or_force_detached(device, device->ldev, &ctx->done);
+ else
+ kref_put(&ctx->kref, &drbd_bm_aio_ctx_destroy);
+
+ /* summary for global bitmap IO */
+ if (flags == 0)
+ drbd_info(device, "bitmap %s of %u pages took %lu jiffies\n",
+ (flags & BM_AIO_READ) ? "READ" : "WRITE",
+ count, jiffies - now);
+
+ if (ctx->error) {
+ drbd_alert(device, "we had at least one MD IO ERROR during bitmap IO\n");
+ drbd_chk_io_error(device, 1, DRBD_META_IO_ERROR);
+ err = -EIO; /* ctx->error ? */
+ }
+
+ if (atomic_read(&ctx->in_flight))
+ err = -EIO; /* Disk timeout/force-detach during IO... */
+
+ now = jiffies;
+ if (flags & BM_AIO_READ) {
+ b->bm_set = bm_count_bits(b);
+ drbd_info(device, "recounting of set bits took additional %lu jiffies\n",
+ jiffies - now);
+ }
+ now = b->bm_set;
+
+ if ((flags & ~BM_AIO_READ) == 0)
+ drbd_info(device, "%s (%lu bits) marked out-of-sync by on disk bit-map.\n",
+ ppsize(ppb, now << (BM_BLOCK_SHIFT-10)), now);
+
+ kref_put(&ctx->kref, &drbd_bm_aio_ctx_destroy);
+ return err;
+}
+
+/**
+ * drbd_bm_read() - Read the whole bitmap from its on disk location.
+ * @device: DRBD device.
+ */
+int drbd_bm_read(struct drbd_device *device) __must_hold(local)
+{
+ return bm_rw(device, BM_AIO_READ, 0);
+}
+
+/**
+ * drbd_bm_write() - Write the whole bitmap to its on disk location.
+ * @device: DRBD device.
+ *
+ * Will only write pages that have changed since last IO.
+ */
+int drbd_bm_write(struct drbd_device *device) __must_hold(local)
+{
+ return bm_rw(device, 0, 0);
+}
+
+/**
+ * drbd_bm_write_all() - Write the whole bitmap to its on disk location.
+ * @device: DRBD device.
+ *
+ * Will write all pages.
+ */
+int drbd_bm_write_all(struct drbd_device *device) __must_hold(local)
+{
+ return bm_rw(device, BM_AIO_WRITE_ALL_PAGES, 0);
+}
+
+/**
+ * drbd_bm_write_lazy() - Write bitmap pages 0 to @upper_idx-1, if they have changed.
+ * @device: DRBD device.
+ * @upper_idx: 0: write all changed pages; +ve: page index to stop scanning for changed pages
+ */
+int drbd_bm_write_lazy(struct drbd_device *device, unsigned upper_idx) __must_hold(local)
+{
+ return bm_rw(device, BM_AIO_COPY_PAGES, upper_idx);
+}
+
+/**
+ * drbd_bm_write_copy_pages() - Write the whole bitmap to its on disk location.
+ * @device: DRBD device.
+ *
+ * Will only write pages that have changed since last IO.
+ * In contrast to drbd_bm_write(), this will copy the bitmap pages
+ * to temporary writeout pages. It is intended to trigger a full write-out
+ * while still allowing the bitmap to change, for example if a resync or online
+ * verify is aborted due to a failed peer disk, while local IO continues, or
+ * pending resync acks are still being processed.
+ */
+int drbd_bm_write_copy_pages(struct drbd_device *device) __must_hold(local)
+{
+ return bm_rw(device, BM_AIO_COPY_PAGES, 0);
+}
+
+/**
+ * drbd_bm_write_hinted() - Write bitmap pages with "hint" marks, if they have changed.
+ * @device: DRBD device.
+ */
+int drbd_bm_write_hinted(struct drbd_device *device) __must_hold(local)
+{
+ return bm_rw(device, BM_AIO_WRITE_HINTED | BM_AIO_COPY_PAGES, 0);
+}
+
+/* NOTE
+ * find_first_bit returns int, we return unsigned long.
+ * For this to work on 32bit arch with bitnumbers > (1<<32),
+ * we'd need to return u64, and get a whole lot of other places
+ * fixed where we still use unsigned long.
+ *
+ * this returns a bit number, NOT a sector!
+ */
+static unsigned long __bm_find_next(struct drbd_device *device, unsigned long bm_fo,
+ const int find_zero_bit)
+{
+ struct drbd_bitmap *b = device->bitmap;
+ unsigned long *p_addr;
+ unsigned long bit_offset;
+ unsigned i;
+
+
+ if (bm_fo > b->bm_bits) {
+ drbd_err(device, "bm_fo=%lu bm_bits=%lu\n", bm_fo, b->bm_bits);
+ bm_fo = DRBD_END_OF_BITMAP;
+ } else {
+ while (bm_fo < b->bm_bits) {
+ /* bit offset of the first bit in the page */
+ bit_offset = bm_fo & ~BITS_PER_PAGE_MASK;
+ p_addr = __bm_map_pidx(b, bm_bit_to_page_idx(b, bm_fo));
+
+ if (find_zero_bit)
+ i = find_next_zero_bit_le(p_addr,
+ PAGE_SIZE*8, bm_fo & BITS_PER_PAGE_MASK);
+ else
+ i = find_next_bit_le(p_addr,
+ PAGE_SIZE*8, bm_fo & BITS_PER_PAGE_MASK);
+
+ __bm_unmap(p_addr);
+ if (i < PAGE_SIZE*8) {
+ bm_fo = bit_offset + i;
+ if (bm_fo >= b->bm_bits)
+ break;
+ goto found;
+ }
+ bm_fo = bit_offset + PAGE_SIZE*8;
+ }
+ bm_fo = DRBD_END_OF_BITMAP;
+ }
+ found:
+ return bm_fo;
+}
+
+static unsigned long bm_find_next(struct drbd_device *device,
+ unsigned long bm_fo, const int find_zero_bit)
+{
+ struct drbd_bitmap *b = device->bitmap;
+ unsigned long i = DRBD_END_OF_BITMAP;
+
+ if (!expect(b))
+ return i;
+ if (!expect(b->bm_pages))
+ return i;
+
+ spin_lock_irq(&b->bm_lock);
+ if (BM_DONT_TEST & b->bm_flags)
+ bm_print_lock_info(device);
+
+ i = __bm_find_next(device, bm_fo, find_zero_bit);
+
+ spin_unlock_irq(&b->bm_lock);
+ return i;
+}
+
+unsigned long drbd_bm_find_next(struct drbd_device *device, unsigned long bm_fo)
+{
+ return bm_find_next(device, bm_fo, 0);
+}
+
+#if 0
+/* not yet needed for anything. */
+unsigned long drbd_bm_find_next_zero(struct drbd_device *device, unsigned long bm_fo)
+{
+ return bm_find_next(device, bm_fo, 1);
+}
+#endif
+
+/* does not spin_lock_irqsave.
+ * you must take drbd_bm_lock() first */
+unsigned long _drbd_bm_find_next(struct drbd_device *device, unsigned long bm_fo)
+{
+ /* WARN_ON(!(BM_DONT_SET & device->b->bm_flags)); */
+ return __bm_find_next(device, bm_fo, 0);
+}
+
+unsigned long _drbd_bm_find_next_zero(struct drbd_device *device, unsigned long bm_fo)
+{
+ /* WARN_ON(!(BM_DONT_SET & device->b->bm_flags)); */
+ return __bm_find_next(device, bm_fo, 1);
+}
+
+/* returns number of bits actually changed.
+ * for val != 0, we change 0 -> 1, return code positive
+ * for val == 0, we change 1 -> 0, return code negative
+ * wants bitnr, not sector.
+ * expected to be called for only a few bits (e - s about BITS_PER_LONG).
+ * Must hold bitmap lock already. */
+static int __bm_change_bits_to(struct drbd_device *device, const unsigned long s,
+ unsigned long e, int val)
+{
+ struct drbd_bitmap *b = device->bitmap;
+ unsigned long *p_addr = NULL;
+ unsigned long bitnr;
+ unsigned int last_page_nr = -1U;
+ int c = 0;
+ int changed_total = 0;
+
+ if (e >= b->bm_bits) {
+ drbd_err(device, "ASSERT FAILED: bit_s=%lu bit_e=%lu bm_bits=%lu\n",
+ s, e, b->bm_bits);
+ e = b->bm_bits ? b->bm_bits -1 : 0;
+ }
+ for (bitnr = s; bitnr <= e; bitnr++) {
+ unsigned int page_nr = bm_bit_to_page_idx(b, bitnr);
+ if (page_nr != last_page_nr) {
+ if (p_addr)
+ __bm_unmap(p_addr);
+ if (c < 0)
+ bm_set_page_lazy_writeout(b->bm_pages[last_page_nr]);
+ else if (c > 0)
+ bm_set_page_need_writeout(b->bm_pages[last_page_nr]);
+ changed_total += c;
+ c = 0;
+ p_addr = __bm_map_pidx(b, page_nr);
+ last_page_nr = page_nr;
+ }
+ if (val)
+ c += (0 == __test_and_set_bit_le(bitnr & BITS_PER_PAGE_MASK, p_addr));
+ else
+ c -= (0 != __test_and_clear_bit_le(bitnr & BITS_PER_PAGE_MASK, p_addr));
+ }
+ if (p_addr)
+ __bm_unmap(p_addr);
+ if (c < 0)
+ bm_set_page_lazy_writeout(b->bm_pages[last_page_nr]);
+ else if (c > 0)
+ bm_set_page_need_writeout(b->bm_pages[last_page_nr]);
+ changed_total += c;
+ b->bm_set += changed_total;
+ return changed_total;
+}
+
+/* returns number of bits actually changed.
+ * for val != 0, we change 0 -> 1, return code positive
+ * for val == 0, we change 1 -> 0, return code negative
+ * wants bitnr, not sector */
+static int bm_change_bits_to(struct drbd_device *device, const unsigned long s,
+ const unsigned long e, int val)
+{
+ unsigned long flags;
+ struct drbd_bitmap *b = device->bitmap;
+ int c = 0;
+
+ if (!expect(b))
+ return 1;
+ if (!expect(b->bm_pages))
+ return 0;
+
+ spin_lock_irqsave(&b->bm_lock, flags);
+ if ((val ? BM_DONT_SET : BM_DONT_CLEAR) & b->bm_flags)
+ bm_print_lock_info(device);
+
+ c = __bm_change_bits_to(device, s, e, val);
+
+ spin_unlock_irqrestore(&b->bm_lock, flags);
+ return c;
+}
+
+/* returns number of bits changed 0 -> 1 */
+int drbd_bm_set_bits(struct drbd_device *device, const unsigned long s, const unsigned long e)
+{
+ return bm_change_bits_to(device, s, e, 1);
+}
+
+/* returns number of bits changed 1 -> 0 */
+int drbd_bm_clear_bits(struct drbd_device *device, const unsigned long s, const unsigned long e)
+{
+ return -bm_change_bits_to(device, s, e, 0);
+}
+
+/* sets all bits in full words,
+ * from first_word up to, but not including, last_word */
+static inline void bm_set_full_words_within_one_page(struct drbd_bitmap *b,
+ int page_nr, int first_word, int last_word)
+{
+ int i;
+ int bits;
+ int changed = 0;
+ unsigned long *paddr = kmap_atomic(b->bm_pages[page_nr]);
+ for (i = first_word; i < last_word; i++) {
+ bits = hweight_long(paddr[i]);
+ paddr[i] = ~0UL;
+ changed += BITS_PER_LONG - bits;
+ }
+ kunmap_atomic(paddr);
+ if (changed) {
+ /* We only need lazy writeout, the information is still in the
+ * remote bitmap as well, and is reconstructed during the next
+ * bitmap exchange, if lost locally due to a crash. */
+ bm_set_page_lazy_writeout(b->bm_pages[page_nr]);
+ b->bm_set += changed;
+ }
+}
+
+/* Same thing as drbd_bm_set_bits,
+ * but more efficient for a large bit range.
+ * You must first drbd_bm_lock().
+ * Can be called to set the whole bitmap in one go.
+ * Sets bits from s to e _inclusive_. */
+void _drbd_bm_set_bits(struct drbd_device *device, const unsigned long s, const unsigned long e)
+{
+ /* First set_bit from the first bit (s)
+ * up to the next long boundary (sl),
+ * then assign full words up to the last long boundary (el),
+ * then set_bit up to and including the last bit (e).
+ *
+ * Do not use memset, because we must account for changes,
+ * so we need to loop over the words with hweight() anyways.
+ */
+ struct drbd_bitmap *b = device->bitmap;
+ unsigned long sl = ALIGN(s,BITS_PER_LONG);
+ unsigned long el = (e+1) & ~((unsigned long)BITS_PER_LONG-1);
+ int first_page;
+ int last_page;
+ int page_nr;
+ int first_word;
+ int last_word;
+
+ if (e - s <= 3*BITS_PER_LONG) {
+ /* don't bother; el and sl may even be wrong. */
+ spin_lock_irq(&b->bm_lock);
+ __bm_change_bits_to(device, s, e, 1);
+ spin_unlock_irq(&b->bm_lock);
+ return;
+ }
+
+ /* difference is large enough that we can trust sl and el */
+
+ spin_lock_irq(&b->bm_lock);
+
+ /* bits filling the current long */
+ if (sl)
+ __bm_change_bits_to(device, s, sl-1, 1);
+
+ first_page = sl >> (3 + PAGE_SHIFT);
+ last_page = el >> (3 + PAGE_SHIFT);
+
+ /* MLPP: modulo longs per page */
+ /* LWPP: long words per page */
+ first_word = MLPP(sl >> LN2_BPL);
+ last_word = LWPP;
+
+ /* first and full pages, unless first page == last page */
+ for (page_nr = first_page; page_nr < last_page; page_nr++) {
+ bm_set_full_words_within_one_page(device->bitmap, page_nr, first_word, last_word);
+ spin_unlock_irq(&b->bm_lock);
+ cond_resched();
+ first_word = 0;
+ spin_lock_irq(&b->bm_lock);
+ }
+ /* last page (respectively only page, for first page == last page) */
+ last_word = MLPP(el >> LN2_BPL);
+
+ /* consider bitmap->bm_bits = 32768, bitmap->bm_number_of_pages = 1. (or multiples).
+ * ==> e = 32767, el = 32768, last_page = 2,
+ * and now last_word = 0.
+ * We do not want to touch last_page in this case,
+ * as we did not allocate it, it is not present in bitmap->bm_pages.
+ */
+ if (last_word)
+ bm_set_full_words_within_one_page(device->bitmap, last_page, first_word, last_word);
+
+ /* possibly trailing bits.
+ * example: (e & 63) == 63, el will be e+1.
+ * if that even was the very last bit,
+ * it would trigger an assert in __bm_change_bits_to()
+ */
+ if (el <= e)
+ __bm_change_bits_to(device, el, e, 1);
+ spin_unlock_irq(&b->bm_lock);
+}
+
+/* returns bit state
+ * wants bitnr, NOT sector.
+ * inherently racy... area needs to be locked by means of {al,rs}_lru
+ * 1 ... bit set
+ * 0 ... bit not set
+ * -1 ... first out of bounds access, stop testing for bits!
+ */
+int drbd_bm_test_bit(struct drbd_device *device, const unsigned long bitnr)
+{
+ unsigned long flags;
+ struct drbd_bitmap *b = device->bitmap;
+ unsigned long *p_addr;
+ int i;
+
+ if (!expect(b))
+ return 0;
+ if (!expect(b->bm_pages))
+ return 0;
+
+ spin_lock_irqsave(&b->bm_lock, flags);
+ if (BM_DONT_TEST & b->bm_flags)
+ bm_print_lock_info(device);
+ if (bitnr < b->bm_bits) {
+ p_addr = bm_map_pidx(b, bm_bit_to_page_idx(b, bitnr));
+ i = test_bit_le(bitnr & BITS_PER_PAGE_MASK, p_addr) ? 1 : 0;
+ bm_unmap(p_addr);
+ } else if (bitnr == b->bm_bits) {
+ i = -1;
+ } else { /* (bitnr > b->bm_bits) */
+ drbd_err(device, "bitnr=%lu > bm_bits=%lu\n", bitnr, b->bm_bits);
+ i = 0;
+ }
+
+ spin_unlock_irqrestore(&b->bm_lock, flags);
+ return i;
+}
+
+/* returns number of bits set in the range [s, e] */
+int drbd_bm_count_bits(struct drbd_device *device, const unsigned long s, const unsigned long e)
+{
+ unsigned long flags;
+ struct drbd_bitmap *b = device->bitmap;
+ unsigned long *p_addr = NULL;
+ unsigned long bitnr;
+ unsigned int page_nr = -1U;
+ int c = 0;
+
+ /* If this is called without a bitmap, that is a bug. But just to be
+ * robust in case we screwed up elsewhere, in that case pretend there
+ * was one dirty bit in the requested area, so we won't try to do a
+ * local read there (no bitmap probably implies no disk) */
+ if (!expect(b))
+ return 1;
+ if (!expect(b->bm_pages))
+ return 1;
+
+ spin_lock_irqsave(&b->bm_lock, flags);
+ if (BM_DONT_TEST & b->bm_flags)
+ bm_print_lock_info(device);
+ for (bitnr = s; bitnr <= e; bitnr++) {
+ unsigned int idx = bm_bit_to_page_idx(b, bitnr);
+ if (page_nr != idx) {
+ page_nr = idx;
+ if (p_addr)
+ bm_unmap(p_addr);
+ p_addr = bm_map_pidx(b, idx);
+ }
+ if (expect(bitnr < b->bm_bits))
+ c += (0 != test_bit_le(bitnr - (page_nr << (PAGE_SHIFT+3)), p_addr));
+ else
+ drbd_err(device, "bitnr=%lu bm_bits=%lu\n", bitnr, b->bm_bits);
+ }
+ if (p_addr)
+ bm_unmap(p_addr);
+ spin_unlock_irqrestore(&b->bm_lock, flags);
+ return c;
+}
+
+
+/* inherently racy...
+ * return value may be already out-of-date when this function returns.
+ * but the general usage is that this is only use during a cstate when bits are
+ * only cleared, not set, and typically only care for the case when the return
+ * value is zero, or we already "locked" this "bitmap extent" by other means.
+ *
+ * enr is bm-extent number, since we chose to name one sector (512 bytes)
+ * worth of the bitmap a "bitmap extent".
+ *
+ * TODO
+ * I think since we use it like a reference count, we should use the real
+ * reference count of some bitmap extent element from some lru instead...
+ *
+ */
+int drbd_bm_e_weight(struct drbd_device *device, unsigned long enr)
+{
+ struct drbd_bitmap *b = device->bitmap;
+ int count, s, e;
+ unsigned long flags;
+ unsigned long *p_addr, *bm;
+
+ if (!expect(b))
+ return 0;
+ if (!expect(b->bm_pages))
+ return 0;
+
+ spin_lock_irqsave(&b->bm_lock, flags);
+ if (BM_DONT_TEST & b->bm_flags)
+ bm_print_lock_info(device);
+
+ s = S2W(enr);
+ e = min((size_t)S2W(enr+1), b->bm_words);
+ count = 0;
+ if (s < b->bm_words) {
+ int n = e-s;
+ p_addr = bm_map_pidx(b, bm_word_to_page_idx(b, s));
+ bm = p_addr + MLPP(s);
+ while (n--)
+ count += hweight_long(*bm++);
+ bm_unmap(p_addr);
+ } else {
+ drbd_err(device, "start offset (%d) too large in drbd_bm_e_weight\n", s);
+ }
+ spin_unlock_irqrestore(&b->bm_lock, flags);
+ return count;
+}
diff --git a/drivers/block/drbd/drbd_debugfs.c b/drivers/block/drbd/drbd_debugfs.c
new file mode 100644
index 000000000..a6ee3d750
--- /dev/null
+++ b/drivers/block/drbd/drbd_debugfs.c
@@ -0,0 +1,958 @@
+#define pr_fmt(fmt) "drbd debugfs: " fmt
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+#include <linux/stat.h>
+#include <linux/jiffies.h>
+#include <linux/list.h>
+
+#include "drbd_int.h"
+#include "drbd_req.h"
+#include "drbd_debugfs.h"
+
+
+/**********************************************************************
+ * Whenever you change the file format, remember to bump the version. *
+ **********************************************************************/
+
+static struct dentry *drbd_debugfs_root;
+static struct dentry *drbd_debugfs_version;
+static struct dentry *drbd_debugfs_resources;
+static struct dentry *drbd_debugfs_minors;
+
+static void seq_print_age_or_dash(struct seq_file *m, bool valid, unsigned long dt)
+{
+ if (valid)
+ seq_printf(m, "\t%d", jiffies_to_msecs(dt));
+ else
+ seq_printf(m, "\t-");
+}
+
+static void __seq_print_rq_state_bit(struct seq_file *m,
+ bool is_set, char *sep, const char *set_name, const char *unset_name)
+{
+ if (is_set && set_name) {
+ seq_putc(m, *sep);
+ seq_puts(m, set_name);
+ *sep = '|';
+ } else if (!is_set && unset_name) {
+ seq_putc(m, *sep);
+ seq_puts(m, unset_name);
+ *sep = '|';
+ }
+}
+
+static void seq_print_rq_state_bit(struct seq_file *m,
+ bool is_set, char *sep, const char *set_name)
+{
+ __seq_print_rq_state_bit(m, is_set, sep, set_name, NULL);
+}
+
+/* pretty print enum drbd_req_state_bits req->rq_state */
+static void seq_print_request_state(struct seq_file *m, struct drbd_request *req)
+{
+ unsigned int s = req->rq_state;
+ char sep = ' ';
+ seq_printf(m, "\t0x%08x", s);
+ seq_printf(m, "\tmaster: %s", req->master_bio ? "pending" : "completed");
+
+ /* RQ_WRITE ignored, already reported */
+ seq_puts(m, "\tlocal:");
+ seq_print_rq_state_bit(m, s & RQ_IN_ACT_LOG, &sep, "in-AL");
+ seq_print_rq_state_bit(m, s & RQ_POSTPONED, &sep, "postponed");
+ seq_print_rq_state_bit(m, s & RQ_COMPLETION_SUSP, &sep, "suspended");
+ sep = ' ';
+ seq_print_rq_state_bit(m, s & RQ_LOCAL_PENDING, &sep, "pending");
+ seq_print_rq_state_bit(m, s & RQ_LOCAL_COMPLETED, &sep, "completed");
+ seq_print_rq_state_bit(m, s & RQ_LOCAL_ABORTED, &sep, "aborted");
+ seq_print_rq_state_bit(m, s & RQ_LOCAL_OK, &sep, "ok");
+ if (sep == ' ')
+ seq_puts(m, " -");
+
+ /* for_each_connection ... */
+ seq_printf(m, "\tnet:");
+ sep = ' ';
+ seq_print_rq_state_bit(m, s & RQ_NET_PENDING, &sep, "pending");
+ seq_print_rq_state_bit(m, s & RQ_NET_QUEUED, &sep, "queued");
+ seq_print_rq_state_bit(m, s & RQ_NET_SENT, &sep, "sent");
+ seq_print_rq_state_bit(m, s & RQ_NET_DONE, &sep, "done");
+ seq_print_rq_state_bit(m, s & RQ_NET_SIS, &sep, "sis");
+ seq_print_rq_state_bit(m, s & RQ_NET_OK, &sep, "ok");
+ if (sep == ' ')
+ seq_puts(m, " -");
+
+ seq_printf(m, " :");
+ sep = ' ';
+ seq_print_rq_state_bit(m, s & RQ_EXP_RECEIVE_ACK, &sep, "B");
+ seq_print_rq_state_bit(m, s & RQ_EXP_WRITE_ACK, &sep, "C");
+ seq_print_rq_state_bit(m, s & RQ_EXP_BARR_ACK, &sep, "barr");
+ if (sep == ' ')
+ seq_puts(m, " -");
+ seq_printf(m, "\n");
+}
+
+static void seq_print_one_request(struct seq_file *m, struct drbd_request *req, unsigned long now)
+{
+ /* change anything here, fixup header below! */
+ unsigned int s = req->rq_state;
+
+#define RQ_HDR_1 "epoch\tsector\tsize\trw"
+ seq_printf(m, "0x%x\t%llu\t%u\t%s",
+ req->epoch,
+ (unsigned long long)req->i.sector, req->i.size >> 9,
+ (s & RQ_WRITE) ? "W" : "R");
+
+#define RQ_HDR_2 "\tstart\tin AL\tsubmit"
+ seq_printf(m, "\t%d", jiffies_to_msecs(now - req->start_jif));
+ seq_print_age_or_dash(m, s & RQ_IN_ACT_LOG, now - req->in_actlog_jif);
+ seq_print_age_or_dash(m, s & RQ_LOCAL_PENDING, now - req->pre_submit_jif);
+
+#define RQ_HDR_3 "\tsent\tacked\tdone"
+ seq_print_age_or_dash(m, s & RQ_NET_SENT, now - req->pre_send_jif);
+ seq_print_age_or_dash(m, (s & RQ_NET_SENT) && !(s & RQ_NET_PENDING), now - req->acked_jif);
+ seq_print_age_or_dash(m, s & RQ_NET_DONE, now - req->net_done_jif);
+
+#define RQ_HDR_4 "\tstate\n"
+ seq_print_request_state(m, req);
+}
+#define RQ_HDR RQ_HDR_1 RQ_HDR_2 RQ_HDR_3 RQ_HDR_4
+
+static void seq_print_minor_vnr_req(struct seq_file *m, struct drbd_request *req, unsigned long now)
+{
+ seq_printf(m, "%u\t%u\t", req->device->minor, req->device->vnr);
+ seq_print_one_request(m, req, now);
+}
+
+static void seq_print_resource_pending_meta_io(struct seq_file *m, struct drbd_resource *resource, unsigned long now)
+{
+ struct drbd_device *device;
+ unsigned int i;
+
+ seq_puts(m, "minor\tvnr\tstart\tsubmit\tintent\n");
+ rcu_read_lock();
+ idr_for_each_entry(&resource->devices, device, i) {
+ struct drbd_md_io tmp;
+ /* In theory this is racy,
+ * in the sense that there could have been a
+ * drbd_md_put_buffer(); drbd_md_get_buffer();
+ * between accessing these members here. */
+ tmp = device->md_io;
+ if (atomic_read(&tmp.in_use)) {
+ seq_printf(m, "%u\t%u\t%d\t",
+ device->minor, device->vnr,
+ jiffies_to_msecs(now - tmp.start_jif));
+ if (time_before(tmp.submit_jif, tmp.start_jif))
+ seq_puts(m, "-\t");
+ else
+ seq_printf(m, "%d\t", jiffies_to_msecs(now - tmp.submit_jif));
+ seq_printf(m, "%s\n", tmp.current_use);
+ }
+ }
+ rcu_read_unlock();
+}
+
+static void seq_print_waiting_for_AL(struct seq_file *m, struct drbd_resource *resource, unsigned long now)
+{
+ struct drbd_device *device;
+ unsigned int i;
+
+ seq_puts(m, "minor\tvnr\tage\t#waiting\n");
+ rcu_read_lock();
+ idr_for_each_entry(&resource->devices, device, i) {
+ unsigned long jif;
+ struct drbd_request *req;
+ int n = atomic_read(&device->ap_actlog_cnt);
+ if (n) {
+ spin_lock_irq(&device->resource->req_lock);
+ req = list_first_entry_or_null(&device->pending_master_completion[1],
+ struct drbd_request, req_pending_master_completion);
+ /* if the oldest request does not wait for the activity log
+ * it is not interesting for us here */
+ if (req && !(req->rq_state & RQ_IN_ACT_LOG))
+ jif = req->start_jif;
+ else
+ req = NULL;
+ spin_unlock_irq(&device->resource->req_lock);
+ }
+ if (n) {
+ seq_printf(m, "%u\t%u\t", device->minor, device->vnr);
+ if (req)
+ seq_printf(m, "%u\t", jiffies_to_msecs(now - jif));
+ else
+ seq_puts(m, "-\t");
+ seq_printf(m, "%u\n", n);
+ }
+ }
+ rcu_read_unlock();
+}
+
+static void seq_print_device_bitmap_io(struct seq_file *m, struct drbd_device *device, unsigned long now)
+{
+ struct drbd_bm_aio_ctx *ctx;
+ unsigned long start_jif;
+ unsigned int in_flight;
+ unsigned int flags;
+ spin_lock_irq(&device->resource->req_lock);
+ ctx = list_first_entry_or_null(&device->pending_bitmap_io, struct drbd_bm_aio_ctx, list);
+ if (ctx && ctx->done)
+ ctx = NULL;
+ if (ctx) {
+ start_jif = ctx->start_jif;
+ in_flight = atomic_read(&ctx->in_flight);
+ flags = ctx->flags;
+ }
+ spin_unlock_irq(&device->resource->req_lock);
+ if (ctx) {
+ seq_printf(m, "%u\t%u\t%c\t%u\t%u\n",
+ device->minor, device->vnr,
+ (flags & BM_AIO_READ) ? 'R' : 'W',
+ jiffies_to_msecs(now - start_jif),
+ in_flight);
+ }
+}
+
+static void seq_print_resource_pending_bitmap_io(struct seq_file *m, struct drbd_resource *resource, unsigned long now)
+{
+ struct drbd_device *device;
+ unsigned int i;
+
+ seq_puts(m, "minor\tvnr\trw\tage\t#in-flight\n");
+ rcu_read_lock();
+ idr_for_each_entry(&resource->devices, device, i) {
+ seq_print_device_bitmap_io(m, device, now);
+ }
+ rcu_read_unlock();
+}
+
+/* pretty print enum peer_req->flags */
+static void seq_print_peer_request_flags(struct seq_file *m, struct drbd_peer_request *peer_req)
+{
+ unsigned long f = peer_req->flags;
+ char sep = ' ';
+
+ __seq_print_rq_state_bit(m, f & EE_SUBMITTED, &sep, "submitted", "preparing");
+ __seq_print_rq_state_bit(m, f & EE_APPLICATION, &sep, "application", "internal");
+ seq_print_rq_state_bit(m, f & EE_CALL_AL_COMPLETE_IO, &sep, "in-AL");
+ seq_print_rq_state_bit(m, f & EE_SEND_WRITE_ACK, &sep, "C");
+ seq_print_rq_state_bit(m, f & EE_MAY_SET_IN_SYNC, &sep, "set-in-sync");
+
+ if (f & EE_IS_TRIM) {
+ seq_putc(m, sep);
+ sep = '|';
+ if (f & EE_IS_TRIM_USE_ZEROOUT)
+ seq_puts(m, "zero-out");
+ else
+ seq_puts(m, "trim");
+ }
+ seq_putc(m, '\n');
+}
+
+static void seq_print_peer_request(struct seq_file *m,
+ struct drbd_device *device, struct list_head *lh,
+ unsigned long now)
+{
+ bool reported_preparing = false;
+ struct drbd_peer_request *peer_req;
+ list_for_each_entry(peer_req, lh, w.list) {
+ if (reported_preparing && !(peer_req->flags & EE_SUBMITTED))
+ continue;
+
+ if (device)
+ seq_printf(m, "%u\t%u\t", device->minor, device->vnr);
+
+ seq_printf(m, "%llu\t%u\t%c\t%u\t",
+ (unsigned long long)peer_req->i.sector, peer_req->i.size >> 9,
+ (peer_req->flags & EE_WRITE) ? 'W' : 'R',
+ jiffies_to_msecs(now - peer_req->submit_jif));
+ seq_print_peer_request_flags(m, peer_req);
+ if (peer_req->flags & EE_SUBMITTED)
+ break;
+ else
+ reported_preparing = true;
+ }
+}
+
+static void seq_print_device_peer_requests(struct seq_file *m,
+ struct drbd_device *device, unsigned long now)
+{
+ seq_puts(m, "minor\tvnr\tsector\tsize\trw\tage\tflags\n");
+ spin_lock_irq(&device->resource->req_lock);
+ seq_print_peer_request(m, device, &device->active_ee, now);
+ seq_print_peer_request(m, device, &device->read_ee, now);
+ seq_print_peer_request(m, device, &device->sync_ee, now);
+ spin_unlock_irq(&device->resource->req_lock);
+ if (test_bit(FLUSH_PENDING, &device->flags)) {
+ seq_printf(m, "%u\t%u\t-\t-\tF\t%u\tflush\n",
+ device->minor, device->vnr,
+ jiffies_to_msecs(now - device->flush_jif));
+ }
+}
+
+static void seq_print_resource_pending_peer_requests(struct seq_file *m,
+ struct drbd_resource *resource, unsigned long now)
+{
+ struct drbd_device *device;
+ unsigned int i;
+
+ rcu_read_lock();
+ idr_for_each_entry(&resource->devices, device, i) {
+ seq_print_device_peer_requests(m, device, now);
+ }
+ rcu_read_unlock();
+}
+
+static void seq_print_resource_transfer_log_summary(struct seq_file *m,
+ struct drbd_resource *resource,
+ struct drbd_connection *connection,
+ unsigned long now)
+{
+ struct drbd_request *req;
+ unsigned int count = 0;
+ unsigned int show_state = 0;
+
+ seq_puts(m, "n\tdevice\tvnr\t" RQ_HDR);
+ spin_lock_irq(&resource->req_lock);
+ list_for_each_entry(req, &connection->transfer_log, tl_requests) {
+ unsigned int tmp = 0;
+ unsigned int s;
+ ++count;
+
+ /* don't disable irq "forever" */
+ if (!(count & 0x1ff)) {
+ struct drbd_request *req_next;
+ kref_get(&req->kref);
+ spin_unlock_irq(&resource->req_lock);
+ cond_resched();
+ spin_lock_irq(&resource->req_lock);
+ req_next = list_next_entry(req, tl_requests);
+ if (kref_put(&req->kref, drbd_req_destroy))
+ req = req_next;
+ if (&req->tl_requests == &connection->transfer_log)
+ break;
+ }
+
+ s = req->rq_state;
+
+ /* This is meant to summarize timing issues, to be able to tell
+ * local disk problems from network problems.
+ * Skip requests, if we have shown an even older request with
+ * similar aspects already. */
+ if (req->master_bio == NULL)
+ tmp |= 1;
+ if ((s & RQ_LOCAL_MASK) && (s & RQ_LOCAL_PENDING))
+ tmp |= 2;
+ if (s & RQ_NET_MASK) {
+ if (!(s & RQ_NET_SENT))
+ tmp |= 4;
+ if (s & RQ_NET_PENDING)
+ tmp |= 8;
+ if (!(s & RQ_NET_DONE))
+ tmp |= 16;
+ }
+ if ((tmp & show_state) == tmp)
+ continue;
+ show_state |= tmp;
+ seq_printf(m, "%u\t", count);
+ seq_print_minor_vnr_req(m, req, now);
+ if (show_state == 0x1f)
+ break;
+ }
+ spin_unlock_irq(&resource->req_lock);
+}
+
+/* TODO: transfer_log and friends should be moved to resource */
+static int in_flight_summary_show(struct seq_file *m, void *pos)
+{
+ struct drbd_resource *resource = m->private;
+ struct drbd_connection *connection;
+ unsigned long jif = jiffies;
+
+ connection = first_connection(resource);
+ /* This does not happen, actually.
+ * But be robust and prepare for future code changes. */
+ if (!connection || !kref_get_unless_zero(&connection->kref))
+ return -ESTALE;
+
+ /* BUMP me if you change the file format/content/presentation */
+ seq_printf(m, "v: %u\n\n", 0);
+
+ seq_puts(m, "oldest bitmap IO\n");
+ seq_print_resource_pending_bitmap_io(m, resource, jif);
+ seq_putc(m, '\n');
+
+ seq_puts(m, "meta data IO\n");
+ seq_print_resource_pending_meta_io(m, resource, jif);
+ seq_putc(m, '\n');
+
+ seq_puts(m, "socket buffer stats\n");
+ /* for each connection ... once we have more than one */
+ rcu_read_lock();
+ if (connection->data.socket) {
+ /* open coded SIOCINQ, the "relevant" part */
+ struct tcp_sock *tp = tcp_sk(connection->data.socket->sk);
+ int answ = tp->rcv_nxt - tp->copied_seq;
+ seq_printf(m, "unread receive buffer: %u Byte\n", answ);
+ /* open coded SIOCOUTQ, the "relevant" part */
+ answ = tp->write_seq - tp->snd_una;
+ seq_printf(m, "unacked send buffer: %u Byte\n", answ);
+ }
+ rcu_read_unlock();
+ seq_putc(m, '\n');
+
+ seq_puts(m, "oldest peer requests\n");
+ seq_print_resource_pending_peer_requests(m, resource, jif);
+ seq_putc(m, '\n');
+
+ seq_puts(m, "application requests waiting for activity log\n");
+ seq_print_waiting_for_AL(m, resource, jif);
+ seq_putc(m, '\n');
+
+ seq_puts(m, "oldest application requests\n");
+ seq_print_resource_transfer_log_summary(m, resource, connection, jif);
+ seq_putc(m, '\n');
+
+ jif = jiffies - jif;
+ if (jif)
+ seq_printf(m, "generated in %d ms\n", jiffies_to_msecs(jif));
+ kref_put(&connection->kref, drbd_destroy_connection);
+ return 0;
+}
+
+/* simple_positive(file->f_path.dentry) respectively debugfs_positive(),
+ * but neither is "reachable" from here.
+ * So we have our own inline version of it above. :-( */
+static inline int debugfs_positive(struct dentry *dentry)
+{
+ return d_really_is_positive(dentry) && !d_unhashed(dentry);
+}
+
+/* make sure at *open* time that the respective object won't go away. */
+static int drbd_single_open(struct file *file, int (*show)(struct seq_file *, void *),
+ void *data, struct kref *kref,
+ void (*release)(struct kref *))
+{
+ struct dentry *parent;
+ int ret = -ESTALE;
+
+ /* Are we still linked,
+ * or has debugfs_remove() already been called? */
+ parent = file->f_path.dentry->d_parent;
+ /* not sure if this can happen: */
+ if (!parent || d_really_is_negative(parent))
+ goto out;
+ /* serialize with d_delete() */
+ mutex_lock(&d_inode(parent)->i_mutex);
+ /* Make sure the object is still alive */
+ if (debugfs_positive(file->f_path.dentry)
+ && kref_get_unless_zero(kref))
+ ret = 0;
+ mutex_unlock(&d_inode(parent)->i_mutex);
+ if (!ret) {
+ ret = single_open(file, show, data);
+ if (ret)
+ kref_put(kref, release);
+ }
+out:
+ return ret;
+}
+
+static int in_flight_summary_open(struct inode *inode, struct file *file)
+{
+ struct drbd_resource *resource = inode->i_private;
+ return drbd_single_open(file, in_flight_summary_show, resource,
+ &resource->kref, drbd_destroy_resource);
+}
+
+static int in_flight_summary_release(struct inode *inode, struct file *file)
+{
+ struct drbd_resource *resource = inode->i_private;
+ kref_put(&resource->kref, drbd_destroy_resource);
+ return single_release(inode, file);
+}
+
+static const struct file_operations in_flight_summary_fops = {
+ .owner = THIS_MODULE,
+ .open = in_flight_summary_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = in_flight_summary_release,
+};
+
+void drbd_debugfs_resource_add(struct drbd_resource *resource)
+{
+ struct dentry *dentry;
+ if (!drbd_debugfs_resources)
+ return;
+
+ dentry = debugfs_create_dir(resource->name, drbd_debugfs_resources);
+ if (IS_ERR_OR_NULL(dentry))
+ goto fail;
+ resource->debugfs_res = dentry;
+
+ dentry = debugfs_create_dir("volumes", resource->debugfs_res);
+ if (IS_ERR_OR_NULL(dentry))
+ goto fail;
+ resource->debugfs_res_volumes = dentry;
+
+ dentry = debugfs_create_dir("connections", resource->debugfs_res);
+ if (IS_ERR_OR_NULL(dentry))
+ goto fail;
+ resource->debugfs_res_connections = dentry;
+
+ dentry = debugfs_create_file("in_flight_summary", S_IRUSR|S_IRGRP,
+ resource->debugfs_res, resource,
+ &in_flight_summary_fops);
+ if (IS_ERR_OR_NULL(dentry))
+ goto fail;
+ resource->debugfs_res_in_flight_summary = dentry;
+ return;
+
+fail:
+ drbd_debugfs_resource_cleanup(resource);
+ drbd_err(resource, "failed to create debugfs dentry\n");
+}
+
+static void drbd_debugfs_remove(struct dentry **dp)
+{
+ debugfs_remove(*dp);
+ *dp = NULL;
+}
+
+void drbd_debugfs_resource_cleanup(struct drbd_resource *resource)
+{
+ /* it is ok to call debugfs_remove(NULL) */
+ drbd_debugfs_remove(&resource->debugfs_res_in_flight_summary);
+ drbd_debugfs_remove(&resource->debugfs_res_connections);
+ drbd_debugfs_remove(&resource->debugfs_res_volumes);
+ drbd_debugfs_remove(&resource->debugfs_res);
+}
+
+static void seq_print_one_timing_detail(struct seq_file *m,
+ const struct drbd_thread_timing_details *tdp,
+ unsigned long now)
+{
+ struct drbd_thread_timing_details td;
+ /* No locking...
+ * use temporary assignment to get at consistent data. */
+ do {
+ td = *tdp;
+ } while (td.cb_nr != tdp->cb_nr);
+ if (!td.cb_addr)
+ return;
+ seq_printf(m, "%u\t%d\t%s:%u\t%ps\n",
+ td.cb_nr,
+ jiffies_to_msecs(now - td.start_jif),
+ td.caller_fn, td.line,
+ td.cb_addr);
+}
+
+static void seq_print_timing_details(struct seq_file *m,
+ const char *title,
+ unsigned int cb_nr, struct drbd_thread_timing_details *tdp, unsigned long now)
+{
+ unsigned int start_idx;
+ unsigned int i;
+
+ seq_printf(m, "%s\n", title);
+ /* If not much is going on, this will result in natural ordering.
+ * If it is very busy, we will possibly skip events, or even see wrap
+ * arounds, which could only be avoided with locking.
+ */
+ start_idx = cb_nr % DRBD_THREAD_DETAILS_HIST;
+ for (i = start_idx; i < DRBD_THREAD_DETAILS_HIST; i++)
+ seq_print_one_timing_detail(m, tdp+i, now);
+ for (i = 0; i < start_idx; i++)
+ seq_print_one_timing_detail(m, tdp+i, now);
+}
+
+static int callback_history_show(struct seq_file *m, void *ignored)
+{
+ struct drbd_connection *connection = m->private;
+ unsigned long jif = jiffies;
+
+ /* BUMP me if you change the file format/content/presentation */
+ seq_printf(m, "v: %u\n\n", 0);
+
+ seq_puts(m, "n\tage\tcallsite\tfn\n");
+ seq_print_timing_details(m, "worker", connection->w_cb_nr, connection->w_timing_details, jif);
+ seq_print_timing_details(m, "receiver", connection->r_cb_nr, connection->r_timing_details, jif);
+ return 0;
+}
+
+static int callback_history_open(struct inode *inode, struct file *file)
+{
+ struct drbd_connection *connection = inode->i_private;
+ return drbd_single_open(file, callback_history_show, connection,
+ &connection->kref, drbd_destroy_connection);
+}
+
+static int callback_history_release(struct inode *inode, struct file *file)
+{
+ struct drbd_connection *connection = inode->i_private;
+ kref_put(&connection->kref, drbd_destroy_connection);
+ return single_release(inode, file);
+}
+
+static const struct file_operations connection_callback_history_fops = {
+ .owner = THIS_MODULE,
+ .open = callback_history_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = callback_history_release,
+};
+
+static int connection_oldest_requests_show(struct seq_file *m, void *ignored)
+{
+ struct drbd_connection *connection = m->private;
+ unsigned long now = jiffies;
+ struct drbd_request *r1, *r2;
+
+ /* BUMP me if you change the file format/content/presentation */
+ seq_printf(m, "v: %u\n\n", 0);
+
+ spin_lock_irq(&connection->resource->req_lock);
+ r1 = connection->req_next;
+ if (r1)
+ seq_print_minor_vnr_req(m, r1, now);
+ r2 = connection->req_ack_pending;
+ if (r2 && r2 != r1) {
+ r1 = r2;
+ seq_print_minor_vnr_req(m, r1, now);
+ }
+ r2 = connection->req_not_net_done;
+ if (r2 && r2 != r1)
+ seq_print_minor_vnr_req(m, r2, now);
+ spin_unlock_irq(&connection->resource->req_lock);
+ return 0;
+}
+
+static int connection_oldest_requests_open(struct inode *inode, struct file *file)
+{
+ struct drbd_connection *connection = inode->i_private;
+ return drbd_single_open(file, connection_oldest_requests_show, connection,
+ &connection->kref, drbd_destroy_connection);
+}
+
+static int connection_oldest_requests_release(struct inode *inode, struct file *file)
+{
+ struct drbd_connection *connection = inode->i_private;
+ kref_put(&connection->kref, drbd_destroy_connection);
+ return single_release(inode, file);
+}
+
+static const struct file_operations connection_oldest_requests_fops = {
+ .owner = THIS_MODULE,
+ .open = connection_oldest_requests_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = connection_oldest_requests_release,
+};
+
+void drbd_debugfs_connection_add(struct drbd_connection *connection)
+{
+ struct dentry *conns_dir = connection->resource->debugfs_res_connections;
+ struct dentry *dentry;
+ if (!conns_dir)
+ return;
+
+ /* Once we enable mutliple peers,
+ * these connections will have descriptive names.
+ * For now, it is just the one connection to the (only) "peer". */
+ dentry = debugfs_create_dir("peer", conns_dir);
+ if (IS_ERR_OR_NULL(dentry))
+ goto fail;
+ connection->debugfs_conn = dentry;
+
+ dentry = debugfs_create_file("callback_history", S_IRUSR|S_IRGRP,
+ connection->debugfs_conn, connection,
+ &connection_callback_history_fops);
+ if (IS_ERR_OR_NULL(dentry))
+ goto fail;
+ connection->debugfs_conn_callback_history = dentry;
+
+ dentry = debugfs_create_file("oldest_requests", S_IRUSR|S_IRGRP,
+ connection->debugfs_conn, connection,
+ &connection_oldest_requests_fops);
+ if (IS_ERR_OR_NULL(dentry))
+ goto fail;
+ connection->debugfs_conn_oldest_requests = dentry;
+ return;
+
+fail:
+ drbd_debugfs_connection_cleanup(connection);
+ drbd_err(connection, "failed to create debugfs dentry\n");
+}
+
+void drbd_debugfs_connection_cleanup(struct drbd_connection *connection)
+{
+ drbd_debugfs_remove(&connection->debugfs_conn_callback_history);
+ drbd_debugfs_remove(&connection->debugfs_conn_oldest_requests);
+ drbd_debugfs_remove(&connection->debugfs_conn);
+}
+
+static void resync_dump_detail(struct seq_file *m, struct lc_element *e)
+{
+ struct bm_extent *bme = lc_entry(e, struct bm_extent, lce);
+
+ seq_printf(m, "%5d %s %s %s", bme->rs_left,
+ test_bit(BME_NO_WRITES, &bme->flags) ? "NO_WRITES" : "---------",
+ test_bit(BME_LOCKED, &bme->flags) ? "LOCKED" : "------",
+ test_bit(BME_PRIORITY, &bme->flags) ? "PRIORITY" : "--------"
+ );
+}
+
+static int device_resync_extents_show(struct seq_file *m, void *ignored)
+{
+ struct drbd_device *device = m->private;
+
+ /* BUMP me if you change the file format/content/presentation */
+ seq_printf(m, "v: %u\n\n", 0);
+
+ if (get_ldev_if_state(device, D_FAILED)) {
+ lc_seq_printf_stats(m, device->resync);
+ lc_seq_dump_details(m, device->resync, "rs_left flags", resync_dump_detail);
+ put_ldev(device);
+ }
+ return 0;
+}
+
+static int device_act_log_extents_show(struct seq_file *m, void *ignored)
+{
+ struct drbd_device *device = m->private;
+
+ /* BUMP me if you change the file format/content/presentation */
+ seq_printf(m, "v: %u\n\n", 0);
+
+ if (get_ldev_if_state(device, D_FAILED)) {
+ lc_seq_printf_stats(m, device->act_log);
+ lc_seq_dump_details(m, device->act_log, "", NULL);
+ put_ldev(device);
+ }
+ return 0;
+}
+
+static int device_oldest_requests_show(struct seq_file *m, void *ignored)
+{
+ struct drbd_device *device = m->private;
+ struct drbd_resource *resource = device->resource;
+ unsigned long now = jiffies;
+ struct drbd_request *r1, *r2;
+ int i;
+
+ /* BUMP me if you change the file format/content/presentation */
+ seq_printf(m, "v: %u\n\n", 0);
+
+ seq_puts(m, RQ_HDR);
+ spin_lock_irq(&resource->req_lock);
+ /* WRITE, then READ */
+ for (i = 1; i >= 0; --i) {
+ r1 = list_first_entry_or_null(&device->pending_master_completion[i],
+ struct drbd_request, req_pending_master_completion);
+ r2 = list_first_entry_or_null(&device->pending_completion[i],
+ struct drbd_request, req_pending_local);
+ if (r1)
+ seq_print_one_request(m, r1, now);
+ if (r2 && r2 != r1)
+ seq_print_one_request(m, r2, now);
+ }
+ spin_unlock_irq(&resource->req_lock);
+ return 0;
+}
+
+static int device_data_gen_id_show(struct seq_file *m, void *ignored)
+{
+ struct drbd_device *device = m->private;
+ struct drbd_md *md;
+ enum drbd_uuid_index idx;
+
+ if (!get_ldev_if_state(device, D_FAILED))
+ return -ENODEV;
+
+ md = &device->ldev->md;
+ spin_lock_irq(&md->uuid_lock);
+ for (idx = UI_CURRENT; idx <= UI_HISTORY_END; idx++) {
+ seq_printf(m, "0x%016llX\n", md->uuid[idx]);
+ }
+ spin_unlock_irq(&md->uuid_lock);
+ put_ldev(device);
+ return 0;
+}
+
+#define drbd_debugfs_device_attr(name) \
+static int device_ ## name ## _open(struct inode *inode, struct file *file) \
+{ \
+ struct drbd_device *device = inode->i_private; \
+ return drbd_single_open(file, device_ ## name ## _show, device, \
+ &device->kref, drbd_destroy_device); \
+} \
+static int device_ ## name ## _release(struct inode *inode, struct file *file) \
+{ \
+ struct drbd_device *device = inode->i_private; \
+ kref_put(&device->kref, drbd_destroy_device); \
+ return single_release(inode, file); \
+} \
+static const struct file_operations device_ ## name ## _fops = { \
+ .owner = THIS_MODULE, \
+ .open = device_ ## name ## _open, \
+ .read = seq_read, \
+ .llseek = seq_lseek, \
+ .release = device_ ## name ## _release, \
+};
+
+drbd_debugfs_device_attr(oldest_requests)
+drbd_debugfs_device_attr(act_log_extents)
+drbd_debugfs_device_attr(resync_extents)
+drbd_debugfs_device_attr(data_gen_id)
+
+void drbd_debugfs_device_add(struct drbd_device *device)
+{
+ struct dentry *vols_dir = device->resource->debugfs_res_volumes;
+ char minor_buf[8]; /* MINORMASK, MINORBITS == 20; */
+ char vnr_buf[8]; /* volume number vnr is even 16 bit only; */
+ char *slink_name = NULL;
+
+ struct dentry *dentry;
+ if (!vols_dir || !drbd_debugfs_minors)
+ return;
+
+ snprintf(vnr_buf, sizeof(vnr_buf), "%u", device->vnr);
+ dentry = debugfs_create_dir(vnr_buf, vols_dir);
+ if (IS_ERR_OR_NULL(dentry))
+ goto fail;
+ device->debugfs_vol = dentry;
+
+ snprintf(minor_buf, sizeof(minor_buf), "%u", device->minor);
+ slink_name = kasprintf(GFP_KERNEL, "../resources/%s/volumes/%u",
+ device->resource->name, device->vnr);
+ if (!slink_name)
+ goto fail;
+ dentry = debugfs_create_symlink(minor_buf, drbd_debugfs_minors, slink_name);
+ kfree(slink_name);
+ slink_name = NULL;
+ if (IS_ERR_OR_NULL(dentry))
+ goto fail;
+ device->debugfs_minor = dentry;
+
+#define DCF(name) do { \
+ dentry = debugfs_create_file(#name, S_IRUSR|S_IRGRP, \
+ device->debugfs_vol, device, \
+ &device_ ## name ## _fops); \
+ if (IS_ERR_OR_NULL(dentry)) \
+ goto fail; \
+ device->debugfs_vol_ ## name = dentry; \
+ } while (0)
+
+ DCF(oldest_requests);
+ DCF(act_log_extents);
+ DCF(resync_extents);
+ DCF(data_gen_id);
+#undef DCF
+ return;
+
+fail:
+ drbd_debugfs_device_cleanup(device);
+ drbd_err(device, "failed to create debugfs entries\n");
+}
+
+void drbd_debugfs_device_cleanup(struct drbd_device *device)
+{
+ drbd_debugfs_remove(&device->debugfs_minor);
+ drbd_debugfs_remove(&device->debugfs_vol_oldest_requests);
+ drbd_debugfs_remove(&device->debugfs_vol_act_log_extents);
+ drbd_debugfs_remove(&device->debugfs_vol_resync_extents);
+ drbd_debugfs_remove(&device->debugfs_vol_data_gen_id);
+ drbd_debugfs_remove(&device->debugfs_vol);
+}
+
+void drbd_debugfs_peer_device_add(struct drbd_peer_device *peer_device)
+{
+ struct dentry *conn_dir = peer_device->connection->debugfs_conn;
+ struct dentry *dentry;
+ char vnr_buf[8];
+
+ if (!conn_dir)
+ return;
+
+ snprintf(vnr_buf, sizeof(vnr_buf), "%u", peer_device->device->vnr);
+ dentry = debugfs_create_dir(vnr_buf, conn_dir);
+ if (IS_ERR_OR_NULL(dentry))
+ goto fail;
+ peer_device->debugfs_peer_dev = dentry;
+ return;
+
+fail:
+ drbd_debugfs_peer_device_cleanup(peer_device);
+ drbd_err(peer_device, "failed to create debugfs entries\n");
+}
+
+void drbd_debugfs_peer_device_cleanup(struct drbd_peer_device *peer_device)
+{
+ drbd_debugfs_remove(&peer_device->debugfs_peer_dev);
+}
+
+static int drbd_version_show(struct seq_file *m, void *ignored)
+{
+ seq_printf(m, "# %s\n", drbd_buildtag());
+ seq_printf(m, "VERSION=%s\n", REL_VERSION);
+ seq_printf(m, "API_VERSION=%u\n", API_VERSION);
+ seq_printf(m, "PRO_VERSION_MIN=%u\n", PRO_VERSION_MIN);
+ seq_printf(m, "PRO_VERSION_MAX=%u\n", PRO_VERSION_MAX);
+ return 0;
+}
+
+static int drbd_version_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, drbd_version_show, NULL);
+}
+
+static struct file_operations drbd_version_fops = {
+ .owner = THIS_MODULE,
+ .open = drbd_version_open,
+ .llseek = seq_lseek,
+ .read = seq_read,
+ .release = single_release,
+};
+
+/* not __exit, may be indirectly called
+ * from the module-load-failure path as well. */
+void drbd_debugfs_cleanup(void)
+{
+ drbd_debugfs_remove(&drbd_debugfs_resources);
+ drbd_debugfs_remove(&drbd_debugfs_minors);
+ drbd_debugfs_remove(&drbd_debugfs_version);
+ drbd_debugfs_remove(&drbd_debugfs_root);
+}
+
+int __init drbd_debugfs_init(void)
+{
+ struct dentry *dentry;
+
+ dentry = debugfs_create_dir("drbd", NULL);
+ if (IS_ERR_OR_NULL(dentry))
+ goto fail;
+ drbd_debugfs_root = dentry;
+
+ dentry = debugfs_create_file("version", 0444, drbd_debugfs_root, NULL, &drbd_version_fops);
+ if (IS_ERR_OR_NULL(dentry))
+ goto fail;
+ drbd_debugfs_version = dentry;
+
+ dentry = debugfs_create_dir("resources", drbd_debugfs_root);
+ if (IS_ERR_OR_NULL(dentry))
+ goto fail;
+ drbd_debugfs_resources = dentry;
+
+ dentry = debugfs_create_dir("minors", drbd_debugfs_root);
+ if (IS_ERR_OR_NULL(dentry))
+ goto fail;
+ drbd_debugfs_minors = dentry;
+ return 0;
+
+fail:
+ drbd_debugfs_cleanup();
+ if (dentry)
+ return PTR_ERR(dentry);
+ else
+ return -EINVAL;
+}
diff --git a/drivers/block/drbd/drbd_debugfs.h b/drivers/block/drbd/drbd_debugfs.h
new file mode 100644
index 000000000..8bee21340
--- /dev/null
+++ b/drivers/block/drbd/drbd_debugfs.h
@@ -0,0 +1,39 @@
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/debugfs.h>
+
+#include "drbd_int.h"
+
+#ifdef CONFIG_DEBUG_FS
+int __init drbd_debugfs_init(void);
+void drbd_debugfs_cleanup(void);
+
+void drbd_debugfs_resource_add(struct drbd_resource *resource);
+void drbd_debugfs_resource_cleanup(struct drbd_resource *resource);
+
+void drbd_debugfs_connection_add(struct drbd_connection *connection);
+void drbd_debugfs_connection_cleanup(struct drbd_connection *connection);
+
+void drbd_debugfs_device_add(struct drbd_device *device);
+void drbd_debugfs_device_cleanup(struct drbd_device *device);
+
+void drbd_debugfs_peer_device_add(struct drbd_peer_device *peer_device);
+void drbd_debugfs_peer_device_cleanup(struct drbd_peer_device *peer_device);
+#else
+
+static inline int __init drbd_debugfs_init(void) { return -ENODEV; }
+static inline void drbd_debugfs_cleanup(void) { }
+
+static inline void drbd_debugfs_resource_add(struct drbd_resource *resource) { }
+static inline void drbd_debugfs_resource_cleanup(struct drbd_resource *resource) { }
+
+static inline void drbd_debugfs_connection_add(struct drbd_connection *connection) { }
+static inline void drbd_debugfs_connection_cleanup(struct drbd_connection *connection) { }
+
+static inline void drbd_debugfs_device_add(struct drbd_device *device) { }
+static inline void drbd_debugfs_device_cleanup(struct drbd_device *device) { }
+
+static inline void drbd_debugfs_peer_device_add(struct drbd_peer_device *peer_device) { }
+static inline void drbd_debugfs_peer_device_cleanup(struct drbd_peer_device *peer_device) { }
+
+#endif
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
new file mode 100644
index 000000000..b905e9888
--- /dev/null
+++ b/drivers/block/drbd/drbd_int.h
@@ -0,0 +1,2328 @@
+/*
+ drbd_int.h
+
+ This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+ Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
+ Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+ Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+ drbd is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2, or (at your option)
+ any later version.
+
+ drbd is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with drbd; see the file COPYING. If not, write to
+ the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+
+*/
+
+#ifndef _DRBD_INT_H
+#define _DRBD_INT_H
+
+#include <linux/compiler.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/sched.h>
+#include <linux/bitops.h>
+#include <linux/slab.h>
+#include <linux/crypto.h>
+#include <linux/ratelimit.h>
+#include <linux/tcp.h>
+#include <linux/mutex.h>
+#include <linux/major.h>
+#include <linux/blkdev.h>
+#include <linux/genhd.h>
+#include <linux/idr.h>
+#include <net/tcp.h>
+#include <linux/lru_cache.h>
+#include <linux/prefetch.h>
+#include <linux/drbd_genl_api.h>
+#include <linux/drbd.h>
+#include "drbd_strings.h"
+#include "drbd_state.h"
+#include "drbd_protocol.h"
+
+#ifdef __CHECKER__
+# define __protected_by(x) __attribute__((require_context(x,1,999,"rdwr")))
+# define __protected_read_by(x) __attribute__((require_context(x,1,999,"read")))
+# define __protected_write_by(x) __attribute__((require_context(x,1,999,"write")))
+# define __must_hold(x) __attribute__((context(x,1,1), require_context(x,1,999,"call")))
+#else
+# define __protected_by(x)
+# define __protected_read_by(x)
+# define __protected_write_by(x)
+# define __must_hold(x)
+#endif
+
+/* module parameter, defined in drbd_main.c */
+extern unsigned int minor_count;
+extern bool disable_sendpage;
+extern bool allow_oos;
+void tl_abort_disk_io(struct drbd_device *device);
+
+#ifdef CONFIG_DRBD_FAULT_INJECTION
+extern int enable_faults;
+extern int fault_rate;
+extern int fault_devs;
+#endif
+
+extern char usermode_helper[];
+
+
+/* I don't remember why XCPU ...
+ * This is used to wake the asender,
+ * and to interrupt sending the sending task
+ * on disconnect.
+ */
+#define DRBD_SIG SIGXCPU
+
+/* This is used to stop/restart our threads.
+ * Cannot use SIGTERM nor SIGKILL, since these
+ * are sent out by init on runlevel changes
+ * I choose SIGHUP for now.
+ */
+#define DRBD_SIGKILL SIGHUP
+
+#define ID_IN_SYNC (4711ULL)
+#define ID_OUT_OF_SYNC (4712ULL)
+#define ID_SYNCER (-1ULL)
+
+#define UUID_NEW_BM_OFFSET ((u64)0x0001000000000000ULL)
+
+struct drbd_device;
+struct drbd_connection;
+
+#define __drbd_printk_device(level, device, fmt, args...) \
+ dev_printk(level, disk_to_dev((device)->vdisk), fmt, ## args)
+#define __drbd_printk_peer_device(level, peer_device, fmt, args...) \
+ dev_printk(level, disk_to_dev((peer_device)->device->vdisk), fmt, ## args)
+#define __drbd_printk_resource(level, resource, fmt, args...) \
+ printk(level "drbd %s: " fmt, (resource)->name, ## args)
+#define __drbd_printk_connection(level, connection, fmt, args...) \
+ printk(level "drbd %s: " fmt, (connection)->resource->name, ## args)
+
+void drbd_printk_with_wrong_object_type(void);
+
+#define __drbd_printk_if_same_type(obj, type, func, level, fmt, args...) \
+ (__builtin_types_compatible_p(typeof(obj), type) || \
+ __builtin_types_compatible_p(typeof(obj), const type)), \
+ func(level, (const type)(obj), fmt, ## args)
+
+#define drbd_printk(level, obj, fmt, args...) \
+ __builtin_choose_expr( \
+ __drbd_printk_if_same_type(obj, struct drbd_device *, \
+ __drbd_printk_device, level, fmt, ## args), \
+ __builtin_choose_expr( \
+ __drbd_printk_if_same_type(obj, struct drbd_resource *, \
+ __drbd_printk_resource, level, fmt, ## args), \
+ __builtin_choose_expr( \
+ __drbd_printk_if_same_type(obj, struct drbd_connection *, \
+ __drbd_printk_connection, level, fmt, ## args), \
+ __builtin_choose_expr( \
+ __drbd_printk_if_same_type(obj, struct drbd_peer_device *, \
+ __drbd_printk_peer_device, level, fmt, ## args), \
+ drbd_printk_with_wrong_object_type()))))
+
+#define drbd_dbg(obj, fmt, args...) \
+ drbd_printk(KERN_DEBUG, obj, fmt, ## args)
+#define drbd_alert(obj, fmt, args...) \
+ drbd_printk(KERN_ALERT, obj, fmt, ## args)
+#define drbd_err(obj, fmt, args...) \
+ drbd_printk(KERN_ERR, obj, fmt, ## args)
+#define drbd_warn(obj, fmt, args...) \
+ drbd_printk(KERN_WARNING, obj, fmt, ## args)
+#define drbd_info(obj, fmt, args...) \
+ drbd_printk(KERN_INFO, obj, fmt, ## args)
+#define drbd_emerg(obj, fmt, args...) \
+ drbd_printk(KERN_EMERG, obj, fmt, ## args)
+
+#define dynamic_drbd_dbg(device, fmt, args...) \
+ dynamic_dev_dbg(disk_to_dev(device->vdisk), fmt, ## args)
+
+#define D_ASSERT(device, exp) do { \
+ if (!(exp)) \
+ drbd_err(device, "ASSERT( " #exp " ) in %s:%d\n", __FILE__, __LINE__); \
+ } while (0)
+
+/**
+ * expect - Make an assertion
+ *
+ * Unlike the assert macro, this macro returns a boolean result.
+ */
+#define expect(exp) ({ \
+ bool _bool = (exp); \
+ if (!_bool) \
+ drbd_err(device, "ASSERTION %s FAILED in %s\n", \
+ #exp, __func__); \
+ _bool; \
+ })
+
+/* Defines to control fault insertion */
+enum {
+ DRBD_FAULT_MD_WR = 0, /* meta data write */
+ DRBD_FAULT_MD_RD = 1, /* read */
+ DRBD_FAULT_RS_WR = 2, /* resync */
+ DRBD_FAULT_RS_RD = 3,
+ DRBD_FAULT_DT_WR = 4, /* data */
+ DRBD_FAULT_DT_RD = 5,
+ DRBD_FAULT_DT_RA = 6, /* data read ahead */
+ DRBD_FAULT_BM_ALLOC = 7, /* bitmap allocation */
+ DRBD_FAULT_AL_EE = 8, /* alloc ee */
+ DRBD_FAULT_RECEIVE = 9, /* Changes some bytes upon receiving a [rs]data block */
+
+ DRBD_FAULT_MAX,
+};
+
+extern unsigned int
+_drbd_insert_fault(struct drbd_device *device, unsigned int type);
+
+static inline int
+drbd_insert_fault(struct drbd_device *device, unsigned int type) {
+#ifdef CONFIG_DRBD_FAULT_INJECTION
+ return fault_rate &&
+ (enable_faults & (1<<type)) &&
+ _drbd_insert_fault(device, type);
+#else
+ return 0;
+#endif
+}
+
+/* integer division, round _UP_ to the next integer */
+#define div_ceil(A, B) ((A)/(B) + ((A)%(B) ? 1 : 0))
+/* usual integer division */
+#define div_floor(A, B) ((A)/(B))
+
+extern struct ratelimit_state drbd_ratelimit_state;
+extern struct idr drbd_devices; /* RCU, updates: genl_lock() */
+extern struct list_head drbd_resources; /* RCU, updates: genl_lock() */
+
+extern const char *cmdname(enum drbd_packet cmd);
+
+/* for sending/receiving the bitmap,
+ * possibly in some encoding scheme */
+struct bm_xfer_ctx {
+ /* "const"
+ * stores total bits and long words
+ * of the bitmap, so we don't need to
+ * call the accessor functions over and again. */
+ unsigned long bm_bits;
+ unsigned long bm_words;
+ /* during xfer, current position within the bitmap */
+ unsigned long bit_offset;
+ unsigned long word_offset;
+
+ /* statistics; index: (h->command == P_BITMAP) */
+ unsigned packets[2];
+ unsigned bytes[2];
+};
+
+extern void INFO_bm_xfer_stats(struct drbd_device *device,
+ const char *direction, struct bm_xfer_ctx *c);
+
+static inline void bm_xfer_ctx_bit_to_word_offset(struct bm_xfer_ctx *c)
+{
+ /* word_offset counts "native long words" (32 or 64 bit),
+ * aligned at 64 bit.
+ * Encoded packet may end at an unaligned bit offset.
+ * In case a fallback clear text packet is transmitted in
+ * between, we adjust this offset back to the last 64bit
+ * aligned "native long word", which makes coding and decoding
+ * the plain text bitmap much more convenient. */
+#if BITS_PER_LONG == 64
+ c->word_offset = c->bit_offset >> 6;
+#elif BITS_PER_LONG == 32
+ c->word_offset = c->bit_offset >> 5;
+ c->word_offset &= ~(1UL);
+#else
+# error "unsupported BITS_PER_LONG"
+#endif
+}
+
+extern unsigned int drbd_header_size(struct drbd_connection *connection);
+
+/**********************************************************************/
+enum drbd_thread_state {
+ NONE,
+ RUNNING,
+ EXITING,
+ RESTARTING
+};
+
+struct drbd_thread {
+ spinlock_t t_lock;
+ struct task_struct *task;
+ struct completion stop;
+ enum drbd_thread_state t_state;
+ int (*function) (struct drbd_thread *);
+ struct drbd_resource *resource;
+ struct drbd_connection *connection;
+ int reset_cpu_mask;
+ const char *name;
+};
+
+static inline enum drbd_thread_state get_t_state(struct drbd_thread *thi)
+{
+ /* THINK testing the t_state seems to be uncritical in all cases
+ * (but thread_{start,stop}), so we can read it *without* the lock.
+ * --lge */
+
+ smp_rmb();
+ return thi->t_state;
+}
+
+struct drbd_work {
+ struct list_head list;
+ int (*cb)(struct drbd_work *, int cancel);
+};
+
+struct drbd_device_work {
+ struct drbd_work w;
+ struct drbd_device *device;
+};
+
+#include "drbd_interval.h"
+
+extern int drbd_wait_misc(struct drbd_device *, struct drbd_interval *);
+
+struct drbd_request {
+ struct drbd_work w;
+ struct drbd_device *device;
+
+ /* if local IO is not allowed, will be NULL.
+ * if local IO _is_ allowed, holds the locally submitted bio clone,
+ * or, after local IO completion, the ERR_PTR(error).
+ * see drbd_request_endio(). */
+ struct bio *private_bio;
+
+ struct drbd_interval i;
+
+ /* epoch: used to check on "completion" whether this req was in
+ * the current epoch, and we therefore have to close it,
+ * causing a p_barrier packet to be send, starting a new epoch.
+ *
+ * This corresponds to "barrier" in struct p_barrier[_ack],
+ * and to "barrier_nr" in struct drbd_epoch (and various
+ * comments/function parameters/local variable names).
+ */
+ unsigned int epoch;
+
+ struct list_head tl_requests; /* ring list in the transfer log */
+ struct bio *master_bio; /* master bio pointer */
+
+ /* see struct drbd_device */
+ struct list_head req_pending_master_completion;
+ struct list_head req_pending_local;
+
+ /* for generic IO accounting */
+ unsigned long start_jif;
+
+ /* for DRBD internal statistics */
+
+ /* Minimal set of time stamps to determine if we wait for activity log
+ * transactions, local disk or peer. 32 bit "jiffies" are good enough,
+ * we don't expect a DRBD request to be stalled for several month.
+ */
+
+ /* before actual request processing */
+ unsigned long in_actlog_jif;
+
+ /* local disk */
+ unsigned long pre_submit_jif;
+
+ /* per connection */
+ unsigned long pre_send_jif;
+ unsigned long acked_jif;
+ unsigned long net_done_jif;
+
+ /* Possibly even more detail to track each phase:
+ * master_completion_jif
+ * how long did it take to complete the master bio
+ * (application visible latency)
+ * allocated_jif
+ * how long the master bio was blocked until we finally allocated
+ * a tracking struct
+ * in_actlog_jif
+ * how long did we wait for activity log transactions
+ *
+ * net_queued_jif
+ * when did we finally queue it for sending
+ * pre_send_jif
+ * when did we start sending it
+ * post_send_jif
+ * how long did we block in the network stack trying to send it
+ * acked_jif
+ * when did we receive (or fake, in protocol A) a remote ACK
+ * net_done_jif
+ * when did we receive final acknowledgement (P_BARRIER_ACK),
+ * or decide, e.g. on connection loss, that we do no longer expect
+ * anything from this peer for this request.
+ *
+ * pre_submit_jif
+ * post_sub_jif
+ * when did we start submiting to the lower level device,
+ * and how long did we block in that submit function
+ * local_completion_jif
+ * how long did it take the lower level device to complete this request
+ */
+
+
+ /* once it hits 0, we may complete the master_bio */
+ atomic_t completion_ref;
+ /* once it hits 0, we may destroy this drbd_request object */
+ struct kref kref;
+
+ unsigned rq_state; /* see comments above _req_mod() */
+};
+
+struct drbd_epoch {
+ struct drbd_connection *connection;
+ struct list_head list;
+ unsigned int barrier_nr;
+ atomic_t epoch_size; /* increased on every request added. */
+ atomic_t active; /* increased on every req. added, and dec on every finished. */
+ unsigned long flags;
+};
+
+/* Prototype declaration of function defined in drbd_receiver.c */
+int drbdd_init(struct drbd_thread *);
+int drbd_asender(struct drbd_thread *);
+
+/* drbd_epoch flag bits */
+enum {
+ DE_HAVE_BARRIER_NUMBER,
+};
+
+enum epoch_event {
+ EV_PUT,
+ EV_GOT_BARRIER_NR,
+ EV_BECAME_LAST,
+ EV_CLEANUP = 32, /* used as flag */
+};
+
+struct digest_info {
+ int digest_size;
+ void *digest;
+};
+
+struct drbd_peer_request {
+ struct drbd_work w;
+ struct drbd_peer_device *peer_device;
+ struct drbd_epoch *epoch; /* for writes */
+ struct page *pages;
+ atomic_t pending_bios;
+ struct drbd_interval i;
+ /* see comments on ee flag bits below */
+ unsigned long flags;
+ unsigned long submit_jif;
+ union {
+ u64 block_id;
+ struct digest_info *digest;
+ };
+};
+
+/* ee flag bits.
+ * While corresponding bios are in flight, the only modification will be
+ * set_bit WAS_ERROR, which has to be atomic.
+ * If no bios are in flight yet, or all have been completed,
+ * non-atomic modification to ee->flags is ok.
+ */
+enum {
+ __EE_CALL_AL_COMPLETE_IO,
+ __EE_MAY_SET_IN_SYNC,
+
+ /* is this a TRIM aka REQ_DISCARD? */
+ __EE_IS_TRIM,
+ /* our lower level cannot handle trim,
+ * and we want to fall back to zeroout instead */
+ __EE_IS_TRIM_USE_ZEROOUT,
+
+ /* In case a barrier failed,
+ * we need to resubmit without the barrier flag. */
+ __EE_RESUBMITTED,
+
+ /* we may have several bios per peer request.
+ * if any of those fail, we set this flag atomically
+ * from the endio callback */
+ __EE_WAS_ERROR,
+
+ /* This ee has a pointer to a digest instead of a block id */
+ __EE_HAS_DIGEST,
+
+ /* Conflicting local requests need to be restarted after this request */
+ __EE_RESTART_REQUESTS,
+
+ /* The peer wants a write ACK for this (wire proto C) */
+ __EE_SEND_WRITE_ACK,
+
+ /* Is set when net_conf had two_primaries set while creating this peer_req */
+ __EE_IN_INTERVAL_TREE,
+
+ /* for debugfs: */
+ /* has this been submitted, or does it still wait for something else? */
+ __EE_SUBMITTED,
+
+ /* this is/was a write request */
+ __EE_WRITE,
+
+ /* this originates from application on peer
+ * (not some resync or verify or other DRBD internal request) */
+ __EE_APPLICATION,
+};
+#define EE_CALL_AL_COMPLETE_IO (1<<__EE_CALL_AL_COMPLETE_IO)
+#define EE_MAY_SET_IN_SYNC (1<<__EE_MAY_SET_IN_SYNC)
+#define EE_IS_TRIM (1<<__EE_IS_TRIM)
+#define EE_IS_TRIM_USE_ZEROOUT (1<<__EE_IS_TRIM_USE_ZEROOUT)
+#define EE_RESUBMITTED (1<<__EE_RESUBMITTED)
+#define EE_WAS_ERROR (1<<__EE_WAS_ERROR)
+#define EE_HAS_DIGEST (1<<__EE_HAS_DIGEST)
+#define EE_RESTART_REQUESTS (1<<__EE_RESTART_REQUESTS)
+#define EE_SEND_WRITE_ACK (1<<__EE_SEND_WRITE_ACK)
+#define EE_IN_INTERVAL_TREE (1<<__EE_IN_INTERVAL_TREE)
+#define EE_SUBMITTED (1<<__EE_SUBMITTED)
+#define EE_WRITE (1<<__EE_WRITE)
+#define EE_APPLICATION (1<<__EE_APPLICATION)
+
+/* flag bits per device */
+enum {
+ UNPLUG_REMOTE, /* sending a "UnplugRemote" could help */
+ MD_DIRTY, /* current uuids and flags not yet on disk */
+ USE_DEGR_WFC_T, /* degr-wfc-timeout instead of wfc-timeout. */
+ CL_ST_CHG_SUCCESS,
+ CL_ST_CHG_FAIL,
+ CRASHED_PRIMARY, /* This node was a crashed primary.
+ * Gets cleared when the state.conn
+ * goes into C_CONNECTED state. */
+ CONSIDER_RESYNC,
+
+ MD_NO_FUA, /* Users wants us to not use FUA/FLUSH on meta data dev */
+
+ SUSPEND_IO, /* suspend application io */
+ BITMAP_IO, /* suspend application io;
+ once no more io in flight, start bitmap io */
+ BITMAP_IO_QUEUED, /* Started bitmap IO */
+ WAS_IO_ERROR, /* Local disk failed, returned IO error */
+ WAS_READ_ERROR, /* Local disk READ failed (set additionally to the above) */
+ FORCE_DETACH, /* Force-detach from local disk, aborting any pending local IO */
+ RESYNC_AFTER_NEG, /* Resync after online grow after the attach&negotiate finished. */
+ RESIZE_PENDING, /* Size change detected locally, waiting for the response from
+ * the peer, if it changed there as well. */
+ NEW_CUR_UUID, /* Create new current UUID when thawing IO */
+ AL_SUSPENDED, /* Activity logging is currently suspended. */
+ AHEAD_TO_SYNC_SOURCE, /* Ahead -> SyncSource queued */
+ B_RS_H_DONE, /* Before resync handler done (already executed) */
+ DISCARD_MY_DATA, /* discard_my_data flag per volume */
+ READ_BALANCE_RR,
+
+ FLUSH_PENDING, /* if set, device->flush_jif is when we submitted that flush
+ * from drbd_flush_after_epoch() */
+
+ /* cleared only after backing device related structures have been destroyed. */
+ GOING_DISKLESS, /* Disk is being detached, because of io-error, or admin request. */
+
+ /* to be used in drbd_device_post_work() */
+ GO_DISKLESS, /* tell worker to schedule cleanup before detach */
+ DESTROY_DISK, /* tell worker to close backing devices and destroy related structures. */
+ MD_SYNC, /* tell worker to call drbd_md_sync() */
+ RS_START, /* tell worker to start resync/OV */
+ RS_PROGRESS, /* tell worker that resync made significant progress */
+ RS_DONE, /* tell worker that resync is done */
+};
+
+struct drbd_bitmap; /* opaque for drbd_device */
+
+/* definition of bits in bm_flags to be used in drbd_bm_lock
+ * and drbd_bitmap_io and friends. */
+enum bm_flag {
+ /* do we need to kfree, or vfree bm_pages? */
+ BM_P_VMALLOCED = 0x10000, /* internal use only, will be masked out */
+
+ /* currently locked for bulk operation */
+ BM_LOCKED_MASK = 0xf,
+
+ /* in detail, that is: */
+ BM_DONT_CLEAR = 0x1,
+ BM_DONT_SET = 0x2,
+ BM_DONT_TEST = 0x4,
+
+ /* so we can mark it locked for bulk operation,
+ * and still allow all non-bulk operations */
+ BM_IS_LOCKED = 0x8,
+
+ /* (test bit, count bit) allowed (common case) */
+ BM_LOCKED_TEST_ALLOWED = BM_DONT_CLEAR | BM_DONT_SET | BM_IS_LOCKED,
+
+ /* testing bits, as well as setting new bits allowed, but clearing bits
+ * would be unexpected. Used during bitmap receive. Setting new bits
+ * requires sending of "out-of-sync" information, though. */
+ BM_LOCKED_SET_ALLOWED = BM_DONT_CLEAR | BM_IS_LOCKED,
+
+ /* for drbd_bm_write_copy_pages, everything is allowed,
+ * only concurrent bulk operations are locked out. */
+ BM_LOCKED_CHANGE_ALLOWED = BM_IS_LOCKED,
+};
+
+struct drbd_work_queue {
+ struct list_head q;
+ spinlock_t q_lock; /* to protect the list. */
+ wait_queue_head_t q_wait;
+};
+
+struct drbd_socket {
+ struct mutex mutex;
+ struct socket *socket;
+ /* this way we get our
+ * send/receive buffers off the stack */
+ void *sbuf;
+ void *rbuf;
+};
+
+struct drbd_md {
+ u64 md_offset; /* sector offset to 'super' block */
+
+ u64 la_size_sect; /* last agreed size, unit sectors */
+ spinlock_t uuid_lock;
+ u64 uuid[UI_SIZE];
+ u64 device_uuid;
+ u32 flags;
+ u32 md_size_sect;
+
+ s32 al_offset; /* signed relative sector offset to activity log */
+ s32 bm_offset; /* signed relative sector offset to bitmap */
+
+ /* cached value of bdev->disk_conf->meta_dev_idx (see below) */
+ s32 meta_dev_idx;
+
+ /* see al_tr_number_to_on_disk_sector() */
+ u32 al_stripes;
+ u32 al_stripe_size_4k;
+ u32 al_size_4k; /* cached product of the above */
+};
+
+struct drbd_backing_dev {
+ struct block_device *backing_bdev;
+ struct block_device *md_bdev;
+ struct drbd_md md;
+ struct disk_conf *disk_conf; /* RCU, for updates: resource->conf_update */
+ sector_t known_size; /* last known size of that backing device */
+};
+
+struct drbd_md_io {
+ struct page *page;
+ unsigned long start_jif; /* last call to drbd_md_get_buffer */
+ unsigned long submit_jif; /* last _drbd_md_sync_page_io() submit */
+ const char *current_use;
+ atomic_t in_use;
+ unsigned int done;
+ int error;
+};
+
+struct bm_io_work {
+ struct drbd_work w;
+ char *why;
+ enum bm_flag flags;
+ int (*io_fn)(struct drbd_device *device);
+ void (*done)(struct drbd_device *device, int rv);
+};
+
+enum write_ordering_e {
+ WO_none,
+ WO_drain_io,
+ WO_bdev_flush,
+};
+
+struct fifo_buffer {
+ unsigned int head_index;
+ unsigned int size;
+ int total; /* sum of all values */
+ int values[0];
+};
+extern struct fifo_buffer *fifo_alloc(int fifo_size);
+
+/* flag bits per connection */
+enum {
+ NET_CONGESTED, /* The data socket is congested */
+ RESOLVE_CONFLICTS, /* Set on one node, cleared on the peer! */
+ SEND_PING, /* whether asender should send a ping asap */
+ SIGNAL_ASENDER, /* whether asender wants to be interrupted */
+ GOT_PING_ACK, /* set when we receive a ping_ack packet, ping_wait gets woken */
+ CONN_WD_ST_CHG_REQ, /* A cluster wide state change on the connection is active */
+ CONN_WD_ST_CHG_OKAY,
+ CONN_WD_ST_CHG_FAIL,
+ CONN_DRY_RUN, /* Expect disconnect after resync handshake. */
+ CREATE_BARRIER, /* next P_DATA is preceded by a P_BARRIER */
+ STATE_SENT, /* Do not change state/UUIDs while this is set */
+ CALLBACK_PENDING, /* Whether we have a call_usermodehelper(, UMH_WAIT_PROC)
+ * pending, from drbd worker context.
+ * If set, bdi_write_congested() returns true,
+ * so shrink_page_list() would not recurse into,
+ * and potentially deadlock on, this drbd worker.
+ */
+ DISCONNECT_SENT,
+
+ DEVICE_WORK_PENDING, /* tell worker that some device has pending work */
+};
+
+struct drbd_resource {
+ char *name;
+#ifdef CONFIG_DEBUG_FS
+ struct dentry *debugfs_res;
+ struct dentry *debugfs_res_volumes;
+ struct dentry *debugfs_res_connections;
+ struct dentry *debugfs_res_in_flight_summary;
+#endif
+ struct kref kref;
+ struct idr devices; /* volume number to device mapping */
+ struct list_head connections;
+ struct list_head resources;
+ struct res_opts res_opts;
+ struct mutex conf_update; /* mutex for ready-copy-update of net_conf and disk_conf */
+ struct mutex adm_mutex; /* mutex to serialize administrative requests */
+ spinlock_t req_lock;
+
+ unsigned susp:1; /* IO suspended by user */
+ unsigned susp_nod:1; /* IO suspended because no data */
+ unsigned susp_fen:1; /* IO suspended because fence peer handler runs */
+
+ enum write_ordering_e write_ordering;
+
+ cpumask_var_t cpu_mask;
+};
+
+struct drbd_thread_timing_details
+{
+ unsigned long start_jif;
+ void *cb_addr;
+ const char *caller_fn;
+ unsigned int line;
+ unsigned int cb_nr;
+};
+
+struct drbd_connection {
+ struct list_head connections;
+ struct drbd_resource *resource;
+#ifdef CONFIG_DEBUG_FS
+ struct dentry *debugfs_conn;
+ struct dentry *debugfs_conn_callback_history;
+ struct dentry *debugfs_conn_oldest_requests;
+#endif
+ struct kref kref;
+ struct idr peer_devices; /* volume number to peer device mapping */
+ enum drbd_conns cstate; /* Only C_STANDALONE to C_WF_REPORT_PARAMS */
+ struct mutex cstate_mutex; /* Protects graceful disconnects */
+ unsigned int connect_cnt; /* Inc each time a connection is established */
+
+ unsigned long flags;
+ struct net_conf *net_conf; /* content protected by rcu */
+ wait_queue_head_t ping_wait; /* Woken upon reception of a ping, and a state change */
+
+ struct sockaddr_storage my_addr;
+ int my_addr_len;
+ struct sockaddr_storage peer_addr;
+ int peer_addr_len;
+
+ struct drbd_socket data; /* data/barrier/cstate/parameter packets */
+ struct drbd_socket meta; /* ping/ack (metadata) packets */
+ int agreed_pro_version; /* actually used protocol version */
+ u32 agreed_features;
+ unsigned long last_received; /* in jiffies, either socket */
+ unsigned int ko_count;
+
+ struct list_head transfer_log; /* all requests not yet fully processed */
+
+ struct crypto_hash *cram_hmac_tfm;
+ struct crypto_hash *integrity_tfm; /* checksums we compute, updates protected by connection->data->mutex */
+ struct crypto_hash *peer_integrity_tfm; /* checksums we verify, only accessed from receiver thread */
+ struct crypto_hash *csums_tfm;
+ struct crypto_hash *verify_tfm;
+ void *int_dig_in;
+ void *int_dig_vv;
+
+ /* receiver side */
+ struct drbd_epoch *current_epoch;
+ spinlock_t epoch_lock;
+ unsigned int epochs;
+ atomic_t current_tle_nr; /* transfer log epoch number */
+ unsigned current_tle_writes; /* writes seen within this tl epoch */
+
+ unsigned long last_reconnect_jif;
+ struct drbd_thread receiver;
+ struct drbd_thread worker;
+ struct drbd_thread asender;
+
+ /* cached pointers,
+ * so we can look up the oldest pending requests more quickly.
+ * protected by resource->req_lock */
+ struct drbd_request *req_next; /* DRBD 9: todo.req_next */
+ struct drbd_request *req_ack_pending;
+ struct drbd_request *req_not_net_done;
+
+ /* sender side */
+ struct drbd_work_queue sender_work;
+
+#define DRBD_THREAD_DETAILS_HIST 16
+ unsigned int w_cb_nr; /* keeps counting up */
+ unsigned int r_cb_nr; /* keeps counting up */
+ struct drbd_thread_timing_details w_timing_details[DRBD_THREAD_DETAILS_HIST];
+ struct drbd_thread_timing_details r_timing_details[DRBD_THREAD_DETAILS_HIST];
+
+ struct {
+ /* whether this sender thread
+ * has processed a single write yet. */
+ bool seen_any_write_yet;
+
+ /* Which barrier number to send with the next P_BARRIER */
+ int current_epoch_nr;
+
+ /* how many write requests have been sent
+ * with req->epoch == current_epoch_nr.
+ * If none, no P_BARRIER will be sent. */
+ unsigned current_epoch_writes;
+ } send;
+};
+
+void __update_timing_details(
+ struct drbd_thread_timing_details *tdp,
+ unsigned int *cb_nr,
+ void *cb,
+ const char *fn, const unsigned int line);
+
+#define update_worker_timing_details(c, cb) \
+ __update_timing_details(c->w_timing_details, &c->w_cb_nr, cb, __func__ , __LINE__ )
+#define update_receiver_timing_details(c, cb) \
+ __update_timing_details(c->r_timing_details, &c->r_cb_nr, cb, __func__ , __LINE__ )
+
+struct submit_worker {
+ struct workqueue_struct *wq;
+ struct work_struct worker;
+
+ /* protected by ..->resource->req_lock */
+ struct list_head writes;
+};
+
+struct drbd_peer_device {
+ struct list_head peer_devices;
+ struct drbd_device *device;
+ struct drbd_connection *connection;
+#ifdef CONFIG_DEBUG_FS
+ struct dentry *debugfs_peer_dev;
+#endif
+};
+
+struct drbd_device {
+ struct drbd_resource *resource;
+ struct list_head peer_devices;
+ struct list_head pending_bitmap_io;
+
+ unsigned long flush_jif;
+#ifdef CONFIG_DEBUG_FS
+ struct dentry *debugfs_minor;
+ struct dentry *debugfs_vol;
+ struct dentry *debugfs_vol_oldest_requests;
+ struct dentry *debugfs_vol_act_log_extents;
+ struct dentry *debugfs_vol_resync_extents;
+ struct dentry *debugfs_vol_data_gen_id;
+#endif
+
+ unsigned int vnr; /* volume number within the connection */
+ unsigned int minor; /* device minor number */
+
+ struct kref kref;
+
+ /* things that are stored as / read from meta data on disk */
+ unsigned long flags;
+
+ /* configured by drbdsetup */
+ struct drbd_backing_dev *ldev __protected_by(local);
+
+ sector_t p_size; /* partner's disk size */
+ struct request_queue *rq_queue;
+ struct block_device *this_bdev;
+ struct gendisk *vdisk;
+
+ unsigned long last_reattach_jif;
+ struct drbd_work resync_work;
+ struct drbd_work unplug_work;
+ struct timer_list resync_timer;
+ struct timer_list md_sync_timer;
+ struct timer_list start_resync_timer;
+ struct timer_list request_timer;
+
+ /* Used after attach while negotiating new disk state. */
+ union drbd_state new_state_tmp;
+
+ union drbd_dev_state state;
+ wait_queue_head_t misc_wait;
+ wait_queue_head_t state_wait; /* upon each state change. */
+ unsigned int send_cnt;
+ unsigned int recv_cnt;
+ unsigned int read_cnt;
+ unsigned int writ_cnt;
+ unsigned int al_writ_cnt;
+ unsigned int bm_writ_cnt;
+ atomic_t ap_bio_cnt; /* Requests we need to complete */
+ atomic_t ap_actlog_cnt; /* Requests waiting for activity log */
+ atomic_t ap_pending_cnt; /* AP data packets on the wire, ack expected */
+ atomic_t rs_pending_cnt; /* RS request/data packets on the wire */
+ atomic_t unacked_cnt; /* Need to send replies for */
+ atomic_t local_cnt; /* Waiting for local completion */
+
+ /* Interval tree of pending local requests */
+ struct rb_root read_requests;
+ struct rb_root write_requests;
+
+ /* for statistics and timeouts */
+ /* [0] read, [1] write */
+ struct list_head pending_master_completion[2];
+ struct list_head pending_completion[2];
+
+ /* use checksums for *this* resync */
+ bool use_csums;
+ /* blocks to resync in this run [unit BM_BLOCK_SIZE] */
+ unsigned long rs_total;
+ /* number of resync blocks that failed in this run */
+ unsigned long rs_failed;
+ /* Syncer's start time [unit jiffies] */
+ unsigned long rs_start;
+ /* cumulated time in PausedSyncX state [unit jiffies] */
+ unsigned long rs_paused;
+ /* skipped because csum was equal [unit BM_BLOCK_SIZE] */
+ unsigned long rs_same_csum;
+#define DRBD_SYNC_MARKS 8
+#define DRBD_SYNC_MARK_STEP (3*HZ)
+ /* block not up-to-date at mark [unit BM_BLOCK_SIZE] */
+ unsigned long rs_mark_left[DRBD_SYNC_MARKS];
+ /* marks's time [unit jiffies] */
+ unsigned long rs_mark_time[DRBD_SYNC_MARKS];
+ /* current index into rs_mark_{left,time} */
+ int rs_last_mark;
+ unsigned long rs_last_bcast; /* [unit jiffies] */
+
+ /* where does the admin want us to start? (sector) */
+ sector_t ov_start_sector;
+ sector_t ov_stop_sector;
+ /* where are we now? (sector) */
+ sector_t ov_position;
+ /* Start sector of out of sync range (to merge printk reporting). */
+ sector_t ov_last_oos_start;
+ /* size of out-of-sync range in sectors. */
+ sector_t ov_last_oos_size;
+ unsigned long ov_left; /* in bits */
+
+ struct drbd_bitmap *bitmap;
+ unsigned long bm_resync_fo; /* bit offset for drbd_bm_find_next */
+
+ /* Used to track operations of resync... */
+ struct lru_cache *resync;
+ /* Number of locked elements in resync LRU */
+ unsigned int resync_locked;
+ /* resync extent number waiting for application requests */
+ unsigned int resync_wenr;
+
+ int open_cnt;
+ u64 *p_uuid;
+
+ struct list_head active_ee; /* IO in progress (P_DATA gets written to disk) */
+ struct list_head sync_ee; /* IO in progress (P_RS_DATA_REPLY gets written to disk) */
+ struct list_head done_ee; /* need to send P_WRITE_ACK */
+ struct list_head read_ee; /* [RS]P_DATA_REQUEST being read */
+ struct list_head net_ee; /* zero-copy network send in progress */
+
+ int next_barrier_nr;
+ struct list_head resync_reads;
+ atomic_t pp_in_use; /* allocated from page pool */
+ atomic_t pp_in_use_by_net; /* sendpage()d, still referenced by tcp */
+ wait_queue_head_t ee_wait;
+ struct drbd_md_io md_io;
+ spinlock_t al_lock;
+ wait_queue_head_t al_wait;
+ struct lru_cache *act_log; /* activity log */
+ unsigned int al_tr_number;
+ int al_tr_cycle;
+ wait_queue_head_t seq_wait;
+ atomic_t packet_seq;
+ unsigned int peer_seq;
+ spinlock_t peer_seq_lock;
+ unsigned long comm_bm_set; /* communicated number of set bits. */
+ struct bm_io_work bm_io_work;
+ u64 ed_uuid; /* UUID of the exposed data */
+ struct mutex own_state_mutex;
+ struct mutex *state_mutex; /* either own_state_mutex or first_peer_device(device)->connection->cstate_mutex */
+ char congestion_reason; /* Why we where congested... */
+ atomic_t rs_sect_in; /* for incoming resync data rate, SyncTarget */
+ atomic_t rs_sect_ev; /* for submitted resync data rate, both */
+ int rs_last_sect_ev; /* counter to compare with */
+ int rs_last_events; /* counter of read or write "events" (unit sectors)
+ * on the lower level device when we last looked. */
+ int c_sync_rate; /* current resync rate after syncer throttle magic */
+ struct fifo_buffer *rs_plan_s; /* correction values of resync planer (RCU, connection->conn_update) */
+ int rs_in_flight; /* resync sectors in flight (to proxy, in proxy and from proxy) */
+ atomic_t ap_in_flight; /* App sectors in flight (waiting for ack) */
+ unsigned int peer_max_bio_size;
+ unsigned int local_max_bio_size;
+
+ /* any requests that would block in drbd_make_request()
+ * are deferred to this single-threaded work queue */
+ struct submit_worker submit;
+};
+
+struct drbd_bm_aio_ctx {
+ struct drbd_device *device;
+ struct list_head list; /* on device->pending_bitmap_io */;
+ unsigned long start_jif;
+ atomic_t in_flight;
+ unsigned int done;
+ unsigned flags;
+#define BM_AIO_COPY_PAGES 1
+#define BM_AIO_WRITE_HINTED 2
+#define BM_AIO_WRITE_ALL_PAGES 4
+#define BM_AIO_READ 8
+ int error;
+ struct kref kref;
+};
+
+struct drbd_config_context {
+ /* assigned from drbd_genlmsghdr */
+ unsigned int minor;
+ /* assigned from request attributes, if present */
+ unsigned int volume;
+#define VOLUME_UNSPECIFIED (-1U)
+ /* pointer into the request skb,
+ * limited lifetime! */
+ char *resource_name;
+ struct nlattr *my_addr;
+ struct nlattr *peer_addr;
+
+ /* reply buffer */
+ struct sk_buff *reply_skb;
+ /* pointer into reply buffer */
+ struct drbd_genlmsghdr *reply_dh;
+ /* resolved from attributes, if possible */
+ struct drbd_device *device;
+ struct drbd_resource *resource;
+ struct drbd_connection *connection;
+};
+
+static inline struct drbd_device *minor_to_device(unsigned int minor)
+{
+ return (struct drbd_device *)idr_find(&drbd_devices, minor);
+}
+
+static inline struct drbd_peer_device *first_peer_device(struct drbd_device *device)
+{
+ return list_first_entry_or_null(&device->peer_devices, struct drbd_peer_device, peer_devices);
+}
+
+#define for_each_resource(resource, _resources) \
+ list_for_each_entry(resource, _resources, resources)
+
+#define for_each_resource_rcu(resource, _resources) \
+ list_for_each_entry_rcu(resource, _resources, resources)
+
+#define for_each_resource_safe(resource, tmp, _resources) \
+ list_for_each_entry_safe(resource, tmp, _resources, resources)
+
+#define for_each_connection(connection, resource) \
+ list_for_each_entry(connection, &resource->connections, connections)
+
+#define for_each_connection_rcu(connection, resource) \
+ list_for_each_entry_rcu(connection, &resource->connections, connections)
+
+#define for_each_connection_safe(connection, tmp, resource) \
+ list_for_each_entry_safe(connection, tmp, &resource->connections, connections)
+
+#define for_each_peer_device(peer_device, device) \
+ list_for_each_entry(peer_device, &device->peer_devices, peer_devices)
+
+#define for_each_peer_device_rcu(peer_device, device) \
+ list_for_each_entry_rcu(peer_device, &device->peer_devices, peer_devices)
+
+#define for_each_peer_device_safe(peer_device, tmp, device) \
+ list_for_each_entry_safe(peer_device, tmp, &device->peer_devices, peer_devices)
+
+static inline unsigned int device_to_minor(struct drbd_device *device)
+{
+ return device->minor;
+}
+
+/*
+ * function declarations
+ *************************/
+
+/* drbd_main.c */
+
+enum dds_flags {
+ DDSF_FORCED = 1,
+ DDSF_NO_RESYNC = 2, /* Do not run a resync for the new space */
+};
+
+extern void drbd_init_set_defaults(struct drbd_device *device);
+extern int drbd_thread_start(struct drbd_thread *thi);
+extern void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait);
+#ifdef CONFIG_SMP
+extern void drbd_thread_current_set_cpu(struct drbd_thread *thi);
+#else
+#define drbd_thread_current_set_cpu(A) ({})
+#endif
+extern void tl_release(struct drbd_connection *, unsigned int barrier_nr,
+ unsigned int set_size);
+extern void tl_clear(struct drbd_connection *);
+extern void drbd_free_sock(struct drbd_connection *connection);
+extern int drbd_send(struct drbd_connection *connection, struct socket *sock,
+ void *buf, size_t size, unsigned msg_flags);
+extern int drbd_send_all(struct drbd_connection *, struct socket *, void *, size_t,
+ unsigned);
+
+extern int __drbd_send_protocol(struct drbd_connection *connection, enum drbd_packet cmd);
+extern int drbd_send_protocol(struct drbd_connection *connection);
+extern int drbd_send_uuids(struct drbd_peer_device *);
+extern int drbd_send_uuids_skip_initial_sync(struct drbd_peer_device *);
+extern void drbd_gen_and_send_sync_uuid(struct drbd_peer_device *);
+extern int drbd_send_sizes(struct drbd_peer_device *, int trigger_reply, enum dds_flags flags);
+extern int drbd_send_state(struct drbd_peer_device *, union drbd_state s);
+extern int drbd_send_current_state(struct drbd_peer_device *);
+extern int drbd_send_sync_param(struct drbd_peer_device *);
+extern void drbd_send_b_ack(struct drbd_connection *connection, u32 barrier_nr,
+ u32 set_size);
+extern int drbd_send_ack(struct drbd_peer_device *, enum drbd_packet,
+ struct drbd_peer_request *);
+extern void drbd_send_ack_rp(struct drbd_peer_device *, enum drbd_packet,
+ struct p_block_req *rp);
+extern void drbd_send_ack_dp(struct drbd_peer_device *, enum drbd_packet,
+ struct p_data *dp, int data_size);
+extern int drbd_send_ack_ex(struct drbd_peer_device *, enum drbd_packet,
+ sector_t sector, int blksize, u64 block_id);
+extern int drbd_send_out_of_sync(struct drbd_peer_device *, struct drbd_request *);
+extern int drbd_send_block(struct drbd_peer_device *, enum drbd_packet,
+ struct drbd_peer_request *);
+extern int drbd_send_dblock(struct drbd_peer_device *, struct drbd_request *req);
+extern int drbd_send_drequest(struct drbd_peer_device *, int cmd,
+ sector_t sector, int size, u64 block_id);
+extern int drbd_send_drequest_csum(struct drbd_peer_device *, sector_t sector,
+ int size, void *digest, int digest_size,
+ enum drbd_packet cmd);
+extern int drbd_send_ov_request(struct drbd_peer_device *, sector_t sector, int size);
+
+extern int drbd_send_bitmap(struct drbd_device *device);
+extern void drbd_send_sr_reply(struct drbd_peer_device *, enum drbd_state_rv retcode);
+extern void conn_send_sr_reply(struct drbd_connection *connection, enum drbd_state_rv retcode);
+extern void drbd_free_ldev(struct drbd_backing_dev *ldev);
+extern void drbd_device_cleanup(struct drbd_device *device);
+void drbd_print_uuids(struct drbd_device *device, const char *text);
+
+extern void conn_md_sync(struct drbd_connection *connection);
+extern void drbd_md_write(struct drbd_device *device, void *buffer);
+extern void drbd_md_sync(struct drbd_device *device);
+extern int drbd_md_read(struct drbd_device *device, struct drbd_backing_dev *bdev);
+extern void drbd_uuid_set(struct drbd_device *device, int idx, u64 val) __must_hold(local);
+extern void _drbd_uuid_set(struct drbd_device *device, int idx, u64 val) __must_hold(local);
+extern void drbd_uuid_new_current(struct drbd_device *device) __must_hold(local);
+extern void drbd_uuid_set_bm(struct drbd_device *device, u64 val) __must_hold(local);
+extern void drbd_uuid_move_history(struct drbd_device *device) __must_hold(local);
+extern void __drbd_uuid_set(struct drbd_device *device, int idx, u64 val) __must_hold(local);
+extern void drbd_md_set_flag(struct drbd_device *device, int flags) __must_hold(local);
+extern void drbd_md_clear_flag(struct drbd_device *device, int flags)__must_hold(local);
+extern int drbd_md_test_flag(struct drbd_backing_dev *, int);
+extern void drbd_md_mark_dirty(struct drbd_device *device);
+extern void drbd_queue_bitmap_io(struct drbd_device *device,
+ int (*io_fn)(struct drbd_device *),
+ void (*done)(struct drbd_device *, int),
+ char *why, enum bm_flag flags);
+extern int drbd_bitmap_io(struct drbd_device *device,
+ int (*io_fn)(struct drbd_device *),
+ char *why, enum bm_flag flags);
+extern int drbd_bitmap_io_from_worker(struct drbd_device *device,
+ int (*io_fn)(struct drbd_device *),
+ char *why, enum bm_flag flags);
+extern int drbd_bmio_set_n_write(struct drbd_device *device) __must_hold(local);
+extern int drbd_bmio_clear_n_write(struct drbd_device *device) __must_hold(local);
+
+/* Meta data layout
+ *
+ * We currently have two possible layouts.
+ * Offsets in (512 byte) sectors.
+ * external:
+ * |----------- md_size_sect ------------------|
+ * [ 4k superblock ][ activity log ][ Bitmap ]
+ * | al_offset == 8 |
+ * | bm_offset = al_offset + X |
+ * ==> bitmap sectors = md_size_sect - bm_offset
+ *
+ * Variants:
+ * old, indexed fixed size meta data:
+ *
+ * internal:
+ * |----------- md_size_sect ------------------|
+ * [data.....][ Bitmap ][ activity log ][ 4k superblock ][padding*]
+ * | al_offset < 0 |
+ * | bm_offset = al_offset - Y |
+ * ==> bitmap sectors = Y = al_offset - bm_offset
+ *
+ * [padding*] are zero or up to 7 unused 512 Byte sectors to the
+ * end of the device, so that the [4k superblock] will be 4k aligned.
+ *
+ * The activity log consists of 4k transaction blocks,
+ * which are written in a ring-buffer, or striped ring-buffer like fashion,
+ * which are writtensize used to be fixed 32kB,
+ * but is about to become configurable.
+ */
+
+/* Our old fixed size meta data layout
+ * allows up to about 3.8TB, so if you want more,
+ * you need to use the "flexible" meta data format. */
+#define MD_128MB_SECT (128LLU << 11) /* 128 MB, unit sectors */
+#define MD_4kB_SECT 8
+#define MD_32kB_SECT 64
+
+/* One activity log extent represents 4M of storage */
+#define AL_EXTENT_SHIFT 22
+#define AL_EXTENT_SIZE (1<<AL_EXTENT_SHIFT)
+
+/* We could make these currently hardcoded constants configurable
+ * variables at create-md time (or even re-configurable at runtime?).
+ * Which will require some more changes to the DRBD "super block"
+ * and attach code.
+ *
+ * updates per transaction:
+ * This many changes to the active set can be logged with one transaction.
+ * This number is arbitrary.
+ * context per transaction:
+ * This many context extent numbers are logged with each transaction.
+ * This number is resulting from the transaction block size (4k), the layout
+ * of the transaction header, and the number of updates per transaction.
+ * See drbd_actlog.c:struct al_transaction_on_disk
+ * */
+#define AL_UPDATES_PER_TRANSACTION 64 // arbitrary
+#define AL_CONTEXT_PER_TRANSACTION 919 // (4096 - 36 - 6*64)/4
+
+#if BITS_PER_LONG == 32
+#define LN2_BPL 5
+#define cpu_to_lel(A) cpu_to_le32(A)
+#define lel_to_cpu(A) le32_to_cpu(A)
+#elif BITS_PER_LONG == 64
+#define LN2_BPL 6
+#define cpu_to_lel(A) cpu_to_le64(A)
+#define lel_to_cpu(A) le64_to_cpu(A)
+#else
+#error "LN2 of BITS_PER_LONG unknown!"
+#endif
+
+/* resync bitmap */
+/* 16MB sized 'bitmap extent' to track syncer usage */
+struct bm_extent {
+ int rs_left; /* number of bits set (out of sync) in this extent. */
+ int rs_failed; /* number of failed resync requests in this extent. */
+ unsigned long flags;
+ struct lc_element lce;
+};
+
+#define BME_NO_WRITES 0 /* bm_extent.flags: no more requests on this one! */
+#define BME_LOCKED 1 /* bm_extent.flags: syncer active on this one. */
+#define BME_PRIORITY 2 /* finish resync IO on this extent ASAP! App IO waiting! */
+
+/* drbd_bitmap.c */
+/*
+ * We need to store one bit for a block.
+ * Example: 1GB disk @ 4096 byte blocks ==> we need 32 KB bitmap.
+ * Bit 0 ==> local node thinks this block is binary identical on both nodes
+ * Bit 1 ==> local node thinks this block needs to be synced.
+ */
+
+#define SLEEP_TIME (HZ/10)
+
+/* We do bitmap IO in units of 4k blocks.
+ * We also still have a hardcoded 4k per bit relation. */
+#define BM_BLOCK_SHIFT 12 /* 4k per bit */
+#define BM_BLOCK_SIZE (1<<BM_BLOCK_SHIFT)
+/* mostly arbitrarily set the represented size of one bitmap extent,
+ * aka resync extent, to 16 MiB (which is also 512 Byte worth of bitmap
+ * at 4k per bit resolution) */
+#define BM_EXT_SHIFT 24 /* 16 MiB per resync extent */
+#define BM_EXT_SIZE (1<<BM_EXT_SHIFT)
+
+#if (BM_EXT_SHIFT != 24) || (BM_BLOCK_SHIFT != 12)
+#error "HAVE YOU FIXED drbdmeta AS WELL??"
+#endif
+
+/* thus many _storage_ sectors are described by one bit */
+#define BM_SECT_TO_BIT(x) ((x)>>(BM_BLOCK_SHIFT-9))
+#define BM_BIT_TO_SECT(x) ((sector_t)(x)<<(BM_BLOCK_SHIFT-9))
+#define BM_SECT_PER_BIT BM_BIT_TO_SECT(1)
+
+/* bit to represented kilo byte conversion */
+#define Bit2KB(bits) ((bits)<<(BM_BLOCK_SHIFT-10))
+
+/* in which _bitmap_ extent (resp. sector) the bit for a certain
+ * _storage_ sector is located in */
+#define BM_SECT_TO_EXT(x) ((x)>>(BM_EXT_SHIFT-9))
+#define BM_BIT_TO_EXT(x) ((x) >> (BM_EXT_SHIFT - BM_BLOCK_SHIFT))
+
+/* first storage sector a bitmap extent corresponds to */
+#define BM_EXT_TO_SECT(x) ((sector_t)(x) << (BM_EXT_SHIFT-9))
+/* how much _storage_ sectors we have per bitmap extent */
+#define BM_SECT_PER_EXT BM_EXT_TO_SECT(1)
+/* how many bits are covered by one bitmap extent (resync extent) */
+#define BM_BITS_PER_EXT (1UL << (BM_EXT_SHIFT - BM_BLOCK_SHIFT))
+
+#define BM_BLOCKS_PER_BM_EXT_MASK (BM_BITS_PER_EXT - 1)
+
+
+/* in one sector of the bitmap, we have this many activity_log extents. */
+#define AL_EXT_PER_BM_SECT (1 << (BM_EXT_SHIFT - AL_EXTENT_SHIFT))
+
+/* the extent in "PER_EXTENT" below is an activity log extent
+ * we need that many (long words/bytes) to store the bitmap
+ * of one AL_EXTENT_SIZE chunk of storage.
+ * we can store the bitmap for that many AL_EXTENTS within
+ * one sector of the _on_disk_ bitmap:
+ * bit 0 bit 37 bit 38 bit (512*8)-1
+ * ...|........|........|.. // ..|........|
+ * sect. 0 `296 `304 ^(512*8*8)-1
+ *
+#define BM_WORDS_PER_EXT ( (AL_EXT_SIZE/BM_BLOCK_SIZE) / BITS_PER_LONG )
+#define BM_BYTES_PER_EXT ( (AL_EXT_SIZE/BM_BLOCK_SIZE) / 8 ) // 128
+#define BM_EXT_PER_SECT ( 512 / BM_BYTES_PER_EXTENT ) // 4
+ */
+
+#define DRBD_MAX_SECTORS_32 (0xffffffffLU)
+/* we have a certain meta data variant that has a fixed on-disk size of 128
+ * MiB, of which 4k are our "superblock", and 32k are the fixed size activity
+ * log, leaving this many sectors for the bitmap.
+ */
+
+#define DRBD_MAX_SECTORS_FIXED_BM \
+ ((MD_128MB_SECT - MD_32kB_SECT - MD_4kB_SECT) * (1LL<<(BM_EXT_SHIFT-9)))
+#if !defined(CONFIG_LBDAF) && BITS_PER_LONG == 32
+#define DRBD_MAX_SECTORS DRBD_MAX_SECTORS_32
+#define DRBD_MAX_SECTORS_FLEX DRBD_MAX_SECTORS_32
+#else
+#define DRBD_MAX_SECTORS DRBD_MAX_SECTORS_FIXED_BM
+/* 16 TB in units of sectors */
+#if BITS_PER_LONG == 32
+/* adjust by one page worth of bitmap,
+ * so we won't wrap around in drbd_bm_find_next_bit.
+ * you should use 64bit OS for that much storage, anyways. */
+#define DRBD_MAX_SECTORS_FLEX BM_BIT_TO_SECT(0xffff7fff)
+#else
+/* we allow up to 1 PiB now on 64bit architecture with "flexible" meta data */
+#define DRBD_MAX_SECTORS_FLEX (1UL << 51)
+/* corresponds to (1UL << 38) bits right now. */
+#endif
+#endif
+
+/* BIO_MAX_SIZE is 256 * PAGE_CACHE_SIZE,
+ * so for typical PAGE_CACHE_SIZE of 4k, that is (1<<20) Byte.
+ * Since we may live in a mixed-platform cluster,
+ * we limit us to a platform agnostic constant here for now.
+ * A followup commit may allow even bigger BIO sizes,
+ * once we thought that through. */
+#define DRBD_MAX_BIO_SIZE (1U << 20)
+#if DRBD_MAX_BIO_SIZE > BIO_MAX_SIZE
+#error Architecture not supported: DRBD_MAX_BIO_SIZE > BIO_MAX_SIZE
+#endif
+#define DRBD_MAX_BIO_SIZE_SAFE (1U << 12) /* Works always = 4k */
+
+#define DRBD_MAX_SIZE_H80_PACKET (1U << 15) /* Header 80 only allows packets up to 32KiB data */
+#define DRBD_MAX_BIO_SIZE_P95 (1U << 17) /* Protocol 95 to 99 allows bios up to 128KiB */
+
+/* For now, don't allow more than one activity log extent worth of data
+ * to be discarded in one go. We may need to rework drbd_al_begin_io()
+ * to allow for even larger discard ranges */
+#define DRBD_MAX_DISCARD_SIZE AL_EXTENT_SIZE
+#define DRBD_MAX_DISCARD_SECTORS (DRBD_MAX_DISCARD_SIZE >> 9)
+
+extern int drbd_bm_init(struct drbd_device *device);
+extern int drbd_bm_resize(struct drbd_device *device, sector_t sectors, int set_new_bits);
+extern void drbd_bm_cleanup(struct drbd_device *device);
+extern void drbd_bm_set_all(struct drbd_device *device);
+extern void drbd_bm_clear_all(struct drbd_device *device);
+/* set/clear/test only a few bits at a time */
+extern int drbd_bm_set_bits(
+ struct drbd_device *device, unsigned long s, unsigned long e);
+extern int drbd_bm_clear_bits(
+ struct drbd_device *device, unsigned long s, unsigned long e);
+extern int drbd_bm_count_bits(
+ struct drbd_device *device, const unsigned long s, const unsigned long e);
+/* bm_set_bits variant for use while holding drbd_bm_lock,
+ * may process the whole bitmap in one go */
+extern void _drbd_bm_set_bits(struct drbd_device *device,
+ const unsigned long s, const unsigned long e);
+extern int drbd_bm_test_bit(struct drbd_device *device, unsigned long bitnr);
+extern int drbd_bm_e_weight(struct drbd_device *device, unsigned long enr);
+extern int drbd_bm_read(struct drbd_device *device) __must_hold(local);
+extern void drbd_bm_mark_for_writeout(struct drbd_device *device, int page_nr);
+extern int drbd_bm_write(struct drbd_device *device) __must_hold(local);
+extern int drbd_bm_write_hinted(struct drbd_device *device) __must_hold(local);
+extern int drbd_bm_write_lazy(struct drbd_device *device, unsigned upper_idx) __must_hold(local);
+extern int drbd_bm_write_all(struct drbd_device *device) __must_hold(local);
+extern int drbd_bm_write_copy_pages(struct drbd_device *device) __must_hold(local);
+extern size_t drbd_bm_words(struct drbd_device *device);
+extern unsigned long drbd_bm_bits(struct drbd_device *device);
+extern sector_t drbd_bm_capacity(struct drbd_device *device);
+
+#define DRBD_END_OF_BITMAP (~(unsigned long)0)
+extern unsigned long drbd_bm_find_next(struct drbd_device *device, unsigned long bm_fo);
+/* bm_find_next variants for use while you hold drbd_bm_lock() */
+extern unsigned long _drbd_bm_find_next(struct drbd_device *device, unsigned long bm_fo);
+extern unsigned long _drbd_bm_find_next_zero(struct drbd_device *device, unsigned long bm_fo);
+extern unsigned long _drbd_bm_total_weight(struct drbd_device *device);
+extern unsigned long drbd_bm_total_weight(struct drbd_device *device);
+/* for receive_bitmap */
+extern void drbd_bm_merge_lel(struct drbd_device *device, size_t offset,
+ size_t number, unsigned long *buffer);
+/* for _drbd_send_bitmap */
+extern void drbd_bm_get_lel(struct drbd_device *device, size_t offset,
+ size_t number, unsigned long *buffer);
+
+extern void drbd_bm_lock(struct drbd_device *device, char *why, enum bm_flag flags);
+extern void drbd_bm_unlock(struct drbd_device *device);
+/* drbd_main.c */
+
+extern struct kmem_cache *drbd_request_cache;
+extern struct kmem_cache *drbd_ee_cache; /* peer requests */
+extern struct kmem_cache *drbd_bm_ext_cache; /* bitmap extents */
+extern struct kmem_cache *drbd_al_ext_cache; /* activity log extents */
+extern mempool_t *drbd_request_mempool;
+extern mempool_t *drbd_ee_mempool;
+
+/* drbd's page pool, used to buffer data received from the peer,
+ * or data requested by the peer.
+ *
+ * This does not have an emergency reserve.
+ *
+ * When allocating from this pool, it first takes pages from the pool.
+ * Only if the pool is depleted will try to allocate from the system.
+ *
+ * The assumption is that pages taken from this pool will be processed,
+ * and given back, "quickly", and then can be recycled, so we can avoid
+ * frequent calls to alloc_page(), and still will be able to make progress even
+ * under memory pressure.
+ */
+extern struct page *drbd_pp_pool;
+extern spinlock_t drbd_pp_lock;
+extern int drbd_pp_vacant;
+extern wait_queue_head_t drbd_pp_wait;
+
+/* We also need a standard (emergency-reserve backed) page pool
+ * for meta data IO (activity log, bitmap).
+ * We can keep it global, as long as it is used as "N pages at a time".
+ * 128 should be plenty, currently we probably can get away with as few as 1.
+ */
+#define DRBD_MIN_POOL_PAGES 128
+extern mempool_t *drbd_md_io_page_pool;
+
+/* We also need to make sure we get a bio
+ * when we need it for housekeeping purposes */
+extern struct bio_set *drbd_md_io_bio_set;
+/* to allocate from that set */
+extern struct bio *bio_alloc_drbd(gfp_t gfp_mask);
+
+extern rwlock_t global_state_lock;
+
+extern int conn_lowest_minor(struct drbd_connection *connection);
+extern enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsigned int minor);
+extern void drbd_destroy_device(struct kref *kref);
+extern void drbd_delete_device(struct drbd_device *device);
+
+extern struct drbd_resource *drbd_create_resource(const char *name);
+extern void drbd_free_resource(struct drbd_resource *resource);
+
+extern int set_resource_options(struct drbd_resource *resource, struct res_opts *res_opts);
+extern struct drbd_connection *conn_create(const char *name, struct res_opts *res_opts);
+extern void drbd_destroy_connection(struct kref *kref);
+extern struct drbd_connection *conn_get_by_addrs(void *my_addr, int my_addr_len,
+ void *peer_addr, int peer_addr_len);
+extern struct drbd_resource *drbd_find_resource(const char *name);
+extern void drbd_destroy_resource(struct kref *kref);
+extern void conn_free_crypto(struct drbd_connection *connection);
+
+extern int proc_details;
+
+/* drbd_req */
+extern void do_submit(struct work_struct *ws);
+extern void __drbd_make_request(struct drbd_device *, struct bio *, unsigned long);
+extern void drbd_make_request(struct request_queue *q, struct bio *bio);
+extern int drbd_read_remote(struct drbd_device *device, struct drbd_request *req);
+extern int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct bio_vec *bvec);
+extern int is_valid_ar_handle(struct drbd_request *, sector_t);
+
+
+/* drbd_nl.c */
+extern void drbd_suspend_io(struct drbd_device *device);
+extern void drbd_resume_io(struct drbd_device *device);
+extern char *ppsize(char *buf, unsigned long long size);
+extern sector_t drbd_new_dev_size(struct drbd_device *, struct drbd_backing_dev *, sector_t, int);
+enum determine_dev_size {
+ DS_ERROR_SHRINK = -3,
+ DS_ERROR_SPACE_MD = -2,
+ DS_ERROR = -1,
+ DS_UNCHANGED = 0,
+ DS_SHRUNK = 1,
+ DS_GREW = 2,
+ DS_GREW_FROM_ZERO = 3,
+};
+extern enum determine_dev_size
+drbd_determine_dev_size(struct drbd_device *, enum dds_flags, struct resize_parms *) __must_hold(local);
+extern void resync_after_online_grow(struct drbd_device *);
+extern void drbd_reconsider_max_bio_size(struct drbd_device *device, struct drbd_backing_dev *bdev);
+extern enum drbd_state_rv drbd_set_role(struct drbd_device *device,
+ enum drbd_role new_role,
+ int force);
+extern bool conn_try_outdate_peer(struct drbd_connection *connection);
+extern void conn_try_outdate_peer_async(struct drbd_connection *connection);
+extern int drbd_khelper(struct drbd_device *device, char *cmd);
+
+/* drbd_worker.c */
+/* bi_end_io handlers */
+extern void drbd_md_endio(struct bio *bio, int error);
+extern void drbd_peer_request_endio(struct bio *bio, int error);
+extern void drbd_request_endio(struct bio *bio, int error);
+extern int drbd_worker(struct drbd_thread *thi);
+enum drbd_ret_code drbd_resync_after_valid(struct drbd_device *device, int o_minor);
+void drbd_resync_after_changed(struct drbd_device *device);
+extern void drbd_start_resync(struct drbd_device *device, enum drbd_conns side);
+extern void resume_next_sg(struct drbd_device *device);
+extern void suspend_other_sg(struct drbd_device *device);
+extern int drbd_resync_finished(struct drbd_device *device);
+/* maybe rather drbd_main.c ? */
+extern void *drbd_md_get_buffer(struct drbd_device *device, const char *intent);
+extern void drbd_md_put_buffer(struct drbd_device *device);
+extern int drbd_md_sync_page_io(struct drbd_device *device,
+ struct drbd_backing_dev *bdev, sector_t sector, int rw);
+extern void drbd_ov_out_of_sync_found(struct drbd_device *, sector_t, int);
+extern void wait_until_done_or_force_detached(struct drbd_device *device,
+ struct drbd_backing_dev *bdev, unsigned int *done);
+extern void drbd_rs_controller_reset(struct drbd_device *device);
+
+static inline void ov_out_of_sync_print(struct drbd_device *device)
+{
+ if (device->ov_last_oos_size) {
+ drbd_err(device, "Out of sync: start=%llu, size=%lu (sectors)\n",
+ (unsigned long long)device->ov_last_oos_start,
+ (unsigned long)device->ov_last_oos_size);
+ }
+ device->ov_last_oos_size = 0;
+}
+
+
+extern void drbd_csum_bio(struct crypto_hash *, struct bio *, void *);
+extern void drbd_csum_ee(struct crypto_hash *, struct drbd_peer_request *, void *);
+/* worker callbacks */
+extern int w_e_end_data_req(struct drbd_work *, int);
+extern int w_e_end_rsdata_req(struct drbd_work *, int);
+extern int w_e_end_csum_rs_req(struct drbd_work *, int);
+extern int w_e_end_ov_reply(struct drbd_work *, int);
+extern int w_e_end_ov_req(struct drbd_work *, int);
+extern int w_ov_finished(struct drbd_work *, int);
+extern int w_resync_timer(struct drbd_work *, int);
+extern int w_send_write_hint(struct drbd_work *, int);
+extern int w_send_dblock(struct drbd_work *, int);
+extern int w_send_read_req(struct drbd_work *, int);
+extern int w_e_reissue(struct drbd_work *, int);
+extern int w_restart_disk_io(struct drbd_work *, int);
+extern int w_send_out_of_sync(struct drbd_work *, int);
+extern int w_start_resync(struct drbd_work *, int);
+
+extern void resync_timer_fn(unsigned long data);
+extern void start_resync_timer_fn(unsigned long data);
+
+extern void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req);
+
+/* drbd_receiver.c */
+extern int drbd_receiver(struct drbd_thread *thi);
+extern int drbd_asender(struct drbd_thread *thi);
+extern bool drbd_rs_c_min_rate_throttle(struct drbd_device *device);
+extern bool drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector,
+ bool throttle_if_app_is_waiting);
+extern int drbd_submit_peer_request(struct drbd_device *,
+ struct drbd_peer_request *, const unsigned,
+ const int);
+extern int drbd_free_peer_reqs(struct drbd_device *, struct list_head *);
+extern struct drbd_peer_request *drbd_alloc_peer_req(struct drbd_peer_device *, u64,
+ sector_t, unsigned int,
+ bool,
+ gfp_t) __must_hold(local);
+extern void __drbd_free_peer_req(struct drbd_device *, struct drbd_peer_request *,
+ int);
+#define drbd_free_peer_req(m,e) __drbd_free_peer_req(m, e, 0)
+#define drbd_free_net_peer_req(m,e) __drbd_free_peer_req(m, e, 1)
+extern struct page *drbd_alloc_pages(struct drbd_peer_device *, unsigned int, bool);
+extern void drbd_set_recv_tcq(struct drbd_device *device, int tcq_enabled);
+extern void _drbd_clear_done_ee(struct drbd_device *device, struct list_head *to_be_freed);
+extern int drbd_connected(struct drbd_peer_device *);
+
+static inline void drbd_tcp_cork(struct socket *sock)
+{
+ int val = 1;
+ (void) kernel_setsockopt(sock, SOL_TCP, TCP_CORK,
+ (char*)&val, sizeof(val));
+}
+
+static inline void drbd_tcp_uncork(struct socket *sock)
+{
+ int val = 0;
+ (void) kernel_setsockopt(sock, SOL_TCP, TCP_CORK,
+ (char*)&val, sizeof(val));
+}
+
+static inline void drbd_tcp_nodelay(struct socket *sock)
+{
+ int val = 1;
+ (void) kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY,
+ (char*)&val, sizeof(val));
+}
+
+static inline void drbd_tcp_quickack(struct socket *sock)
+{
+ int val = 2;
+ (void) kernel_setsockopt(sock, SOL_TCP, TCP_QUICKACK,
+ (char*)&val, sizeof(val));
+}
+
+/* sets the number of 512 byte sectors of our virtual device */
+static inline void drbd_set_my_capacity(struct drbd_device *device,
+ sector_t size)
+{
+ /* set_capacity(device->this_bdev->bd_disk, size); */
+ set_capacity(device->vdisk, size);
+ device->this_bdev->bd_inode->i_size = (loff_t)size << 9;
+}
+
+/*
+ * used to submit our private bio
+ */
+static inline void drbd_generic_make_request(struct drbd_device *device,
+ int fault_type, struct bio *bio)
+{
+ __release(local);
+ if (!bio->bi_bdev) {
+ drbd_err(device, "drbd_generic_make_request: bio->bi_bdev == NULL\n");
+ bio_endio(bio, -ENODEV);
+ return;
+ }
+
+ if (drbd_insert_fault(device, fault_type))
+ bio_endio(bio, -EIO);
+ else
+ generic_make_request(bio);
+}
+
+void drbd_bump_write_ordering(struct drbd_resource *resource, struct drbd_backing_dev *bdev,
+ enum write_ordering_e wo);
+
+/* drbd_proc.c */
+extern struct proc_dir_entry *drbd_proc;
+extern const struct file_operations drbd_proc_fops;
+extern const char *drbd_conn_str(enum drbd_conns s);
+extern const char *drbd_role_str(enum drbd_role s);
+
+/* drbd_actlog.c */
+extern bool drbd_al_begin_io_prepare(struct drbd_device *device, struct drbd_interval *i);
+extern int drbd_al_begin_io_nonblock(struct drbd_device *device, struct drbd_interval *i);
+extern void drbd_al_begin_io_commit(struct drbd_device *device);
+extern bool drbd_al_begin_io_fastpath(struct drbd_device *device, struct drbd_interval *i);
+extern void drbd_al_begin_io(struct drbd_device *device, struct drbd_interval *i);
+extern void drbd_al_complete_io(struct drbd_device *device, struct drbd_interval *i);
+extern void drbd_rs_complete_io(struct drbd_device *device, sector_t sector);
+extern int drbd_rs_begin_io(struct drbd_device *device, sector_t sector);
+extern int drbd_try_rs_begin_io(struct drbd_device *device, sector_t sector);
+extern void drbd_rs_cancel_all(struct drbd_device *device);
+extern int drbd_rs_del_all(struct drbd_device *device);
+extern void drbd_rs_failed_io(struct drbd_device *device,
+ sector_t sector, int size);
+extern void drbd_advance_rs_marks(struct drbd_device *device, unsigned long still_to_go);
+
+enum update_sync_bits_mode { RECORD_RS_FAILED, SET_OUT_OF_SYNC, SET_IN_SYNC };
+extern int __drbd_change_sync(struct drbd_device *device, sector_t sector, int size,
+ enum update_sync_bits_mode mode);
+#define drbd_set_in_sync(device, sector, size) \
+ __drbd_change_sync(device, sector, size, SET_IN_SYNC)
+#define drbd_set_out_of_sync(device, sector, size) \
+ __drbd_change_sync(device, sector, size, SET_OUT_OF_SYNC)
+#define drbd_rs_failed_io(device, sector, size) \
+ __drbd_change_sync(device, sector, size, RECORD_RS_FAILED)
+extern void drbd_al_shrink(struct drbd_device *device);
+extern int drbd_initialize_al(struct drbd_device *, void *);
+
+/* drbd_nl.c */
+/* state info broadcast */
+struct sib_info {
+ enum drbd_state_info_bcast_reason sib_reason;
+ union {
+ struct {
+ char *helper_name;
+ unsigned helper_exit_code;
+ };
+ struct {
+ union drbd_state os;
+ union drbd_state ns;
+ };
+ };
+};
+void drbd_bcast_event(struct drbd_device *device, const struct sib_info *sib);
+
+/*
+ * inline helper functions
+ *************************/
+
+/* see also page_chain_add and friends in drbd_receiver.c */
+static inline struct page *page_chain_next(struct page *page)
+{
+ return (struct page *)page_private(page);
+}
+#define page_chain_for_each(page) \
+ for (; page && ({ prefetch(page_chain_next(page)); 1; }); \
+ page = page_chain_next(page))
+#define page_chain_for_each_safe(page, n) \
+ for (; page && ({ n = page_chain_next(page); 1; }); page = n)
+
+
+static inline int drbd_peer_req_has_active_page(struct drbd_peer_request *peer_req)
+{
+ struct page *page = peer_req->pages;
+ page_chain_for_each(page) {
+ if (page_count(page) > 1)
+ return 1;
+ }
+ return 0;
+}
+
+static inline enum drbd_state_rv
+_drbd_set_state(struct drbd_device *device, union drbd_state ns,
+ enum chg_state_flags flags, struct completion *done)
+{
+ enum drbd_state_rv rv;
+
+ read_lock(&global_state_lock);
+ rv = __drbd_set_state(device, ns, flags, done);
+ read_unlock(&global_state_lock);
+
+ return rv;
+}
+
+static inline union drbd_state drbd_read_state(struct drbd_device *device)
+{
+ struct drbd_resource *resource = device->resource;
+ union drbd_state rv;
+
+ rv.i = device->state.i;
+ rv.susp = resource->susp;
+ rv.susp_nod = resource->susp_nod;
+ rv.susp_fen = resource->susp_fen;
+
+ return rv;
+}
+
+enum drbd_force_detach_flags {
+ DRBD_READ_ERROR,
+ DRBD_WRITE_ERROR,
+ DRBD_META_IO_ERROR,
+ DRBD_FORCE_DETACH,
+};
+
+#define __drbd_chk_io_error(m,f) __drbd_chk_io_error_(m,f, __func__)
+static inline void __drbd_chk_io_error_(struct drbd_device *device,
+ enum drbd_force_detach_flags df,
+ const char *where)
+{
+ enum drbd_io_error_p ep;
+
+ rcu_read_lock();
+ ep = rcu_dereference(device->ldev->disk_conf)->on_io_error;
+ rcu_read_unlock();
+ switch (ep) {
+ case EP_PASS_ON: /* FIXME would this be better named "Ignore"? */
+ if (df == DRBD_READ_ERROR || df == DRBD_WRITE_ERROR) {
+ if (__ratelimit(&drbd_ratelimit_state))
+ drbd_err(device, "Local IO failed in %s.\n", where);
+ if (device->state.disk > D_INCONSISTENT)
+ _drbd_set_state(_NS(device, disk, D_INCONSISTENT), CS_HARD, NULL);
+ break;
+ }
+ /* NOTE fall through for DRBD_META_IO_ERROR or DRBD_FORCE_DETACH */
+ case EP_DETACH:
+ case EP_CALL_HELPER:
+ /* Remember whether we saw a READ or WRITE error.
+ *
+ * Recovery of the affected area for WRITE failure is covered
+ * by the activity log.
+ * READ errors may fall outside that area though. Certain READ
+ * errors can be "healed" by writing good data to the affected
+ * blocks, which triggers block re-allocation in lower layers.
+ *
+ * If we can not write the bitmap after a READ error,
+ * we may need to trigger a full sync (see w_go_diskless()).
+ *
+ * Force-detach is not really an IO error, but rather a
+ * desperate measure to try to deal with a completely
+ * unresponsive lower level IO stack.
+ * Still it should be treated as a WRITE error.
+ *
+ * Meta IO error is always WRITE error:
+ * we read meta data only once during attach,
+ * which will fail in case of errors.
+ */
+ set_bit(WAS_IO_ERROR, &device->flags);
+ if (df == DRBD_READ_ERROR)
+ set_bit(WAS_READ_ERROR, &device->flags);
+ if (df == DRBD_FORCE_DETACH)
+ set_bit(FORCE_DETACH, &device->flags);
+ if (device->state.disk > D_FAILED) {
+ _drbd_set_state(_NS(device, disk, D_FAILED), CS_HARD, NULL);
+ drbd_err(device,
+ "Local IO failed in %s. Detaching...\n", where);
+ }
+ break;
+ }
+}
+
+/**
+ * drbd_chk_io_error: Handle the on_io_error setting, should be called from all io completion handlers
+ * @device: DRBD device.
+ * @error: Error code passed to the IO completion callback
+ * @forcedetach: Force detach. I.e. the error happened while accessing the meta data
+ *
+ * See also drbd_main.c:after_state_ch() if (os.disk > D_FAILED && ns.disk == D_FAILED)
+ */
+#define drbd_chk_io_error(m,e,f) drbd_chk_io_error_(m,e,f, __func__)
+static inline void drbd_chk_io_error_(struct drbd_device *device,
+ int error, enum drbd_force_detach_flags forcedetach, const char *where)
+{
+ if (error) {
+ unsigned long flags;
+ spin_lock_irqsave(&device->resource->req_lock, flags);
+ __drbd_chk_io_error_(device, forcedetach, where);
+ spin_unlock_irqrestore(&device->resource->req_lock, flags);
+ }
+}
+
+
+/**
+ * drbd_md_first_sector() - Returns the first sector number of the meta data area
+ * @bdev: Meta data block device.
+ *
+ * BTW, for internal meta data, this happens to be the maximum capacity
+ * we could agree upon with our peer node.
+ */
+static inline sector_t drbd_md_first_sector(struct drbd_backing_dev *bdev)
+{
+ switch (bdev->md.meta_dev_idx) {
+ case DRBD_MD_INDEX_INTERNAL:
+ case DRBD_MD_INDEX_FLEX_INT:
+ return bdev->md.md_offset + bdev->md.bm_offset;
+ case DRBD_MD_INDEX_FLEX_EXT:
+ default:
+ return bdev->md.md_offset;
+ }
+}
+
+/**
+ * drbd_md_last_sector() - Return the last sector number of the meta data area
+ * @bdev: Meta data block device.
+ */
+static inline sector_t drbd_md_last_sector(struct drbd_backing_dev *bdev)
+{
+ switch (bdev->md.meta_dev_idx) {
+ case DRBD_MD_INDEX_INTERNAL:
+ case DRBD_MD_INDEX_FLEX_INT:
+ return bdev->md.md_offset + MD_4kB_SECT -1;
+ case DRBD_MD_INDEX_FLEX_EXT:
+ default:
+ return bdev->md.md_offset + bdev->md.md_size_sect -1;
+ }
+}
+
+/* Returns the number of 512 byte sectors of the device */
+static inline sector_t drbd_get_capacity(struct block_device *bdev)
+{
+ /* return bdev ? get_capacity(bdev->bd_disk) : 0; */
+ return bdev ? i_size_read(bdev->bd_inode) >> 9 : 0;
+}
+
+/**
+ * drbd_get_max_capacity() - Returns the capacity we announce to out peer
+ * @bdev: Meta data block device.
+ *
+ * returns the capacity we announce to out peer. we clip ourselves at the
+ * various MAX_SECTORS, because if we don't, current implementation will
+ * oops sooner or later
+ */
+static inline sector_t drbd_get_max_capacity(struct drbd_backing_dev *bdev)
+{
+ sector_t s;
+
+ switch (bdev->md.meta_dev_idx) {
+ case DRBD_MD_INDEX_INTERNAL:
+ case DRBD_MD_INDEX_FLEX_INT:
+ s = drbd_get_capacity(bdev->backing_bdev)
+ ? min_t(sector_t, DRBD_MAX_SECTORS_FLEX,
+ drbd_md_first_sector(bdev))
+ : 0;
+ break;
+ case DRBD_MD_INDEX_FLEX_EXT:
+ s = min_t(sector_t, DRBD_MAX_SECTORS_FLEX,
+ drbd_get_capacity(bdev->backing_bdev));
+ /* clip at maximum size the meta device can support */
+ s = min_t(sector_t, s,
+ BM_EXT_TO_SECT(bdev->md.md_size_sect
+ - bdev->md.bm_offset));
+ break;
+ default:
+ s = min_t(sector_t, DRBD_MAX_SECTORS,
+ drbd_get_capacity(bdev->backing_bdev));
+ }
+ return s;
+}
+
+/**
+ * drbd_md_ss() - Return the sector number of our meta data super block
+ * @bdev: Meta data block device.
+ */
+static inline sector_t drbd_md_ss(struct drbd_backing_dev *bdev)
+{
+ const int meta_dev_idx = bdev->md.meta_dev_idx;
+
+ if (meta_dev_idx == DRBD_MD_INDEX_FLEX_EXT)
+ return 0;
+
+ /* Since drbd08, internal meta data is always "flexible".
+ * position: last 4k aligned block of 4k size */
+ if (meta_dev_idx == DRBD_MD_INDEX_INTERNAL ||
+ meta_dev_idx == DRBD_MD_INDEX_FLEX_INT)
+ return (drbd_get_capacity(bdev->backing_bdev) & ~7ULL) - 8;
+
+ /* external, some index; this is the old fixed size layout */
+ return MD_128MB_SECT * bdev->md.meta_dev_idx;
+}
+
+static inline void
+drbd_queue_work(struct drbd_work_queue *q, struct drbd_work *w)
+{
+ unsigned long flags;
+ spin_lock_irqsave(&q->q_lock, flags);
+ list_add_tail(&w->list, &q->q);
+ spin_unlock_irqrestore(&q->q_lock, flags);
+ wake_up(&q->q_wait);
+}
+
+static inline void
+drbd_queue_work_if_unqueued(struct drbd_work_queue *q, struct drbd_work *w)
+{
+ unsigned long flags;
+ spin_lock_irqsave(&q->q_lock, flags);
+ if (list_empty_careful(&w->list))
+ list_add_tail(&w->list, &q->q);
+ spin_unlock_irqrestore(&q->q_lock, flags);
+ wake_up(&q->q_wait);
+}
+
+static inline void
+drbd_device_post_work(struct drbd_device *device, int work_bit)
+{
+ if (!test_and_set_bit(work_bit, &device->flags)) {
+ struct drbd_connection *connection =
+ first_peer_device(device)->connection;
+ struct drbd_work_queue *q = &connection->sender_work;
+ if (!test_and_set_bit(DEVICE_WORK_PENDING, &connection->flags))
+ wake_up(&q->q_wait);
+ }
+}
+
+extern void drbd_flush_workqueue(struct drbd_work_queue *work_queue);
+
+static inline void wake_asender(struct drbd_connection *connection)
+{
+ if (test_bit(SIGNAL_ASENDER, &connection->flags))
+ force_sig(DRBD_SIG, connection->asender.task);
+}
+
+static inline void request_ping(struct drbd_connection *connection)
+{
+ set_bit(SEND_PING, &connection->flags);
+ wake_asender(connection);
+}
+
+extern void *conn_prepare_command(struct drbd_connection *, struct drbd_socket *);
+extern void *drbd_prepare_command(struct drbd_peer_device *, struct drbd_socket *);
+extern int conn_send_command(struct drbd_connection *, struct drbd_socket *,
+ enum drbd_packet, unsigned int, void *,
+ unsigned int);
+extern int drbd_send_command(struct drbd_peer_device *, struct drbd_socket *,
+ enum drbd_packet, unsigned int, void *,
+ unsigned int);
+
+extern int drbd_send_ping(struct drbd_connection *connection);
+extern int drbd_send_ping_ack(struct drbd_connection *connection);
+extern int drbd_send_state_req(struct drbd_peer_device *, union drbd_state, union drbd_state);
+extern int conn_send_state_req(struct drbd_connection *, union drbd_state, union drbd_state);
+
+static inline void drbd_thread_stop(struct drbd_thread *thi)
+{
+ _drbd_thread_stop(thi, false, true);
+}
+
+static inline void drbd_thread_stop_nowait(struct drbd_thread *thi)
+{
+ _drbd_thread_stop(thi, false, false);
+}
+
+static inline void drbd_thread_restart_nowait(struct drbd_thread *thi)
+{
+ _drbd_thread_stop(thi, true, false);
+}
+
+/* counts how many answer packets packets we expect from our peer,
+ * for either explicit application requests,
+ * or implicit barrier packets as necessary.
+ * increased:
+ * w_send_barrier
+ * _req_mod(req, QUEUE_FOR_NET_WRITE or QUEUE_FOR_NET_READ);
+ * it is much easier and equally valid to count what we queue for the
+ * worker, even before it actually was queued or send.
+ * (drbd_make_request_common; recovery path on read io-error)
+ * decreased:
+ * got_BarrierAck (respective tl_clear, tl_clear_barrier)
+ * _req_mod(req, DATA_RECEIVED)
+ * [from receive_DataReply]
+ * _req_mod(req, WRITE_ACKED_BY_PEER or RECV_ACKED_BY_PEER or NEG_ACKED)
+ * [from got_BlockAck (P_WRITE_ACK, P_RECV_ACK)]
+ * for some reason it is NOT decreased in got_NegAck,
+ * but in the resulting cleanup code from report_params.
+ * we should try to remember the reason for that...
+ * _req_mod(req, SEND_FAILED or SEND_CANCELED)
+ * _req_mod(req, CONNECTION_LOST_WHILE_PENDING)
+ * [from tl_clear_barrier]
+ */
+static inline void inc_ap_pending(struct drbd_device *device)
+{
+ atomic_inc(&device->ap_pending_cnt);
+}
+
+#define ERR_IF_CNT_IS_NEGATIVE(which, func, line) \
+ if (atomic_read(&device->which) < 0) \
+ drbd_err(device, "in %s:%d: " #which " = %d < 0 !\n", \
+ func, line, \
+ atomic_read(&device->which))
+
+#define dec_ap_pending(device) _dec_ap_pending(device, __func__, __LINE__)
+static inline void _dec_ap_pending(struct drbd_device *device, const char *func, int line)
+{
+ if (atomic_dec_and_test(&device->ap_pending_cnt))
+ wake_up(&device->misc_wait);
+ ERR_IF_CNT_IS_NEGATIVE(ap_pending_cnt, func, line);
+}
+
+/* counts how many resync-related answers we still expect from the peer
+ * increase decrease
+ * C_SYNC_TARGET sends P_RS_DATA_REQUEST (and expects P_RS_DATA_REPLY)
+ * C_SYNC_SOURCE sends P_RS_DATA_REPLY (and expects P_WRITE_ACK with ID_SYNCER)
+ * (or P_NEG_ACK with ID_SYNCER)
+ */
+static inline void inc_rs_pending(struct drbd_device *device)
+{
+ atomic_inc(&device->rs_pending_cnt);
+}
+
+#define dec_rs_pending(device) _dec_rs_pending(device, __func__, __LINE__)
+static inline void _dec_rs_pending(struct drbd_device *device, const char *func, int line)
+{
+ atomic_dec(&device->rs_pending_cnt);
+ ERR_IF_CNT_IS_NEGATIVE(rs_pending_cnt, func, line);
+}
+
+/* counts how many answers we still need to send to the peer.
+ * increased on
+ * receive_Data unless protocol A;
+ * we need to send a P_RECV_ACK (proto B)
+ * or P_WRITE_ACK (proto C)
+ * receive_RSDataReply (recv_resync_read) we need to send a P_WRITE_ACK
+ * receive_DataRequest (receive_RSDataRequest) we need to send back P_DATA
+ * receive_Barrier_* we need to send a P_BARRIER_ACK
+ */
+static inline void inc_unacked(struct drbd_device *device)
+{
+ atomic_inc(&device->unacked_cnt);
+}
+
+#define dec_unacked(device) _dec_unacked(device, __func__, __LINE__)
+static inline void _dec_unacked(struct drbd_device *device, const char *func, int line)
+{
+ atomic_dec(&device->unacked_cnt);
+ ERR_IF_CNT_IS_NEGATIVE(unacked_cnt, func, line);
+}
+
+#define sub_unacked(device, n) _sub_unacked(device, n, __func__, __LINE__)
+static inline void _sub_unacked(struct drbd_device *device, int n, const char *func, int line)
+{
+ atomic_sub(n, &device->unacked_cnt);
+ ERR_IF_CNT_IS_NEGATIVE(unacked_cnt, func, line);
+}
+
+static inline bool is_sync_state(enum drbd_conns connection_state)
+{
+ return
+ (connection_state == C_SYNC_SOURCE
+ || connection_state == C_SYNC_TARGET
+ || connection_state == C_PAUSED_SYNC_S
+ || connection_state == C_PAUSED_SYNC_T);
+}
+
+/**
+ * get_ldev() - Increase the ref count on device->ldev. Returns 0 if there is no ldev
+ * @_device: DRBD device.
+ * @_min_state: Minimum device state required for success.
+ *
+ * You have to call put_ldev() when finished working with device->ldev.
+ */
+#define get_ldev_if_state(_device, _min_state) \
+ (_get_ldev_if_state((_device), (_min_state)) ? \
+ ({ __acquire(x); true; }) : false)
+#define get_ldev(_device) get_ldev_if_state(_device, D_INCONSISTENT)
+
+static inline void put_ldev(struct drbd_device *device)
+{
+ enum drbd_disk_state disk_state = device->state.disk;
+ /* We must check the state *before* the atomic_dec becomes visible,
+ * or we have a theoretical race where someone hitting zero,
+ * while state still D_FAILED, will then see D_DISKLESS in the
+ * condition below and calling into destroy, where he must not, yet. */
+ int i = atomic_dec_return(&device->local_cnt);
+
+ /* This may be called from some endio handler,
+ * so we must not sleep here. */
+
+ __release(local);
+ D_ASSERT(device, i >= 0);
+ if (i == 0) {
+ if (disk_state == D_DISKLESS)
+ /* even internal references gone, safe to destroy */
+ drbd_device_post_work(device, DESTROY_DISK);
+ if (disk_state == D_FAILED)
+ /* all application IO references gone. */
+ if (!test_and_set_bit(GOING_DISKLESS, &device->flags))
+ drbd_device_post_work(device, GO_DISKLESS);
+ wake_up(&device->misc_wait);
+ }
+}
+
+#ifndef __CHECKER__
+static inline int _get_ldev_if_state(struct drbd_device *device, enum drbd_disk_state mins)
+{
+ int io_allowed;
+
+ /* never get a reference while D_DISKLESS */
+ if (device->state.disk == D_DISKLESS)
+ return 0;
+
+ atomic_inc(&device->local_cnt);
+ io_allowed = (device->state.disk >= mins);
+ if (!io_allowed)
+ put_ldev(device);
+ return io_allowed;
+}
+#else
+extern int _get_ldev_if_state(struct drbd_device *device, enum drbd_disk_state mins);
+#endif
+
+/* this throttles on-the-fly application requests
+ * according to max_buffers settings;
+ * maybe re-implement using semaphores? */
+static inline int drbd_get_max_buffers(struct drbd_device *device)
+{
+ struct net_conf *nc;
+ int mxb;
+
+ rcu_read_lock();
+ nc = rcu_dereference(first_peer_device(device)->connection->net_conf);
+ mxb = nc ? nc->max_buffers : 1000000; /* arbitrary limit on open requests */
+ rcu_read_unlock();
+
+ return mxb;
+}
+
+static inline int drbd_state_is_stable(struct drbd_device *device)
+{
+ union drbd_dev_state s = device->state;
+
+ /* DO NOT add a default clause, we want the compiler to warn us
+ * for any newly introduced state we may have forgotten to add here */
+
+ switch ((enum drbd_conns)s.conn) {
+ /* new io only accepted when there is no connection, ... */
+ case C_STANDALONE:
+ case C_WF_CONNECTION:
+ /* ... or there is a well established connection. */
+ case C_CONNECTED:
+ case C_SYNC_SOURCE:
+ case C_SYNC_TARGET:
+ case C_VERIFY_S:
+ case C_VERIFY_T:
+ case C_PAUSED_SYNC_S:
+ case C_PAUSED_SYNC_T:
+ case C_AHEAD:
+ case C_BEHIND:
+ /* transitional states, IO allowed */
+ case C_DISCONNECTING:
+ case C_UNCONNECTED:
+ case C_TIMEOUT:
+ case C_BROKEN_PIPE:
+ case C_NETWORK_FAILURE:
+ case C_PROTOCOL_ERROR:
+ case C_TEAR_DOWN:
+ case C_WF_REPORT_PARAMS:
+ case C_STARTING_SYNC_S:
+ case C_STARTING_SYNC_T:
+ break;
+
+ /* Allow IO in BM exchange states with new protocols */
+ case C_WF_BITMAP_S:
+ if (first_peer_device(device)->connection->agreed_pro_version < 96)
+ return 0;
+ break;
+
+ /* no new io accepted in these states */
+ case C_WF_BITMAP_T:
+ case C_WF_SYNC_UUID:
+ case C_MASK:
+ /* not "stable" */
+ return 0;
+ }
+
+ switch ((enum drbd_disk_state)s.disk) {
+ case D_DISKLESS:
+ case D_INCONSISTENT:
+ case D_OUTDATED:
+ case D_CONSISTENT:
+ case D_UP_TO_DATE:
+ case D_FAILED:
+ /* disk state is stable as well. */
+ break;
+
+ /* no new io accepted during transitional states */
+ case D_ATTACHING:
+ case D_NEGOTIATING:
+ case D_UNKNOWN:
+ case D_MASK:
+ /* not "stable" */
+ return 0;
+ }
+
+ return 1;
+}
+
+static inline int drbd_suspended(struct drbd_device *device)
+{
+ struct drbd_resource *resource = device->resource;
+
+ return resource->susp || resource->susp_fen || resource->susp_nod;
+}
+
+static inline bool may_inc_ap_bio(struct drbd_device *device)
+{
+ int mxb = drbd_get_max_buffers(device);
+
+ if (drbd_suspended(device))
+ return false;
+ if (test_bit(SUSPEND_IO, &device->flags))
+ return false;
+
+ /* to avoid potential deadlock or bitmap corruption,
+ * in various places, we only allow new application io
+ * to start during "stable" states. */
+
+ /* no new io accepted when attaching or detaching the disk */
+ if (!drbd_state_is_stable(device))
+ return false;
+
+ /* since some older kernels don't have atomic_add_unless,
+ * and we are within the spinlock anyways, we have this workaround. */
+ if (atomic_read(&device->ap_bio_cnt) > mxb)
+ return false;
+ if (test_bit(BITMAP_IO, &device->flags))
+ return false;
+ return true;
+}
+
+static inline bool inc_ap_bio_cond(struct drbd_device *device)
+{
+ bool rv = false;
+
+ spin_lock_irq(&device->resource->req_lock);
+ rv = may_inc_ap_bio(device);
+ if (rv)
+ atomic_inc(&device->ap_bio_cnt);
+ spin_unlock_irq(&device->resource->req_lock);
+
+ return rv;
+}
+
+static inline void inc_ap_bio(struct drbd_device *device)
+{
+ /* we wait here
+ * as long as the device is suspended
+ * until the bitmap is no longer on the fly during connection
+ * handshake as long as we would exceed the max_buffer limit.
+ *
+ * to avoid races with the reconnect code,
+ * we need to atomic_inc within the spinlock. */
+
+ wait_event(device->misc_wait, inc_ap_bio_cond(device));
+}
+
+static inline void dec_ap_bio(struct drbd_device *device)
+{
+ int mxb = drbd_get_max_buffers(device);
+ int ap_bio = atomic_dec_return(&device->ap_bio_cnt);
+
+ D_ASSERT(device, ap_bio >= 0);
+
+ if (ap_bio == 0 && test_bit(BITMAP_IO, &device->flags)) {
+ if (!test_and_set_bit(BITMAP_IO_QUEUED, &device->flags))
+ drbd_queue_work(&first_peer_device(device)->
+ connection->sender_work,
+ &device->bm_io_work.w);
+ }
+
+ /* this currently does wake_up for every dec_ap_bio!
+ * maybe rather introduce some type of hysteresis?
+ * e.g. (ap_bio == mxb/2 || ap_bio == 0) ? */
+ if (ap_bio < mxb)
+ wake_up(&device->misc_wait);
+}
+
+static inline bool verify_can_do_stop_sector(struct drbd_device *device)
+{
+ return first_peer_device(device)->connection->agreed_pro_version >= 97 &&
+ first_peer_device(device)->connection->agreed_pro_version != 100;
+}
+
+static inline int drbd_set_ed_uuid(struct drbd_device *device, u64 val)
+{
+ int changed = device->ed_uuid != val;
+ device->ed_uuid = val;
+ return changed;
+}
+
+static inline int drbd_queue_order_type(struct drbd_device *device)
+{
+ /* sorry, we currently have no working implementation
+ * of distributed TCQ stuff */
+#ifndef QUEUE_ORDERED_NONE
+#define QUEUE_ORDERED_NONE 0
+#endif
+ return QUEUE_ORDERED_NONE;
+}
+
+static inline struct drbd_connection *first_connection(struct drbd_resource *resource)
+{
+ return list_first_entry_or_null(&resource->connections,
+ struct drbd_connection, connections);
+}
+
+#endif
diff --git a/drivers/block/drbd/drbd_interval.c b/drivers/block/drbd/drbd_interval.c
new file mode 100644
index 000000000..51b25ad85
--- /dev/null
+++ b/drivers/block/drbd/drbd_interval.c
@@ -0,0 +1,179 @@
+#include <asm/bug.h>
+#include <linux/rbtree_augmented.h>
+#include "drbd_interval.h"
+
+/**
+ * interval_end - return end of @node
+ */
+static inline
+sector_t interval_end(struct rb_node *node)
+{
+ struct drbd_interval *this = rb_entry(node, struct drbd_interval, rb);
+ return this->end;
+}
+
+/**
+ * compute_subtree_last - compute end of @node
+ *
+ * The end of an interval is the highest (start + (size >> 9)) value of this
+ * node and of its children. Called for @node and its parents whenever the end
+ * may have changed.
+ */
+static inline sector_t
+compute_subtree_last(struct drbd_interval *node)
+{
+ sector_t max = node->sector + (node->size >> 9);
+
+ if (node->rb.rb_left) {
+ sector_t left = interval_end(node->rb.rb_left);
+ if (left > max)
+ max = left;
+ }
+ if (node->rb.rb_right) {
+ sector_t right = interval_end(node->rb.rb_right);
+ if (right > max)
+ max = right;
+ }
+ return max;
+}
+
+RB_DECLARE_CALLBACKS(static, augment_callbacks, struct drbd_interval, rb,
+ sector_t, end, compute_subtree_last);
+
+/**
+ * drbd_insert_interval - insert a new interval into a tree
+ */
+bool
+drbd_insert_interval(struct rb_root *root, struct drbd_interval *this)
+{
+ struct rb_node **new = &root->rb_node, *parent = NULL;
+ sector_t this_end = this->sector + (this->size >> 9);
+
+ BUG_ON(!IS_ALIGNED(this->size, 512));
+
+ while (*new) {
+ struct drbd_interval *here =
+ rb_entry(*new, struct drbd_interval, rb);
+
+ parent = *new;
+ if (here->end < this_end)
+ here->end = this_end;
+ if (this->sector < here->sector)
+ new = &(*new)->rb_left;
+ else if (this->sector > here->sector)
+ new = &(*new)->rb_right;
+ else if (this < here)
+ new = &(*new)->rb_left;
+ else if (this > here)
+ new = &(*new)->rb_right;
+ else
+ return false;
+ }
+
+ this->end = this_end;
+ rb_link_node(&this->rb, parent, new);
+ rb_insert_augmented(&this->rb, root, &augment_callbacks);
+ return true;
+}
+
+/**
+ * drbd_contains_interval - check if a tree contains a given interval
+ * @sector: start sector of @interval
+ * @interval: may not be a valid pointer
+ *
+ * Returns if the tree contains the node @interval with start sector @start.
+ * Does not dereference @interval until @interval is known to be a valid object
+ * in @tree. Returns %false if @interval is in the tree but with a different
+ * sector number.
+ */
+bool
+drbd_contains_interval(struct rb_root *root, sector_t sector,
+ struct drbd_interval *interval)
+{
+ struct rb_node *node = root->rb_node;
+
+ while (node) {
+ struct drbd_interval *here =
+ rb_entry(node, struct drbd_interval, rb);
+
+ if (sector < here->sector)
+ node = node->rb_left;
+ else if (sector > here->sector)
+ node = node->rb_right;
+ else if (interval < here)
+ node = node->rb_left;
+ else if (interval > here)
+ node = node->rb_right;
+ else
+ return true;
+ }
+ return false;
+}
+
+/**
+ * drbd_remove_interval - remove an interval from a tree
+ */
+void
+drbd_remove_interval(struct rb_root *root, struct drbd_interval *this)
+{
+ rb_erase_augmented(&this->rb, root, &augment_callbacks);
+}
+
+/**
+ * drbd_find_overlap - search for an interval overlapping with [sector, sector + size)
+ * @sector: start sector
+ * @size: size, aligned to 512 bytes
+ *
+ * Returns an interval overlapping with [sector, sector + size), or NULL if
+ * there is none. When there is more than one overlapping interval in the
+ * tree, the interval with the lowest start sector is returned, and all other
+ * overlapping intervals will be on the right side of the tree, reachable with
+ * rb_next().
+ */
+struct drbd_interval *
+drbd_find_overlap(struct rb_root *root, sector_t sector, unsigned int size)
+{
+ struct rb_node *node = root->rb_node;
+ struct drbd_interval *overlap = NULL;
+ sector_t end = sector + (size >> 9);
+
+ BUG_ON(!IS_ALIGNED(size, 512));
+
+ while (node) {
+ struct drbd_interval *here =
+ rb_entry(node, struct drbd_interval, rb);
+
+ if (node->rb_left &&
+ sector < interval_end(node->rb_left)) {
+ /* Overlap if any must be on left side */
+ node = node->rb_left;
+ } else if (here->sector < end &&
+ sector < here->sector + (here->size >> 9)) {
+ overlap = here;
+ break;
+ } else if (sector >= here->sector) {
+ /* Overlap if any must be on right side */
+ node = node->rb_right;
+ } else
+ break;
+ }
+ return overlap;
+}
+
+struct drbd_interval *
+drbd_next_overlap(struct drbd_interval *i, sector_t sector, unsigned int size)
+{
+ sector_t end = sector + (size >> 9);
+ struct rb_node *node;
+
+ for (;;) {
+ node = rb_next(&i->rb);
+ if (!node)
+ return NULL;
+ i = rb_entry(node, struct drbd_interval, rb);
+ if (i->sector >= end)
+ return NULL;
+ if (sector < i->sector + (i->size >> 9))
+ return i;
+ }
+}
diff --git a/drivers/block/drbd/drbd_interval.h b/drivers/block/drbd/drbd_interval.h
new file mode 100644
index 000000000..f210543f0
--- /dev/null
+++ b/drivers/block/drbd/drbd_interval.h
@@ -0,0 +1,42 @@
+#ifndef __DRBD_INTERVAL_H
+#define __DRBD_INTERVAL_H
+
+#include <linux/types.h>
+#include <linux/rbtree.h>
+
+struct drbd_interval {
+ struct rb_node rb;
+ sector_t sector; /* start sector of the interval */
+ unsigned int size; /* size in bytes */
+ sector_t end; /* highest interval end in subtree */
+ int local:1 /* local or remote request? */;
+ int waiting:1; /* someone is waiting for this to complete */
+ int completed:1; /* this has been completed already;
+ * ignore for conflict detection */
+};
+
+static inline void drbd_clear_interval(struct drbd_interval *i)
+{
+ RB_CLEAR_NODE(&i->rb);
+}
+
+static inline bool drbd_interval_empty(struct drbd_interval *i)
+{
+ return RB_EMPTY_NODE(&i->rb);
+}
+
+extern bool drbd_insert_interval(struct rb_root *, struct drbd_interval *);
+extern bool drbd_contains_interval(struct rb_root *, sector_t,
+ struct drbd_interval *);
+extern void drbd_remove_interval(struct rb_root *, struct drbd_interval *);
+extern struct drbd_interval *drbd_find_overlap(struct rb_root *, sector_t,
+ unsigned int);
+extern struct drbd_interval *drbd_next_overlap(struct drbd_interval *, sector_t,
+ unsigned int);
+
+#define drbd_for_each_overlap(i, root, sector, size) \
+ for (i = drbd_find_overlap(root, sector, size); \
+ i; \
+ i = drbd_next_overlap(i, sector, size))
+
+#endif /* __DRBD_INTERVAL_H */
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
new file mode 100644
index 000000000..81fde9ef7
--- /dev/null
+++ b/drivers/block/drbd/drbd_main.c
@@ -0,0 +1,3844 @@
+/*
+ drbd.c
+
+ This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+ Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
+ Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+ Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+ Thanks to Carter Burden, Bart Grantham and Gennadiy Nerubayev
+ from Logicworks, Inc. for making SDP replication support possible.
+
+ drbd is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2, or (at your option)
+ any later version.
+
+ drbd is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with drbd; see the file COPYING. If not, write to
+ the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/jiffies.h>
+#include <linux/drbd.h>
+#include <asm/uaccess.h>
+#include <asm/types.h>
+#include <net/sock.h>
+#include <linux/ctype.h>
+#include <linux/mutex.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/proc_fs.h>
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/memcontrol.h>
+#include <linux/mm_inline.h>
+#include <linux/slab.h>
+#include <linux/random.h>
+#include <linux/reboot.h>
+#include <linux/notifier.h>
+#include <linux/kthread.h>
+#include <linux/workqueue.h>
+#define __KERNEL_SYSCALLS__
+#include <linux/unistd.h>
+#include <linux/vmalloc.h>
+
+#include <linux/drbd_limits.h>
+#include "drbd_int.h"
+#include "drbd_protocol.h"
+#include "drbd_req.h" /* only for _req_mod in tl_release and tl_clear */
+#include "drbd_vli.h"
+#include "drbd_debugfs.h"
+
+static DEFINE_MUTEX(drbd_main_mutex);
+static int drbd_open(struct block_device *bdev, fmode_t mode);
+static void drbd_release(struct gendisk *gd, fmode_t mode);
+static void md_sync_timer_fn(unsigned long data);
+static int w_bitmap_io(struct drbd_work *w, int unused);
+
+MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, "
+ "Lars Ellenberg <lars@linbit.com>");
+MODULE_DESCRIPTION("drbd - Distributed Replicated Block Device v" REL_VERSION);
+MODULE_VERSION(REL_VERSION);
+MODULE_LICENSE("GPL");
+MODULE_PARM_DESC(minor_count, "Approximate number of drbd devices ("
+ __stringify(DRBD_MINOR_COUNT_MIN) "-" __stringify(DRBD_MINOR_COUNT_MAX) ")");
+MODULE_ALIAS_BLOCKDEV_MAJOR(DRBD_MAJOR);
+
+#include <linux/moduleparam.h>
+/* allow_open_on_secondary */
+MODULE_PARM_DESC(allow_oos, "DONT USE!");
+/* thanks to these macros, if compiled into the kernel (not-module),
+ * this becomes the boot parameter drbd.minor_count */
+module_param(minor_count, uint, 0444);
+module_param(disable_sendpage, bool, 0644);
+module_param(allow_oos, bool, 0);
+module_param(proc_details, int, 0644);
+
+#ifdef CONFIG_DRBD_FAULT_INJECTION
+int enable_faults;
+int fault_rate;
+static int fault_count;
+int fault_devs;
+/* bitmap of enabled faults */
+module_param(enable_faults, int, 0664);
+/* fault rate % value - applies to all enabled faults */
+module_param(fault_rate, int, 0664);
+/* count of faults inserted */
+module_param(fault_count, int, 0664);
+/* bitmap of devices to insert faults on */
+module_param(fault_devs, int, 0644);
+#endif
+
+/* module parameter, defined */
+unsigned int minor_count = DRBD_MINOR_COUNT_DEF;
+bool disable_sendpage;
+bool allow_oos;
+int proc_details; /* Detail level in proc drbd*/
+
+/* Module parameter for setting the user mode helper program
+ * to run. Default is /sbin/drbdadm */
+char usermode_helper[80] = "/sbin/drbdadm";
+
+module_param_string(usermode_helper, usermode_helper, sizeof(usermode_helper), 0644);
+
+/* in 2.6.x, our device mapping and config info contains our virtual gendisks
+ * as member "struct gendisk *vdisk;"
+ */
+struct idr drbd_devices;
+struct list_head drbd_resources;
+
+struct kmem_cache *drbd_request_cache;
+struct kmem_cache *drbd_ee_cache; /* peer requests */
+struct kmem_cache *drbd_bm_ext_cache; /* bitmap extents */
+struct kmem_cache *drbd_al_ext_cache; /* activity log extents */
+mempool_t *drbd_request_mempool;
+mempool_t *drbd_ee_mempool;
+mempool_t *drbd_md_io_page_pool;
+struct bio_set *drbd_md_io_bio_set;
+
+/* I do not use a standard mempool, because:
+ 1) I want to hand out the pre-allocated objects first.
+ 2) I want to be able to interrupt sleeping allocation with a signal.
+ Note: This is a single linked list, the next pointer is the private
+ member of struct page.
+ */
+struct page *drbd_pp_pool;
+spinlock_t drbd_pp_lock;
+int drbd_pp_vacant;
+wait_queue_head_t drbd_pp_wait;
+
+DEFINE_RATELIMIT_STATE(drbd_ratelimit_state, 5 * HZ, 5);
+
+static const struct block_device_operations drbd_ops = {
+ .owner = THIS_MODULE,
+ .open = drbd_open,
+ .release = drbd_release,
+};
+
+struct bio *bio_alloc_drbd(gfp_t gfp_mask)
+{
+ struct bio *bio;
+
+ if (!drbd_md_io_bio_set)
+ return bio_alloc(gfp_mask, 1);
+
+ bio = bio_alloc_bioset(gfp_mask, 1, drbd_md_io_bio_set);
+ if (!bio)
+ return NULL;
+ return bio;
+}
+
+#ifdef __CHECKER__
+/* When checking with sparse, and this is an inline function, sparse will
+ give tons of false positives. When this is a real functions sparse works.
+ */
+int _get_ldev_if_state(struct drbd_device *device, enum drbd_disk_state mins)
+{
+ int io_allowed;
+
+ atomic_inc(&device->local_cnt);
+ io_allowed = (device->state.disk >= mins);
+ if (!io_allowed) {
+ if (atomic_dec_and_test(&device->local_cnt))
+ wake_up(&device->misc_wait);
+ }
+ return io_allowed;
+}
+
+#endif
+
+/**
+ * tl_release() - mark as BARRIER_ACKED all requests in the corresponding transfer log epoch
+ * @connection: DRBD connection.
+ * @barrier_nr: Expected identifier of the DRBD write barrier packet.
+ * @set_size: Expected number of requests before that barrier.
+ *
+ * In case the passed barrier_nr or set_size does not match the oldest
+ * epoch of not yet barrier-acked requests, this function will cause a
+ * termination of the connection.
+ */
+void tl_release(struct drbd_connection *connection, unsigned int barrier_nr,
+ unsigned int set_size)
+{
+ struct drbd_request *r;
+ struct drbd_request *req = NULL;
+ int expect_epoch = 0;
+ int expect_size = 0;
+
+ spin_lock_irq(&connection->resource->req_lock);
+
+ /* find oldest not yet barrier-acked write request,
+ * count writes in its epoch. */
+ list_for_each_entry(r, &connection->transfer_log, tl_requests) {
+ const unsigned s = r->rq_state;
+ if (!req) {
+ if (!(s & RQ_WRITE))
+ continue;
+ if (!(s & RQ_NET_MASK))
+ continue;
+ if (s & RQ_NET_DONE)
+ continue;
+ req = r;
+ expect_epoch = req->epoch;
+ expect_size ++;
+ } else {
+ if (r->epoch != expect_epoch)
+ break;
+ if (!(s & RQ_WRITE))
+ continue;
+ /* if (s & RQ_DONE): not expected */
+ /* if (!(s & RQ_NET_MASK)): not expected */
+ expect_size++;
+ }
+ }
+
+ /* first some paranoia code */
+ if (req == NULL) {
+ drbd_err(connection, "BAD! BarrierAck #%u received, but no epoch in tl!?\n",
+ barrier_nr);
+ goto bail;
+ }
+ if (expect_epoch != barrier_nr) {
+ drbd_err(connection, "BAD! BarrierAck #%u received, expected #%u!\n",
+ barrier_nr, expect_epoch);
+ goto bail;
+ }
+
+ if (expect_size != set_size) {
+ drbd_err(connection, "BAD! BarrierAck #%u received with n_writes=%u, expected n_writes=%u!\n",
+ barrier_nr, set_size, expect_size);
+ goto bail;
+ }
+
+ /* Clean up list of requests processed during current epoch. */
+ /* this extra list walk restart is paranoia,
+ * to catch requests being barrier-acked "unexpectedly".
+ * It usually should find the same req again, or some READ preceding it. */
+ list_for_each_entry(req, &connection->transfer_log, tl_requests)
+ if (req->epoch == expect_epoch)
+ break;
+ list_for_each_entry_safe_from(req, r, &connection->transfer_log, tl_requests) {
+ if (req->epoch != expect_epoch)
+ break;
+ _req_mod(req, BARRIER_ACKED);
+ }
+ spin_unlock_irq(&connection->resource->req_lock);
+
+ return;
+
+bail:
+ spin_unlock_irq(&connection->resource->req_lock);
+ conn_request_state(connection, NS(conn, C_PROTOCOL_ERROR), CS_HARD);
+}
+
+
+/**
+ * _tl_restart() - Walks the transfer log, and applies an action to all requests
+ * @connection: DRBD connection to operate on.
+ * @what: The action/event to perform with all request objects
+ *
+ * @what might be one of CONNECTION_LOST_WHILE_PENDING, RESEND, FAIL_FROZEN_DISK_IO,
+ * RESTART_FROZEN_DISK_IO.
+ */
+/* must hold resource->req_lock */
+void _tl_restart(struct drbd_connection *connection, enum drbd_req_event what)
+{
+ struct drbd_request *req, *r;
+
+ list_for_each_entry_safe(req, r, &connection->transfer_log, tl_requests)
+ _req_mod(req, what);
+}
+
+void tl_restart(struct drbd_connection *connection, enum drbd_req_event what)
+{
+ spin_lock_irq(&connection->resource->req_lock);
+ _tl_restart(connection, what);
+ spin_unlock_irq(&connection->resource->req_lock);
+}
+
+/**
+ * tl_clear() - Clears all requests and &struct drbd_tl_epoch objects out of the TL
+ * @device: DRBD device.
+ *
+ * This is called after the connection to the peer was lost. The storage covered
+ * by the requests on the transfer gets marked as our of sync. Called from the
+ * receiver thread and the worker thread.
+ */
+void tl_clear(struct drbd_connection *connection)
+{
+ tl_restart(connection, CONNECTION_LOST_WHILE_PENDING);
+}
+
+/**
+ * tl_abort_disk_io() - Abort disk I/O for all requests for a certain device in the TL
+ * @device: DRBD device.
+ */
+void tl_abort_disk_io(struct drbd_device *device)
+{
+ struct drbd_connection *connection = first_peer_device(device)->connection;
+ struct drbd_request *req, *r;
+
+ spin_lock_irq(&connection->resource->req_lock);
+ list_for_each_entry_safe(req, r, &connection->transfer_log, tl_requests) {
+ if (!(req->rq_state & RQ_LOCAL_PENDING))
+ continue;
+ if (req->device != device)
+ continue;
+ _req_mod(req, ABORT_DISK_IO);
+ }
+ spin_unlock_irq(&connection->resource->req_lock);
+}
+
+static int drbd_thread_setup(void *arg)
+{
+ struct drbd_thread *thi = (struct drbd_thread *) arg;
+ struct drbd_resource *resource = thi->resource;
+ unsigned long flags;
+ int retval;
+
+ snprintf(current->comm, sizeof(current->comm), "drbd_%c_%s",
+ thi->name[0],
+ resource->name);
+
+restart:
+ retval = thi->function(thi);
+
+ spin_lock_irqsave(&thi->t_lock, flags);
+
+ /* if the receiver has been "EXITING", the last thing it did
+ * was set the conn state to "StandAlone",
+ * if now a re-connect request comes in, conn state goes C_UNCONNECTED,
+ * and receiver thread will be "started".
+ * drbd_thread_start needs to set "RESTARTING" in that case.
+ * t_state check and assignment needs to be within the same spinlock,
+ * so either thread_start sees EXITING, and can remap to RESTARTING,
+ * or thread_start see NONE, and can proceed as normal.
+ */
+
+ if (thi->t_state == RESTARTING) {
+ drbd_info(resource, "Restarting %s thread\n", thi->name);
+ thi->t_state = RUNNING;
+ spin_unlock_irqrestore(&thi->t_lock, flags);
+ goto restart;
+ }
+
+ thi->task = NULL;
+ thi->t_state = NONE;
+ smp_mb();
+ complete_all(&thi->stop);
+ spin_unlock_irqrestore(&thi->t_lock, flags);
+
+ drbd_info(resource, "Terminating %s\n", current->comm);
+
+ /* Release mod reference taken when thread was started */
+
+ if (thi->connection)
+ kref_put(&thi->connection->kref, drbd_destroy_connection);
+ kref_put(&resource->kref, drbd_destroy_resource);
+ module_put(THIS_MODULE);
+ return retval;
+}
+
+static void drbd_thread_init(struct drbd_resource *resource, struct drbd_thread *thi,
+ int (*func) (struct drbd_thread *), const char *name)
+{
+ spin_lock_init(&thi->t_lock);
+ thi->task = NULL;
+ thi->t_state = NONE;
+ thi->function = func;
+ thi->resource = resource;
+ thi->connection = NULL;
+ thi->name = name;
+}
+
+int drbd_thread_start(struct drbd_thread *thi)
+{
+ struct drbd_resource *resource = thi->resource;
+ struct task_struct *nt;
+ unsigned long flags;
+
+ /* is used from state engine doing drbd_thread_stop_nowait,
+ * while holding the req lock irqsave */
+ spin_lock_irqsave(&thi->t_lock, flags);
+
+ switch (thi->t_state) {
+ case NONE:
+ drbd_info(resource, "Starting %s thread (from %s [%d])\n",
+ thi->name, current->comm, current->pid);
+
+ /* Get ref on module for thread - this is released when thread exits */
+ if (!try_module_get(THIS_MODULE)) {
+ drbd_err(resource, "Failed to get module reference in drbd_thread_start\n");
+ spin_unlock_irqrestore(&thi->t_lock, flags);
+ return false;
+ }
+
+ kref_get(&resource->kref);
+ if (thi->connection)
+ kref_get(&thi->connection->kref);
+
+ init_completion(&thi->stop);
+ thi->reset_cpu_mask = 1;
+ thi->t_state = RUNNING;
+ spin_unlock_irqrestore(&thi->t_lock, flags);
+ flush_signals(current); /* otherw. may get -ERESTARTNOINTR */
+
+ nt = kthread_create(drbd_thread_setup, (void *) thi,
+ "drbd_%c_%s", thi->name[0], thi->resource->name);
+
+ if (IS_ERR(nt)) {
+ drbd_err(resource, "Couldn't start thread\n");
+
+ if (thi->connection)
+ kref_put(&thi->connection->kref, drbd_destroy_connection);
+ kref_put(&resource->kref, drbd_destroy_resource);
+ module_put(THIS_MODULE);
+ return false;
+ }
+ spin_lock_irqsave(&thi->t_lock, flags);
+ thi->task = nt;
+ thi->t_state = RUNNING;
+ spin_unlock_irqrestore(&thi->t_lock, flags);
+ wake_up_process(nt);
+ break;
+ case EXITING:
+ thi->t_state = RESTARTING;
+ drbd_info(resource, "Restarting %s thread (from %s [%d])\n",
+ thi->name, current->comm, current->pid);
+ /* fall through */
+ case RUNNING:
+ case RESTARTING:
+ default:
+ spin_unlock_irqrestore(&thi->t_lock, flags);
+ break;
+ }
+
+ return true;
+}
+
+
+void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait)
+{
+ unsigned long flags;
+
+ enum drbd_thread_state ns = restart ? RESTARTING : EXITING;
+
+ /* may be called from state engine, holding the req lock irqsave */
+ spin_lock_irqsave(&thi->t_lock, flags);
+
+ if (thi->t_state == NONE) {
+ spin_unlock_irqrestore(&thi->t_lock, flags);
+ if (restart)
+ drbd_thread_start(thi);
+ return;
+ }
+
+ if (thi->t_state != ns) {
+ if (thi->task == NULL) {
+ spin_unlock_irqrestore(&thi->t_lock, flags);
+ return;
+ }
+
+ thi->t_state = ns;
+ smp_mb();
+ init_completion(&thi->stop);
+ if (thi->task != current)
+ force_sig(DRBD_SIGKILL, thi->task);
+ }
+
+ spin_unlock_irqrestore(&thi->t_lock, flags);
+
+ if (wait)
+ wait_for_completion(&thi->stop);
+}
+
+int conn_lowest_minor(struct drbd_connection *connection)
+{
+ struct drbd_peer_device *peer_device;
+ int vnr = 0, minor = -1;
+
+ rcu_read_lock();
+ peer_device = idr_get_next(&connection->peer_devices, &vnr);
+ if (peer_device)
+ minor = device_to_minor(peer_device->device);
+ rcu_read_unlock();
+
+ return minor;
+}
+
+#ifdef CONFIG_SMP
+/**
+ * drbd_calc_cpu_mask() - Generate CPU masks, spread over all CPUs
+ *
+ * Forces all threads of a resource onto the same CPU. This is beneficial for
+ * DRBD's performance. May be overwritten by user's configuration.
+ */
+static void drbd_calc_cpu_mask(cpumask_var_t *cpu_mask)
+{
+ unsigned int *resources_per_cpu, min_index = ~0;
+
+ resources_per_cpu = kzalloc(nr_cpu_ids * sizeof(*resources_per_cpu), GFP_KERNEL);
+ if (resources_per_cpu) {
+ struct drbd_resource *resource;
+ unsigned int cpu, min = ~0;
+
+ rcu_read_lock();
+ for_each_resource_rcu(resource, &drbd_resources) {
+ for_each_cpu(cpu, resource->cpu_mask)
+ resources_per_cpu[cpu]++;
+ }
+ rcu_read_unlock();
+ for_each_online_cpu(cpu) {
+ if (resources_per_cpu[cpu] < min) {
+ min = resources_per_cpu[cpu];
+ min_index = cpu;
+ }
+ }
+ kfree(resources_per_cpu);
+ }
+ if (min_index == ~0) {
+ cpumask_setall(*cpu_mask);
+ return;
+ }
+ cpumask_set_cpu(min_index, *cpu_mask);
+}
+
+/**
+ * drbd_thread_current_set_cpu() - modifies the cpu mask of the _current_ thread
+ * @device: DRBD device.
+ * @thi: drbd_thread object
+ *
+ * call in the "main loop" of _all_ threads, no need for any mutex, current won't die
+ * prematurely.
+ */
+void drbd_thread_current_set_cpu(struct drbd_thread *thi)
+{
+ struct drbd_resource *resource = thi->resource;
+ struct task_struct *p = current;
+
+ if (!thi->reset_cpu_mask)
+ return;
+ thi->reset_cpu_mask = 0;
+ set_cpus_allowed_ptr(p, resource->cpu_mask);
+}
+#else
+#define drbd_calc_cpu_mask(A) ({})
+#endif
+
+/**
+ * drbd_header_size - size of a packet header
+ *
+ * The header size is a multiple of 8, so any payload following the header is
+ * word aligned on 64-bit architectures. (The bitmap send and receive code
+ * relies on this.)
+ */
+unsigned int drbd_header_size(struct drbd_connection *connection)
+{
+ if (connection->agreed_pro_version >= 100) {
+ BUILD_BUG_ON(!IS_ALIGNED(sizeof(struct p_header100), 8));
+ return sizeof(struct p_header100);
+ } else {
+ BUILD_BUG_ON(sizeof(struct p_header80) !=
+ sizeof(struct p_header95));
+ BUILD_BUG_ON(!IS_ALIGNED(sizeof(struct p_header80), 8));
+ return sizeof(struct p_header80);
+ }
+}
+
+static unsigned int prepare_header80(struct p_header80 *h, enum drbd_packet cmd, int size)
+{
+ h->magic = cpu_to_be32(DRBD_MAGIC);
+ h->command = cpu_to_be16(cmd);
+ h->length = cpu_to_be16(size);
+ return sizeof(struct p_header80);
+}
+
+static unsigned int prepare_header95(struct p_header95 *h, enum drbd_packet cmd, int size)
+{
+ h->magic = cpu_to_be16(DRBD_MAGIC_BIG);
+ h->command = cpu_to_be16(cmd);
+ h->length = cpu_to_be32(size);
+ return sizeof(struct p_header95);
+}
+
+static unsigned int prepare_header100(struct p_header100 *h, enum drbd_packet cmd,
+ int size, int vnr)
+{
+ h->magic = cpu_to_be32(DRBD_MAGIC_100);
+ h->volume = cpu_to_be16(vnr);
+ h->command = cpu_to_be16(cmd);
+ h->length = cpu_to_be32(size);
+ h->pad = 0;
+ return sizeof(struct p_header100);
+}
+
+static unsigned int prepare_header(struct drbd_connection *connection, int vnr,
+ void *buffer, enum drbd_packet cmd, int size)
+{
+ if (connection->agreed_pro_version >= 100)
+ return prepare_header100(buffer, cmd, size, vnr);
+ else if (connection->agreed_pro_version >= 95 &&
+ size > DRBD_MAX_SIZE_H80_PACKET)
+ return prepare_header95(buffer, cmd, size);
+ else
+ return prepare_header80(buffer, cmd, size);
+}
+
+static void *__conn_prepare_command(struct drbd_connection *connection,
+ struct drbd_socket *sock)
+{
+ if (!sock->socket)
+ return NULL;
+ return sock->sbuf + drbd_header_size(connection);
+}
+
+void *conn_prepare_command(struct drbd_connection *connection, struct drbd_socket *sock)
+{
+ void *p;
+
+ mutex_lock(&sock->mutex);
+ p = __conn_prepare_command(connection, sock);
+ if (!p)
+ mutex_unlock(&sock->mutex);
+
+ return p;
+}
+
+void *drbd_prepare_command(struct drbd_peer_device *peer_device, struct drbd_socket *sock)
+{
+ return conn_prepare_command(peer_device->connection, sock);
+}
+
+static int __send_command(struct drbd_connection *connection, int vnr,
+ struct drbd_socket *sock, enum drbd_packet cmd,
+ unsigned int header_size, void *data,
+ unsigned int size)
+{
+ int msg_flags;
+ int err;
+
+ /*
+ * Called with @data == NULL and the size of the data blocks in @size
+ * for commands that send data blocks. For those commands, omit the
+ * MSG_MORE flag: this will increase the likelihood that data blocks
+ * which are page aligned on the sender will end up page aligned on the
+ * receiver.
+ */
+ msg_flags = data ? MSG_MORE : 0;
+
+ header_size += prepare_header(connection, vnr, sock->sbuf, cmd,
+ header_size + size);
+ err = drbd_send_all(connection, sock->socket, sock->sbuf, header_size,
+ msg_flags);
+ if (data && !err)
+ err = drbd_send_all(connection, sock->socket, data, size, 0);
+ /* DRBD protocol "pings" are latency critical.
+ * This is supposed to trigger tcp_push_pending_frames() */
+ if (!err && (cmd == P_PING || cmd == P_PING_ACK))
+ drbd_tcp_nodelay(sock->socket);
+
+ return err;
+}
+
+static int __conn_send_command(struct drbd_connection *connection, struct drbd_socket *sock,
+ enum drbd_packet cmd, unsigned int header_size,
+ void *data, unsigned int size)
+{
+ return __send_command(connection, 0, sock, cmd, header_size, data, size);
+}
+
+int conn_send_command(struct drbd_connection *connection, struct drbd_socket *sock,
+ enum drbd_packet cmd, unsigned int header_size,
+ void *data, unsigned int size)
+{
+ int err;
+
+ err = __conn_send_command(connection, sock, cmd, header_size, data, size);
+ mutex_unlock(&sock->mutex);
+ return err;
+}
+
+int drbd_send_command(struct drbd_peer_device *peer_device, struct drbd_socket *sock,
+ enum drbd_packet cmd, unsigned int header_size,
+ void *data, unsigned int size)
+{
+ int err;
+
+ err = __send_command(peer_device->connection, peer_device->device->vnr,
+ sock, cmd, header_size, data, size);
+ mutex_unlock(&sock->mutex);
+ return err;
+}
+
+int drbd_send_ping(struct drbd_connection *connection)
+{
+ struct drbd_socket *sock;
+
+ sock = &connection->meta;
+ if (!conn_prepare_command(connection, sock))
+ return -EIO;
+ return conn_send_command(connection, sock, P_PING, 0, NULL, 0);
+}
+
+int drbd_send_ping_ack(struct drbd_connection *connection)
+{
+ struct drbd_socket *sock;
+
+ sock = &connection->meta;
+ if (!conn_prepare_command(connection, sock))
+ return -EIO;
+ return conn_send_command(connection, sock, P_PING_ACK, 0, NULL, 0);
+}
+
+int drbd_send_sync_param(struct drbd_peer_device *peer_device)
+{
+ struct drbd_socket *sock;
+ struct p_rs_param_95 *p;
+ int size;
+ const int apv = peer_device->connection->agreed_pro_version;
+ enum drbd_packet cmd;
+ struct net_conf *nc;
+ struct disk_conf *dc;
+
+ sock = &peer_device->connection->data;
+ p = drbd_prepare_command(peer_device, sock);
+ if (!p)
+ return -EIO;
+
+ rcu_read_lock();
+ nc = rcu_dereference(peer_device->connection->net_conf);
+
+ size = apv <= 87 ? sizeof(struct p_rs_param)
+ : apv == 88 ? sizeof(struct p_rs_param)
+ + strlen(nc->verify_alg) + 1
+ : apv <= 94 ? sizeof(struct p_rs_param_89)
+ : /* apv >= 95 */ sizeof(struct p_rs_param_95);
+
+ cmd = apv >= 89 ? P_SYNC_PARAM89 : P_SYNC_PARAM;
+
+ /* initialize verify_alg and csums_alg */
+ memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
+
+ if (get_ldev(peer_device->device)) {
+ dc = rcu_dereference(peer_device->device->ldev->disk_conf);
+ p->resync_rate = cpu_to_be32(dc->resync_rate);
+ p->c_plan_ahead = cpu_to_be32(dc->c_plan_ahead);
+ p->c_delay_target = cpu_to_be32(dc->c_delay_target);
+ p->c_fill_target = cpu_to_be32(dc->c_fill_target);
+ p->c_max_rate = cpu_to_be32(dc->c_max_rate);
+ put_ldev(peer_device->device);
+ } else {
+ p->resync_rate = cpu_to_be32(DRBD_RESYNC_RATE_DEF);
+ p->c_plan_ahead = cpu_to_be32(DRBD_C_PLAN_AHEAD_DEF);
+ p->c_delay_target = cpu_to_be32(DRBD_C_DELAY_TARGET_DEF);
+ p->c_fill_target = cpu_to_be32(DRBD_C_FILL_TARGET_DEF);
+ p->c_max_rate = cpu_to_be32(DRBD_C_MAX_RATE_DEF);
+ }
+
+ if (apv >= 88)
+ strcpy(p->verify_alg, nc->verify_alg);
+ if (apv >= 89)
+ strcpy(p->csums_alg, nc->csums_alg);
+ rcu_read_unlock();
+
+ return drbd_send_command(peer_device, sock, cmd, size, NULL, 0);
+}
+
+int __drbd_send_protocol(struct drbd_connection *connection, enum drbd_packet cmd)
+{
+ struct drbd_socket *sock;
+ struct p_protocol *p;
+ struct net_conf *nc;
+ int size, cf;
+
+ sock = &connection->data;
+ p = __conn_prepare_command(connection, sock);
+ if (!p)
+ return -EIO;
+
+ rcu_read_lock();
+ nc = rcu_dereference(connection->net_conf);
+
+ if (nc->tentative && connection->agreed_pro_version < 92) {
+ rcu_read_unlock();
+ mutex_unlock(&sock->mutex);
+ drbd_err(connection, "--dry-run is not supported by peer");
+ return -EOPNOTSUPP;
+ }
+
+ size = sizeof(*p);
+ if (connection->agreed_pro_version >= 87)
+ size += strlen(nc->integrity_alg) + 1;
+
+ p->protocol = cpu_to_be32(nc->wire_protocol);
+ p->after_sb_0p = cpu_to_be32(nc->after_sb_0p);
+ p->after_sb_1p = cpu_to_be32(nc->after_sb_1p);
+ p->after_sb_2p = cpu_to_be32(nc->after_sb_2p);
+ p->two_primaries = cpu_to_be32(nc->two_primaries);
+ cf = 0;
+ if (nc->discard_my_data)
+ cf |= CF_DISCARD_MY_DATA;
+ if (nc->tentative)
+ cf |= CF_DRY_RUN;
+ p->conn_flags = cpu_to_be32(cf);
+
+ if (connection->agreed_pro_version >= 87)
+ strcpy(p->integrity_alg, nc->integrity_alg);
+ rcu_read_unlock();
+
+ return __conn_send_command(connection, sock, cmd, size, NULL, 0);
+}
+
+int drbd_send_protocol(struct drbd_connection *connection)
+{
+ int err;
+
+ mutex_lock(&connection->data.mutex);
+ err = __drbd_send_protocol(connection, P_PROTOCOL);
+ mutex_unlock(&connection->data.mutex);
+
+ return err;
+}
+
+static int _drbd_send_uuids(struct drbd_peer_device *peer_device, u64 uuid_flags)
+{
+ struct drbd_device *device = peer_device->device;
+ struct drbd_socket *sock;
+ struct p_uuids *p;
+ int i;
+
+ if (!get_ldev_if_state(device, D_NEGOTIATING))
+ return 0;
+
+ sock = &peer_device->connection->data;
+ p = drbd_prepare_command(peer_device, sock);
+ if (!p) {
+ put_ldev(device);
+ return -EIO;
+ }
+ spin_lock_irq(&device->ldev->md.uuid_lock);
+ for (i = UI_CURRENT; i < UI_SIZE; i++)
+ p->uuid[i] = cpu_to_be64(device->ldev->md.uuid[i]);
+ spin_unlock_irq(&device->ldev->md.uuid_lock);
+
+ device->comm_bm_set = drbd_bm_total_weight(device);
+ p->uuid[UI_SIZE] = cpu_to_be64(device->comm_bm_set);
+ rcu_read_lock();
+ uuid_flags |= rcu_dereference(peer_device->connection->net_conf)->discard_my_data ? 1 : 0;
+ rcu_read_unlock();
+ uuid_flags |= test_bit(CRASHED_PRIMARY, &device->flags) ? 2 : 0;
+ uuid_flags |= device->new_state_tmp.disk == D_INCONSISTENT ? 4 : 0;
+ p->uuid[UI_FLAGS] = cpu_to_be64(uuid_flags);
+
+ put_ldev(device);
+ return drbd_send_command(peer_device, sock, P_UUIDS, sizeof(*p), NULL, 0);
+}
+
+int drbd_send_uuids(struct drbd_peer_device *peer_device)
+{
+ return _drbd_send_uuids(peer_device, 0);
+}
+
+int drbd_send_uuids_skip_initial_sync(struct drbd_peer_device *peer_device)
+{
+ return _drbd_send_uuids(peer_device, 8);
+}
+
+void drbd_print_uuids(struct drbd_device *device, const char *text)
+{
+ if (get_ldev_if_state(device, D_NEGOTIATING)) {
+ u64 *uuid = device->ldev->md.uuid;
+ drbd_info(device, "%s %016llX:%016llX:%016llX:%016llX\n",
+ text,
+ (unsigned long long)uuid[UI_CURRENT],
+ (unsigned long long)uuid[UI_BITMAP],
+ (unsigned long long)uuid[UI_HISTORY_START],
+ (unsigned long long)uuid[UI_HISTORY_END]);
+ put_ldev(device);
+ } else {
+ drbd_info(device, "%s effective data uuid: %016llX\n",
+ text,
+ (unsigned long long)device->ed_uuid);
+ }
+}
+
+void drbd_gen_and_send_sync_uuid(struct drbd_peer_device *peer_device)
+{
+ struct drbd_device *device = peer_device->device;
+ struct drbd_socket *sock;
+ struct p_rs_uuid *p;
+ u64 uuid;
+
+ D_ASSERT(device, device->state.disk == D_UP_TO_DATE);
+
+ uuid = device->ldev->md.uuid[UI_BITMAP];
+ if (uuid && uuid != UUID_JUST_CREATED)
+ uuid = uuid + UUID_NEW_BM_OFFSET;
+ else
+ get_random_bytes(&uuid, sizeof(u64));
+ drbd_uuid_set(device, UI_BITMAP, uuid);
+ drbd_print_uuids(device, "updated sync UUID");
+ drbd_md_sync(device);
+
+ sock = &peer_device->connection->data;
+ p = drbd_prepare_command(peer_device, sock);
+ if (p) {
+ p->uuid = cpu_to_be64(uuid);
+ drbd_send_command(peer_device, sock, P_SYNC_UUID, sizeof(*p), NULL, 0);
+ }
+}
+
+int drbd_send_sizes(struct drbd_peer_device *peer_device, int trigger_reply, enum dds_flags flags)
+{
+ struct drbd_device *device = peer_device->device;
+ struct drbd_socket *sock;
+ struct p_sizes *p;
+ sector_t d_size, u_size;
+ int q_order_type;
+ unsigned int max_bio_size;
+
+ if (get_ldev_if_state(device, D_NEGOTIATING)) {
+ D_ASSERT(device, device->ldev->backing_bdev);
+ d_size = drbd_get_max_capacity(device->ldev);
+ rcu_read_lock();
+ u_size = rcu_dereference(device->ldev->disk_conf)->disk_size;
+ rcu_read_unlock();
+ q_order_type = drbd_queue_order_type(device);
+ max_bio_size = queue_max_hw_sectors(device->ldev->backing_bdev->bd_disk->queue) << 9;
+ max_bio_size = min(max_bio_size, DRBD_MAX_BIO_SIZE);
+ put_ldev(device);
+ } else {
+ d_size = 0;
+ u_size = 0;
+ q_order_type = QUEUE_ORDERED_NONE;
+ max_bio_size = DRBD_MAX_BIO_SIZE; /* ... multiple BIOs per peer_request */
+ }
+
+ sock = &peer_device->connection->data;
+ p = drbd_prepare_command(peer_device, sock);
+ if (!p)
+ return -EIO;
+
+ if (peer_device->connection->agreed_pro_version <= 94)
+ max_bio_size = min(max_bio_size, DRBD_MAX_SIZE_H80_PACKET);
+ else if (peer_device->connection->agreed_pro_version < 100)
+ max_bio_size = min(max_bio_size, DRBD_MAX_BIO_SIZE_P95);
+
+ p->d_size = cpu_to_be64(d_size);
+ p->u_size = cpu_to_be64(u_size);
+ p->c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(device->this_bdev));
+ p->max_bio_size = cpu_to_be32(max_bio_size);
+ p->queue_order_type = cpu_to_be16(q_order_type);
+ p->dds_flags = cpu_to_be16(flags);
+ return drbd_send_command(peer_device, sock, P_SIZES, sizeof(*p), NULL, 0);
+}
+
+/**
+ * drbd_send_current_state() - Sends the drbd state to the peer
+ * @peer_device: DRBD peer device.
+ */
+int drbd_send_current_state(struct drbd_peer_device *peer_device)
+{
+ struct drbd_socket *sock;
+ struct p_state *p;
+
+ sock = &peer_device->connection->data;
+ p = drbd_prepare_command(peer_device, sock);
+ if (!p)
+ return -EIO;
+ p->state = cpu_to_be32(peer_device->device->state.i); /* Within the send mutex */
+ return drbd_send_command(peer_device, sock, P_STATE, sizeof(*p), NULL, 0);
+}
+
+/**
+ * drbd_send_state() - After a state change, sends the new state to the peer
+ * @peer_device: DRBD peer device.
+ * @state: the state to send, not necessarily the current state.
+ *
+ * Each state change queues an "after_state_ch" work, which will eventually
+ * send the resulting new state to the peer. If more state changes happen
+ * between queuing and processing of the after_state_ch work, we still
+ * want to send each intermediary state in the order it occurred.
+ */
+int drbd_send_state(struct drbd_peer_device *peer_device, union drbd_state state)
+{
+ struct drbd_socket *sock;
+ struct p_state *p;
+
+ sock = &peer_device->connection->data;
+ p = drbd_prepare_command(peer_device, sock);
+ if (!p)
+ return -EIO;
+ p->state = cpu_to_be32(state.i); /* Within the send mutex */
+ return drbd_send_command(peer_device, sock, P_STATE, sizeof(*p), NULL, 0);
+}
+
+int drbd_send_state_req(struct drbd_peer_device *peer_device, union drbd_state mask, union drbd_state val)
+{
+ struct drbd_socket *sock;
+ struct p_req_state *p;
+
+ sock = &peer_device->connection->data;
+ p = drbd_prepare_command(peer_device, sock);
+ if (!p)
+ return -EIO;
+ p->mask = cpu_to_be32(mask.i);
+ p->val = cpu_to_be32(val.i);
+ return drbd_send_command(peer_device, sock, P_STATE_CHG_REQ, sizeof(*p), NULL, 0);
+}
+
+int conn_send_state_req(struct drbd_connection *connection, union drbd_state mask, union drbd_state val)
+{
+ enum drbd_packet cmd;
+ struct drbd_socket *sock;
+ struct p_req_state *p;
+
+ cmd = connection->agreed_pro_version < 100 ? P_STATE_CHG_REQ : P_CONN_ST_CHG_REQ;
+ sock = &connection->data;
+ p = conn_prepare_command(connection, sock);
+ if (!p)
+ return -EIO;
+ p->mask = cpu_to_be32(mask.i);
+ p->val = cpu_to_be32(val.i);
+ return conn_send_command(connection, sock, cmd, sizeof(*p), NULL, 0);
+}
+
+void drbd_send_sr_reply(struct drbd_peer_device *peer_device, enum drbd_state_rv retcode)
+{
+ struct drbd_socket *sock;
+ struct p_req_state_reply *p;
+
+ sock = &peer_device->connection->meta;
+ p = drbd_prepare_command(peer_device, sock);
+ if (p) {
+ p->retcode = cpu_to_be32(retcode);
+ drbd_send_command(peer_device, sock, P_STATE_CHG_REPLY, sizeof(*p), NULL, 0);
+ }
+}
+
+void conn_send_sr_reply(struct drbd_connection *connection, enum drbd_state_rv retcode)
+{
+ struct drbd_socket *sock;
+ struct p_req_state_reply *p;
+ enum drbd_packet cmd = connection->agreed_pro_version < 100 ? P_STATE_CHG_REPLY : P_CONN_ST_CHG_REPLY;
+
+ sock = &connection->meta;
+ p = conn_prepare_command(connection, sock);
+ if (p) {
+ p->retcode = cpu_to_be32(retcode);
+ conn_send_command(connection, sock, cmd, sizeof(*p), NULL, 0);
+ }
+}
+
+static void dcbp_set_code(struct p_compressed_bm *p, enum drbd_bitmap_code code)
+{
+ BUG_ON(code & ~0xf);
+ p->encoding = (p->encoding & ~0xf) | code;
+}
+
+static void dcbp_set_start(struct p_compressed_bm *p, int set)
+{
+ p->encoding = (p->encoding & ~0x80) | (set ? 0x80 : 0);
+}
+
+static void dcbp_set_pad_bits(struct p_compressed_bm *p, int n)
+{
+ BUG_ON(n & ~0x7);
+ p->encoding = (p->encoding & (~0x7 << 4)) | (n << 4);
+}
+
+static int fill_bitmap_rle_bits(struct drbd_device *device,
+ struct p_compressed_bm *p,
+ unsigned int size,
+ struct bm_xfer_ctx *c)
+{
+ struct bitstream bs;
+ unsigned long plain_bits;
+ unsigned long tmp;
+ unsigned long rl;
+ unsigned len;
+ unsigned toggle;
+ int bits, use_rle;
+
+ /* may we use this feature? */
+ rcu_read_lock();
+ use_rle = rcu_dereference(first_peer_device(device)->connection->net_conf)->use_rle;
+ rcu_read_unlock();
+ if (!use_rle || first_peer_device(device)->connection->agreed_pro_version < 90)
+ return 0;
+
+ if (c->bit_offset >= c->bm_bits)
+ return 0; /* nothing to do. */
+
+ /* use at most thus many bytes */
+ bitstream_init(&bs, p->code, size, 0);
+ memset(p->code, 0, size);
+ /* plain bits covered in this code string */
+ plain_bits = 0;
+
+ /* p->encoding & 0x80 stores whether the first run length is set.
+ * bit offset is implicit.
+ * start with toggle == 2 to be able to tell the first iteration */
+ toggle = 2;
+
+ /* see how much plain bits we can stuff into one packet
+ * using RLE and VLI. */
+ do {
+ tmp = (toggle == 0) ? _drbd_bm_find_next_zero(device, c->bit_offset)
+ : _drbd_bm_find_next(device, c->bit_offset);
+ if (tmp == -1UL)
+ tmp = c->bm_bits;
+ rl = tmp - c->bit_offset;
+
+ if (toggle == 2) { /* first iteration */
+ if (rl == 0) {
+ /* the first checked bit was set,
+ * store start value, */
+ dcbp_set_start(p, 1);
+ /* but skip encoding of zero run length */
+ toggle = !toggle;
+ continue;
+ }
+ dcbp_set_start(p, 0);
+ }
+
+ /* paranoia: catch zero runlength.
+ * can only happen if bitmap is modified while we scan it. */
+ if (rl == 0) {
+ drbd_err(device, "unexpected zero runlength while encoding bitmap "
+ "t:%u bo:%lu\n", toggle, c->bit_offset);
+ return -1;
+ }
+
+ bits = vli_encode_bits(&bs, rl);
+ if (bits == -ENOBUFS) /* buffer full */
+ break;
+ if (bits <= 0) {
+ drbd_err(device, "error while encoding bitmap: %d\n", bits);
+ return 0;
+ }
+
+ toggle = !toggle;
+ plain_bits += rl;
+ c->bit_offset = tmp;
+ } while (c->bit_offset < c->bm_bits);
+
+ len = bs.cur.b - p->code + !!bs.cur.bit;
+
+ if (plain_bits < (len << 3)) {
+ /* incompressible with this method.
+ * we need to rewind both word and bit position. */
+ c->bit_offset -= plain_bits;
+ bm_xfer_ctx_bit_to_word_offset(c);
+ c->bit_offset = c->word_offset * BITS_PER_LONG;
+ return 0;
+ }
+
+ /* RLE + VLI was able to compress it just fine.
+ * update c->word_offset. */
+ bm_xfer_ctx_bit_to_word_offset(c);
+
+ /* store pad_bits */
+ dcbp_set_pad_bits(p, (8 - bs.cur.bit) & 0x7);
+
+ return len;
+}
+
+/**
+ * send_bitmap_rle_or_plain
+ *
+ * Return 0 when done, 1 when another iteration is needed, and a negative error
+ * code upon failure.
+ */
+static int
+send_bitmap_rle_or_plain(struct drbd_device *device, struct bm_xfer_ctx *c)
+{
+ struct drbd_socket *sock = &first_peer_device(device)->connection->data;
+ unsigned int header_size = drbd_header_size(first_peer_device(device)->connection);
+ struct p_compressed_bm *p = sock->sbuf + header_size;
+ int len, err;
+
+ len = fill_bitmap_rle_bits(device, p,
+ DRBD_SOCKET_BUFFER_SIZE - header_size - sizeof(*p), c);
+ if (len < 0)
+ return -EIO;
+
+ if (len) {
+ dcbp_set_code(p, RLE_VLI_Bits);
+ err = __send_command(first_peer_device(device)->connection, device->vnr, sock,
+ P_COMPRESSED_BITMAP, sizeof(*p) + len,
+ NULL, 0);
+ c->packets[0]++;
+ c->bytes[0] += header_size + sizeof(*p) + len;
+
+ if (c->bit_offset >= c->bm_bits)
+ len = 0; /* DONE */
+ } else {
+ /* was not compressible.
+ * send a buffer full of plain text bits instead. */
+ unsigned int data_size;
+ unsigned long num_words;
+ unsigned long *p = sock->sbuf + header_size;
+
+ data_size = DRBD_SOCKET_BUFFER_SIZE - header_size;
+ num_words = min_t(size_t, data_size / sizeof(*p),
+ c->bm_words - c->word_offset);
+ len = num_words * sizeof(*p);
+ if (len)
+ drbd_bm_get_lel(device, c->word_offset, num_words, p);
+ err = __send_command(first_peer_device(device)->connection, device->vnr, sock, P_BITMAP, len, NULL, 0);
+ c->word_offset += num_words;
+ c->bit_offset = c->word_offset * BITS_PER_LONG;
+
+ c->packets[1]++;
+ c->bytes[1] += header_size + len;
+
+ if (c->bit_offset > c->bm_bits)
+ c->bit_offset = c->bm_bits;
+ }
+ if (!err) {
+ if (len == 0) {
+ INFO_bm_xfer_stats(device, "send", c);
+ return 0;
+ } else
+ return 1;
+ }
+ return -EIO;
+}
+
+/* See the comment at receive_bitmap() */
+static int _drbd_send_bitmap(struct drbd_device *device)
+{
+ struct bm_xfer_ctx c;
+ int err;
+
+ if (!expect(device->bitmap))
+ return false;
+
+ if (get_ldev(device)) {
+ if (drbd_md_test_flag(device->ldev, MDF_FULL_SYNC)) {
+ drbd_info(device, "Writing the whole bitmap, MDF_FullSync was set.\n");
+ drbd_bm_set_all(device);
+ if (drbd_bm_write(device)) {
+ /* write_bm did fail! Leave full sync flag set in Meta P_DATA
+ * but otherwise process as per normal - need to tell other
+ * side that a full resync is required! */
+ drbd_err(device, "Failed to write bitmap to disk!\n");
+ } else {
+ drbd_md_clear_flag(device, MDF_FULL_SYNC);
+ drbd_md_sync(device);
+ }
+ }
+ put_ldev(device);
+ }
+
+ c = (struct bm_xfer_ctx) {
+ .bm_bits = drbd_bm_bits(device),
+ .bm_words = drbd_bm_words(device),
+ };
+
+ do {
+ err = send_bitmap_rle_or_plain(device, &c);
+ } while (err > 0);
+
+ return err == 0;
+}
+
+int drbd_send_bitmap(struct drbd_device *device)
+{
+ struct drbd_socket *sock = &first_peer_device(device)->connection->data;
+ int err = -1;
+
+ mutex_lock(&sock->mutex);
+ if (sock->socket)
+ err = !_drbd_send_bitmap(device);
+ mutex_unlock(&sock->mutex);
+ return err;
+}
+
+void drbd_send_b_ack(struct drbd_connection *connection, u32 barrier_nr, u32 set_size)
+{
+ struct drbd_socket *sock;
+ struct p_barrier_ack *p;
+
+ if (connection->cstate < C_WF_REPORT_PARAMS)
+ return;
+
+ sock = &connection->meta;
+ p = conn_prepare_command(connection, sock);
+ if (!p)
+ return;
+ p->barrier = barrier_nr;
+ p->set_size = cpu_to_be32(set_size);
+ conn_send_command(connection, sock, P_BARRIER_ACK, sizeof(*p), NULL, 0);
+}
+
+/**
+ * _drbd_send_ack() - Sends an ack packet
+ * @device: DRBD device.
+ * @cmd: Packet command code.
+ * @sector: sector, needs to be in big endian byte order
+ * @blksize: size in byte, needs to be in big endian byte order
+ * @block_id: Id, big endian byte order
+ */
+static int _drbd_send_ack(struct drbd_peer_device *peer_device, enum drbd_packet cmd,
+ u64 sector, u32 blksize, u64 block_id)
+{
+ struct drbd_socket *sock;
+ struct p_block_ack *p;
+
+ if (peer_device->device->state.conn < C_CONNECTED)
+ return -EIO;
+
+ sock = &peer_device->connection->meta;
+ p = drbd_prepare_command(peer_device, sock);
+ if (!p)
+ return -EIO;
+ p->sector = sector;
+ p->block_id = block_id;
+ p->blksize = blksize;
+ p->seq_num = cpu_to_be32(atomic_inc_return(&peer_device->device->packet_seq));
+ return drbd_send_command(peer_device, sock, cmd, sizeof(*p), NULL, 0);
+}
+
+/* dp->sector and dp->block_id already/still in network byte order,
+ * data_size is payload size according to dp->head,
+ * and may need to be corrected for digest size. */
+void drbd_send_ack_dp(struct drbd_peer_device *peer_device, enum drbd_packet cmd,
+ struct p_data *dp, int data_size)
+{
+ if (peer_device->connection->peer_integrity_tfm)
+ data_size -= crypto_hash_digestsize(peer_device->connection->peer_integrity_tfm);
+ _drbd_send_ack(peer_device, cmd, dp->sector, cpu_to_be32(data_size),
+ dp->block_id);
+}
+
+void drbd_send_ack_rp(struct drbd_peer_device *peer_device, enum drbd_packet cmd,
+ struct p_block_req *rp)
+{
+ _drbd_send_ack(peer_device, cmd, rp->sector, rp->blksize, rp->block_id);
+}
+
+/**
+ * drbd_send_ack() - Sends an ack packet
+ * @device: DRBD device
+ * @cmd: packet command code
+ * @peer_req: peer request
+ */
+int drbd_send_ack(struct drbd_peer_device *peer_device, enum drbd_packet cmd,
+ struct drbd_peer_request *peer_req)
+{
+ return _drbd_send_ack(peer_device, cmd,
+ cpu_to_be64(peer_req->i.sector),
+ cpu_to_be32(peer_req->i.size),
+ peer_req->block_id);
+}
+
+/* This function misuses the block_id field to signal if the blocks
+ * are is sync or not. */
+int drbd_send_ack_ex(struct drbd_peer_device *peer_device, enum drbd_packet cmd,
+ sector_t sector, int blksize, u64 block_id)
+{
+ return _drbd_send_ack(peer_device, cmd,
+ cpu_to_be64(sector),
+ cpu_to_be32(blksize),
+ cpu_to_be64(block_id));
+}
+
+int drbd_send_drequest(struct drbd_peer_device *peer_device, int cmd,
+ sector_t sector, int size, u64 block_id)
+{
+ struct drbd_socket *sock;
+ struct p_block_req *p;
+
+ sock = &peer_device->connection->data;
+ p = drbd_prepare_command(peer_device, sock);
+ if (!p)
+ return -EIO;
+ p->sector = cpu_to_be64(sector);
+ p->block_id = block_id;
+ p->blksize = cpu_to_be32(size);
+ return drbd_send_command(peer_device, sock, cmd, sizeof(*p), NULL, 0);
+}
+
+int drbd_send_drequest_csum(struct drbd_peer_device *peer_device, sector_t sector, int size,
+ void *digest, int digest_size, enum drbd_packet cmd)
+{
+ struct drbd_socket *sock;
+ struct p_block_req *p;
+
+ /* FIXME: Put the digest into the preallocated socket buffer. */
+
+ sock = &peer_device->connection->data;
+ p = drbd_prepare_command(peer_device, sock);
+ if (!p)
+ return -EIO;
+ p->sector = cpu_to_be64(sector);
+ p->block_id = ID_SYNCER /* unused */;
+ p->blksize = cpu_to_be32(size);
+ return drbd_send_command(peer_device, sock, cmd, sizeof(*p), digest, digest_size);
+}
+
+int drbd_send_ov_request(struct drbd_peer_device *peer_device, sector_t sector, int size)
+{
+ struct drbd_socket *sock;
+ struct p_block_req *p;
+
+ sock = &peer_device->connection->data;
+ p = drbd_prepare_command(peer_device, sock);
+ if (!p)
+ return -EIO;
+ p->sector = cpu_to_be64(sector);
+ p->block_id = ID_SYNCER /* unused */;
+ p->blksize = cpu_to_be32(size);
+ return drbd_send_command(peer_device, sock, P_OV_REQUEST, sizeof(*p), NULL, 0);
+}
+
+/* called on sndtimeo
+ * returns false if we should retry,
+ * true if we think connection is dead
+ */
+static int we_should_drop_the_connection(struct drbd_connection *connection, struct socket *sock)
+{
+ int drop_it;
+ /* long elapsed = (long)(jiffies - device->last_received); */
+
+ drop_it = connection->meta.socket == sock
+ || !connection->asender.task
+ || get_t_state(&connection->asender) != RUNNING
+ || connection->cstate < C_WF_REPORT_PARAMS;
+
+ if (drop_it)
+ return true;
+
+ drop_it = !--connection->ko_count;
+ if (!drop_it) {
+ drbd_err(connection, "[%s/%d] sock_sendmsg time expired, ko = %u\n",
+ current->comm, current->pid, connection->ko_count);
+ request_ping(connection);
+ }
+
+ return drop_it; /* && (device->state == R_PRIMARY) */;
+}
+
+static void drbd_update_congested(struct drbd_connection *connection)
+{
+ struct sock *sk = connection->data.socket->sk;
+ if (sk->sk_wmem_queued > sk->sk_sndbuf * 4 / 5)
+ set_bit(NET_CONGESTED, &connection->flags);
+}
+
+/* The idea of sendpage seems to be to put some kind of reference
+ * to the page into the skb, and to hand it over to the NIC. In
+ * this process get_page() gets called.
+ *
+ * As soon as the page was really sent over the network put_page()
+ * gets called by some part of the network layer. [ NIC driver? ]
+ *
+ * [ get_page() / put_page() increment/decrement the count. If count
+ * reaches 0 the page will be freed. ]
+ *
+ * This works nicely with pages from FSs.
+ * But this means that in protocol A we might signal IO completion too early!
+ *
+ * In order not to corrupt data during a resync we must make sure
+ * that we do not reuse our own buffer pages (EEs) to early, therefore
+ * we have the net_ee list.
+ *
+ * XFS seems to have problems, still, it submits pages with page_count == 0!
+ * As a workaround, we disable sendpage on pages
+ * with page_count == 0 or PageSlab.
+ */
+static int _drbd_no_send_page(struct drbd_peer_device *peer_device, struct page *page,
+ int offset, size_t size, unsigned msg_flags)
+{
+ struct socket *socket;
+ void *addr;
+ int err;
+
+ socket = peer_device->connection->data.socket;
+ addr = kmap(page) + offset;
+ err = drbd_send_all(peer_device->connection, socket, addr, size, msg_flags);
+ kunmap(page);
+ if (!err)
+ peer_device->device->send_cnt += size >> 9;
+ return err;
+}
+
+static int _drbd_send_page(struct drbd_peer_device *peer_device, struct page *page,
+ int offset, size_t size, unsigned msg_flags)
+{
+ struct socket *socket = peer_device->connection->data.socket;
+ mm_segment_t oldfs = get_fs();
+ int len = size;
+ int err = -EIO;
+
+ /* e.g. XFS meta- & log-data is in slab pages, which have a
+ * page_count of 0 and/or have PageSlab() set.
+ * we cannot use send_page for those, as that does get_page();
+ * put_page(); and would cause either a VM_BUG directly, or
+ * __page_cache_release a page that would actually still be referenced
+ * by someone, leading to some obscure delayed Oops somewhere else. */
+ if (disable_sendpage || (page_count(page) < 1) || PageSlab(page))
+ return _drbd_no_send_page(peer_device, page, offset, size, msg_flags);
+
+ msg_flags |= MSG_NOSIGNAL;
+ drbd_update_congested(peer_device->connection);
+ set_fs(KERNEL_DS);
+ do {
+ int sent;
+
+ sent = socket->ops->sendpage(socket, page, offset, len, msg_flags);
+ if (sent <= 0) {
+ if (sent == -EAGAIN) {
+ if (we_should_drop_the_connection(peer_device->connection, socket))
+ break;
+ continue;
+ }
+ drbd_warn(peer_device->device, "%s: size=%d len=%d sent=%d\n",
+ __func__, (int)size, len, sent);
+ if (sent < 0)
+ err = sent;
+ break;
+ }
+ len -= sent;
+ offset += sent;
+ } while (len > 0 /* THINK && device->cstate >= C_CONNECTED*/);
+ set_fs(oldfs);
+ clear_bit(NET_CONGESTED, &peer_device->connection->flags);
+
+ if (len == 0) {
+ err = 0;
+ peer_device->device->send_cnt += size >> 9;
+ }
+ return err;
+}
+
+static int _drbd_send_bio(struct drbd_peer_device *peer_device, struct bio *bio)
+{
+ struct bio_vec bvec;
+ struct bvec_iter iter;
+
+ /* hint all but last page with MSG_MORE */
+ bio_for_each_segment(bvec, bio, iter) {
+ int err;
+
+ err = _drbd_no_send_page(peer_device, bvec.bv_page,
+ bvec.bv_offset, bvec.bv_len,
+ bio_iter_last(bvec, iter)
+ ? 0 : MSG_MORE);
+ if (err)
+ return err;
+ }
+ return 0;
+}
+
+static int _drbd_send_zc_bio(struct drbd_peer_device *peer_device, struct bio *bio)
+{
+ struct bio_vec bvec;
+ struct bvec_iter iter;
+
+ /* hint all but last page with MSG_MORE */
+ bio_for_each_segment(bvec, bio, iter) {
+ int err;
+
+ err = _drbd_send_page(peer_device, bvec.bv_page,
+ bvec.bv_offset, bvec.bv_len,
+ bio_iter_last(bvec, iter) ? 0 : MSG_MORE);
+ if (err)
+ return err;
+ }
+ return 0;
+}
+
+static int _drbd_send_zc_ee(struct drbd_peer_device *peer_device,
+ struct drbd_peer_request *peer_req)
+{
+ struct page *page = peer_req->pages;
+ unsigned len = peer_req->i.size;
+ int err;
+
+ /* hint all but last page with MSG_MORE */
+ page_chain_for_each(page) {
+ unsigned l = min_t(unsigned, len, PAGE_SIZE);
+
+ err = _drbd_send_page(peer_device, page, 0, l,
+ page_chain_next(page) ? MSG_MORE : 0);
+ if (err)
+ return err;
+ len -= l;
+ }
+ return 0;
+}
+
+static u32 bio_flags_to_wire(struct drbd_connection *connection, unsigned long bi_rw)
+{
+ if (connection->agreed_pro_version >= 95)
+ return (bi_rw & REQ_SYNC ? DP_RW_SYNC : 0) |
+ (bi_rw & REQ_FUA ? DP_FUA : 0) |
+ (bi_rw & REQ_FLUSH ? DP_FLUSH : 0) |
+ (bi_rw & REQ_DISCARD ? DP_DISCARD : 0);
+ else
+ return bi_rw & REQ_SYNC ? DP_RW_SYNC : 0;
+}
+
+/* Used to send write or TRIM aka REQ_DISCARD requests
+ * R_PRIMARY -> Peer (P_DATA, P_TRIM)
+ */
+int drbd_send_dblock(struct drbd_peer_device *peer_device, struct drbd_request *req)
+{
+ struct drbd_device *device = peer_device->device;
+ struct drbd_socket *sock;
+ struct p_data *p;
+ unsigned int dp_flags = 0;
+ int digest_size;
+ int err;
+
+ sock = &peer_device->connection->data;
+ p = drbd_prepare_command(peer_device, sock);
+ digest_size = peer_device->connection->integrity_tfm ?
+ crypto_hash_digestsize(peer_device->connection->integrity_tfm) : 0;
+
+ if (!p)
+ return -EIO;
+ p->sector = cpu_to_be64(req->i.sector);
+ p->block_id = (unsigned long)req;
+ p->seq_num = cpu_to_be32(atomic_inc_return(&device->packet_seq));
+ dp_flags = bio_flags_to_wire(peer_device->connection, req->master_bio->bi_rw);
+ if (device->state.conn >= C_SYNC_SOURCE &&
+ device->state.conn <= C_PAUSED_SYNC_T)
+ dp_flags |= DP_MAY_SET_IN_SYNC;
+ if (peer_device->connection->agreed_pro_version >= 100) {
+ if (req->rq_state & RQ_EXP_RECEIVE_ACK)
+ dp_flags |= DP_SEND_RECEIVE_ACK;
+ /* During resync, request an explicit write ack,
+ * even in protocol != C */
+ if (req->rq_state & RQ_EXP_WRITE_ACK
+ || (dp_flags & DP_MAY_SET_IN_SYNC))
+ dp_flags |= DP_SEND_WRITE_ACK;
+ }
+ p->dp_flags = cpu_to_be32(dp_flags);
+
+ if (dp_flags & DP_DISCARD) {
+ struct p_trim *t = (struct p_trim*)p;
+ t->size = cpu_to_be32(req->i.size);
+ err = __send_command(peer_device->connection, device->vnr, sock, P_TRIM, sizeof(*t), NULL, 0);
+ goto out;
+ }
+
+ /* our digest is still only over the payload.
+ * TRIM does not carry any payload. */
+ if (digest_size)
+ drbd_csum_bio(peer_device->connection->integrity_tfm, req->master_bio, p + 1);
+ err = __send_command(peer_device->connection, device->vnr, sock, P_DATA, sizeof(*p) + digest_size, NULL, req->i.size);
+ if (!err) {
+ /* For protocol A, we have to memcpy the payload into
+ * socket buffers, as we may complete right away
+ * as soon as we handed it over to tcp, at which point the data
+ * pages may become invalid.
+ *
+ * For data-integrity enabled, we copy it as well, so we can be
+ * sure that even if the bio pages may still be modified, it
+ * won't change the data on the wire, thus if the digest checks
+ * out ok after sending on this side, but does not fit on the
+ * receiving side, we sure have detected corruption elsewhere.
+ */
+ if (!(req->rq_state & (RQ_EXP_RECEIVE_ACK | RQ_EXP_WRITE_ACK)) || digest_size)
+ err = _drbd_send_bio(peer_device, req->master_bio);
+ else
+ err = _drbd_send_zc_bio(peer_device, req->master_bio);
+
+ /* double check digest, sometimes buffers have been modified in flight. */
+ if (digest_size > 0 && digest_size <= 64) {
+ /* 64 byte, 512 bit, is the largest digest size
+ * currently supported in kernel crypto. */
+ unsigned char digest[64];
+ drbd_csum_bio(peer_device->connection->integrity_tfm, req->master_bio, digest);
+ if (memcmp(p + 1, digest, digest_size)) {
+ drbd_warn(device,
+ "Digest mismatch, buffer modified by upper layers during write: %llus +%u\n",
+ (unsigned long long)req->i.sector, req->i.size);
+ }
+ } /* else if (digest_size > 64) {
+ ... Be noisy about digest too large ...
+ } */
+ }
+out:
+ mutex_unlock(&sock->mutex); /* locked by drbd_prepare_command() */
+
+ return err;
+}
+
+/* answer packet, used to send data back for read requests:
+ * Peer -> (diskless) R_PRIMARY (P_DATA_REPLY)
+ * C_SYNC_SOURCE -> C_SYNC_TARGET (P_RS_DATA_REPLY)
+ */
+int drbd_send_block(struct drbd_peer_device *peer_device, enum drbd_packet cmd,
+ struct drbd_peer_request *peer_req)
+{
+ struct drbd_device *device = peer_device->device;
+ struct drbd_socket *sock;
+ struct p_data *p;
+ int err;
+ int digest_size;
+
+ sock = &peer_device->connection->data;
+ p = drbd_prepare_command(peer_device, sock);
+
+ digest_size = peer_device->connection->integrity_tfm ?
+ crypto_hash_digestsize(peer_device->connection->integrity_tfm) : 0;
+
+ if (!p)
+ return -EIO;
+ p->sector = cpu_to_be64(peer_req->i.sector);
+ p->block_id = peer_req->block_id;
+ p->seq_num = 0; /* unused */
+ p->dp_flags = 0;
+ if (digest_size)
+ drbd_csum_ee(peer_device->connection->integrity_tfm, peer_req, p + 1);
+ err = __send_command(peer_device->connection, device->vnr, sock, cmd, sizeof(*p) + digest_size, NULL, peer_req->i.size);
+ if (!err)
+ err = _drbd_send_zc_ee(peer_device, peer_req);
+ mutex_unlock(&sock->mutex); /* locked by drbd_prepare_command() */
+
+ return err;
+}
+
+int drbd_send_out_of_sync(struct drbd_peer_device *peer_device, struct drbd_request *req)
+{
+ struct drbd_socket *sock;
+ struct p_block_desc *p;
+
+ sock = &peer_device->connection->data;
+ p = drbd_prepare_command(peer_device, sock);
+ if (!p)
+ return -EIO;
+ p->sector = cpu_to_be64(req->i.sector);
+ p->blksize = cpu_to_be32(req->i.size);
+ return drbd_send_command(peer_device, sock, P_OUT_OF_SYNC, sizeof(*p), NULL, 0);
+}
+
+/*
+ drbd_send distinguishes two cases:
+
+ Packets sent via the data socket "sock"
+ and packets sent via the meta data socket "msock"
+
+ sock msock
+ -----------------+-------------------------+------------------------------
+ timeout conf.timeout / 2 conf.timeout / 2
+ timeout action send a ping via msock Abort communication
+ and close all sockets
+*/
+
+/*
+ * you must have down()ed the appropriate [m]sock_mutex elsewhere!
+ */
+int drbd_send(struct drbd_connection *connection, struct socket *sock,
+ void *buf, size_t size, unsigned msg_flags)
+{
+ struct kvec iov;
+ struct msghdr msg;
+ int rv, sent = 0;
+
+ if (!sock)
+ return -EBADR;
+
+ /* THINK if (signal_pending) return ... ? */
+
+ iov.iov_base = buf;
+ iov.iov_len = size;
+
+ msg.msg_name = NULL;
+ msg.msg_namelen = 0;
+ msg.msg_control = NULL;
+ msg.msg_controllen = 0;
+ msg.msg_flags = msg_flags | MSG_NOSIGNAL;
+
+ if (sock == connection->data.socket) {
+ rcu_read_lock();
+ connection->ko_count = rcu_dereference(connection->net_conf)->ko_count;
+ rcu_read_unlock();
+ drbd_update_congested(connection);
+ }
+ do {
+ /* STRANGE
+ * tcp_sendmsg does _not_ use its size parameter at all ?
+ *
+ * -EAGAIN on timeout, -EINTR on signal.
+ */
+/* THINK
+ * do we need to block DRBD_SIG if sock == &meta.socket ??
+ * otherwise wake_asender() might interrupt some send_*Ack !
+ */
+ rv = kernel_sendmsg(sock, &msg, &iov, 1, size);
+ if (rv == -EAGAIN) {
+ if (we_should_drop_the_connection(connection, sock))
+ break;
+ else
+ continue;
+ }
+ if (rv == -EINTR) {
+ flush_signals(current);
+ rv = 0;
+ }
+ if (rv < 0)
+ break;
+ sent += rv;
+ iov.iov_base += rv;
+ iov.iov_len -= rv;
+ } while (sent < size);
+
+ if (sock == connection->data.socket)
+ clear_bit(NET_CONGESTED, &connection->flags);
+
+ if (rv <= 0) {
+ if (rv != -EAGAIN) {
+ drbd_err(connection, "%s_sendmsg returned %d\n",
+ sock == connection->meta.socket ? "msock" : "sock",
+ rv);
+ conn_request_state(connection, NS(conn, C_BROKEN_PIPE), CS_HARD);
+ } else
+ conn_request_state(connection, NS(conn, C_TIMEOUT), CS_HARD);
+ }
+
+ return sent;
+}
+
+/**
+ * drbd_send_all - Send an entire buffer
+ *
+ * Returns 0 upon success and a negative error value otherwise.
+ */
+int drbd_send_all(struct drbd_connection *connection, struct socket *sock, void *buffer,
+ size_t size, unsigned msg_flags)
+{
+ int err;
+
+ err = drbd_send(connection, sock, buffer, size, msg_flags);
+ if (err < 0)
+ return err;
+ if (err != size)
+ return -EIO;
+ return 0;
+}
+
+static int drbd_open(struct block_device *bdev, fmode_t mode)
+{
+ struct drbd_device *device = bdev->bd_disk->private_data;
+ unsigned long flags;
+ int rv = 0;
+
+ mutex_lock(&drbd_main_mutex);
+ spin_lock_irqsave(&device->resource->req_lock, flags);
+ /* to have a stable device->state.role
+ * and no race with updating open_cnt */
+
+ if (device->state.role != R_PRIMARY) {
+ if (mode & FMODE_WRITE)
+ rv = -EROFS;
+ else if (!allow_oos)
+ rv = -EMEDIUMTYPE;
+ }
+
+ if (!rv)
+ device->open_cnt++;
+ spin_unlock_irqrestore(&device->resource->req_lock, flags);
+ mutex_unlock(&drbd_main_mutex);
+
+ return rv;
+}
+
+static void drbd_release(struct gendisk *gd, fmode_t mode)
+{
+ struct drbd_device *device = gd->private_data;
+ mutex_lock(&drbd_main_mutex);
+ device->open_cnt--;
+ mutex_unlock(&drbd_main_mutex);
+}
+
+static void drbd_set_defaults(struct drbd_device *device)
+{
+ /* Beware! The actual layout differs
+ * between big endian and little endian */
+ device->state = (union drbd_dev_state) {
+ { .role = R_SECONDARY,
+ .peer = R_UNKNOWN,
+ .conn = C_STANDALONE,
+ .disk = D_DISKLESS,
+ .pdsk = D_UNKNOWN,
+ } };
+}
+
+void drbd_init_set_defaults(struct drbd_device *device)
+{
+ /* the memset(,0,) did most of this.
+ * note: only assignments, no allocation in here */
+
+ drbd_set_defaults(device);
+
+ atomic_set(&device->ap_bio_cnt, 0);
+ atomic_set(&device->ap_actlog_cnt, 0);
+ atomic_set(&device->ap_pending_cnt, 0);
+ atomic_set(&device->rs_pending_cnt, 0);
+ atomic_set(&device->unacked_cnt, 0);
+ atomic_set(&device->local_cnt, 0);
+ atomic_set(&device->pp_in_use_by_net, 0);
+ atomic_set(&device->rs_sect_in, 0);
+ atomic_set(&device->rs_sect_ev, 0);
+ atomic_set(&device->ap_in_flight, 0);
+ atomic_set(&device->md_io.in_use, 0);
+
+ mutex_init(&device->own_state_mutex);
+ device->state_mutex = &device->own_state_mutex;
+
+ spin_lock_init(&device->al_lock);
+ spin_lock_init(&device->peer_seq_lock);
+
+ INIT_LIST_HEAD(&device->active_ee);
+ INIT_LIST_HEAD(&device->sync_ee);
+ INIT_LIST_HEAD(&device->done_ee);
+ INIT_LIST_HEAD(&device->read_ee);
+ INIT_LIST_HEAD(&device->net_ee);
+ INIT_LIST_HEAD(&device->resync_reads);
+ INIT_LIST_HEAD(&device->resync_work.list);
+ INIT_LIST_HEAD(&device->unplug_work.list);
+ INIT_LIST_HEAD(&device->bm_io_work.w.list);
+ INIT_LIST_HEAD(&device->pending_master_completion[0]);
+ INIT_LIST_HEAD(&device->pending_master_completion[1]);
+ INIT_LIST_HEAD(&device->pending_completion[0]);
+ INIT_LIST_HEAD(&device->pending_completion[1]);
+
+ device->resync_work.cb = w_resync_timer;
+ device->unplug_work.cb = w_send_write_hint;
+ device->bm_io_work.w.cb = w_bitmap_io;
+
+ init_timer(&device->resync_timer);
+ init_timer(&device->md_sync_timer);
+ init_timer(&device->start_resync_timer);
+ init_timer(&device->request_timer);
+ device->resync_timer.function = resync_timer_fn;
+ device->resync_timer.data = (unsigned long) device;
+ device->md_sync_timer.function = md_sync_timer_fn;
+ device->md_sync_timer.data = (unsigned long) device;
+ device->start_resync_timer.function = start_resync_timer_fn;
+ device->start_resync_timer.data = (unsigned long) device;
+ device->request_timer.function = request_timer_fn;
+ device->request_timer.data = (unsigned long) device;
+
+ init_waitqueue_head(&device->misc_wait);
+ init_waitqueue_head(&device->state_wait);
+ init_waitqueue_head(&device->ee_wait);
+ init_waitqueue_head(&device->al_wait);
+ init_waitqueue_head(&device->seq_wait);
+
+ device->resync_wenr = LC_FREE;
+ device->peer_max_bio_size = DRBD_MAX_BIO_SIZE_SAFE;
+ device->local_max_bio_size = DRBD_MAX_BIO_SIZE_SAFE;
+}
+
+void drbd_device_cleanup(struct drbd_device *device)
+{
+ int i;
+ if (first_peer_device(device)->connection->receiver.t_state != NONE)
+ drbd_err(device, "ASSERT FAILED: receiver t_state == %d expected 0.\n",
+ first_peer_device(device)->connection->receiver.t_state);
+
+ device->al_writ_cnt =
+ device->bm_writ_cnt =
+ device->read_cnt =
+ device->recv_cnt =
+ device->send_cnt =
+ device->writ_cnt =
+ device->p_size =
+ device->rs_start =
+ device->rs_total =
+ device->rs_failed = 0;
+ device->rs_last_events = 0;
+ device->rs_last_sect_ev = 0;
+ for (i = 0; i < DRBD_SYNC_MARKS; i++) {
+ device->rs_mark_left[i] = 0;
+ device->rs_mark_time[i] = 0;
+ }
+ D_ASSERT(device, first_peer_device(device)->connection->net_conf == NULL);
+
+ drbd_set_my_capacity(device, 0);
+ if (device->bitmap) {
+ /* maybe never allocated. */
+ drbd_bm_resize(device, 0, 1);
+ drbd_bm_cleanup(device);
+ }
+
+ drbd_free_ldev(device->ldev);
+ device->ldev = NULL;
+
+ clear_bit(AL_SUSPENDED, &device->flags);
+
+ D_ASSERT(device, list_empty(&device->active_ee));
+ D_ASSERT(device, list_empty(&device->sync_ee));
+ D_ASSERT(device, list_empty(&device->done_ee));
+ D_ASSERT(device, list_empty(&device->read_ee));
+ D_ASSERT(device, list_empty(&device->net_ee));
+ D_ASSERT(device, list_empty(&device->resync_reads));
+ D_ASSERT(device, list_empty(&first_peer_device(device)->connection->sender_work.q));
+ D_ASSERT(device, list_empty(&device->resync_work.list));
+ D_ASSERT(device, list_empty(&device->unplug_work.list));
+
+ drbd_set_defaults(device);
+}
+
+
+static void drbd_destroy_mempools(void)
+{
+ struct page *page;
+
+ while (drbd_pp_pool) {
+ page = drbd_pp_pool;
+ drbd_pp_pool = (struct page *)page_private(page);
+ __free_page(page);
+ drbd_pp_vacant--;
+ }
+
+ /* D_ASSERT(device, atomic_read(&drbd_pp_vacant)==0); */
+
+ if (drbd_md_io_bio_set)
+ bioset_free(drbd_md_io_bio_set);
+ if (drbd_md_io_page_pool)
+ mempool_destroy(drbd_md_io_page_pool);
+ if (drbd_ee_mempool)
+ mempool_destroy(drbd_ee_mempool);
+ if (drbd_request_mempool)
+ mempool_destroy(drbd_request_mempool);
+ if (drbd_ee_cache)
+ kmem_cache_destroy(drbd_ee_cache);
+ if (drbd_request_cache)
+ kmem_cache_destroy(drbd_request_cache);
+ if (drbd_bm_ext_cache)
+ kmem_cache_destroy(drbd_bm_ext_cache);
+ if (drbd_al_ext_cache)
+ kmem_cache_destroy(drbd_al_ext_cache);
+
+ drbd_md_io_bio_set = NULL;
+ drbd_md_io_page_pool = NULL;
+ drbd_ee_mempool = NULL;
+ drbd_request_mempool = NULL;
+ drbd_ee_cache = NULL;
+ drbd_request_cache = NULL;
+ drbd_bm_ext_cache = NULL;
+ drbd_al_ext_cache = NULL;
+
+ return;
+}
+
+static int drbd_create_mempools(void)
+{
+ struct page *page;
+ const int number = (DRBD_MAX_BIO_SIZE/PAGE_SIZE) * minor_count;
+ int i;
+
+ /* prepare our caches and mempools */
+ drbd_request_mempool = NULL;
+ drbd_ee_cache = NULL;
+ drbd_request_cache = NULL;
+ drbd_bm_ext_cache = NULL;
+ drbd_al_ext_cache = NULL;
+ drbd_pp_pool = NULL;
+ drbd_md_io_page_pool = NULL;
+ drbd_md_io_bio_set = NULL;
+
+ /* caches */
+ drbd_request_cache = kmem_cache_create(
+ "drbd_req", sizeof(struct drbd_request), 0, 0, NULL);
+ if (drbd_request_cache == NULL)
+ goto Enomem;
+
+ drbd_ee_cache = kmem_cache_create(
+ "drbd_ee", sizeof(struct drbd_peer_request), 0, 0, NULL);
+ if (drbd_ee_cache == NULL)
+ goto Enomem;
+
+ drbd_bm_ext_cache = kmem_cache_create(
+ "drbd_bm", sizeof(struct bm_extent), 0, 0, NULL);
+ if (drbd_bm_ext_cache == NULL)
+ goto Enomem;
+
+ drbd_al_ext_cache = kmem_cache_create(
+ "drbd_al", sizeof(struct lc_element), 0, 0, NULL);
+ if (drbd_al_ext_cache == NULL)
+ goto Enomem;
+
+ /* mempools */
+ drbd_md_io_bio_set = bioset_create(DRBD_MIN_POOL_PAGES, 0);
+ if (drbd_md_io_bio_set == NULL)
+ goto Enomem;
+
+ drbd_md_io_page_pool = mempool_create_page_pool(DRBD_MIN_POOL_PAGES, 0);
+ if (drbd_md_io_page_pool == NULL)
+ goto Enomem;
+
+ drbd_request_mempool = mempool_create_slab_pool(number,
+ drbd_request_cache);
+ if (drbd_request_mempool == NULL)
+ goto Enomem;
+
+ drbd_ee_mempool = mempool_create_slab_pool(number, drbd_ee_cache);
+ if (drbd_ee_mempool == NULL)
+ goto Enomem;
+
+ /* drbd's page pool */
+ spin_lock_init(&drbd_pp_lock);
+
+ for (i = 0; i < number; i++) {
+ page = alloc_page(GFP_HIGHUSER);
+ if (!page)
+ goto Enomem;
+ set_page_private(page, (unsigned long)drbd_pp_pool);
+ drbd_pp_pool = page;
+ }
+ drbd_pp_vacant = number;
+
+ return 0;
+
+Enomem:
+ drbd_destroy_mempools(); /* in case we allocated some */
+ return -ENOMEM;
+}
+
+static void drbd_release_all_peer_reqs(struct drbd_device *device)
+{
+ int rr;
+
+ rr = drbd_free_peer_reqs(device, &device->active_ee);
+ if (rr)
+ drbd_err(device, "%d EEs in active list found!\n", rr);
+
+ rr = drbd_free_peer_reqs(device, &device->sync_ee);
+ if (rr)
+ drbd_err(device, "%d EEs in sync list found!\n", rr);
+
+ rr = drbd_free_peer_reqs(device, &device->read_ee);
+ if (rr)
+ drbd_err(device, "%d EEs in read list found!\n", rr);
+
+ rr = drbd_free_peer_reqs(device, &device->done_ee);
+ if (rr)
+ drbd_err(device, "%d EEs in done list found!\n", rr);
+
+ rr = drbd_free_peer_reqs(device, &device->net_ee);
+ if (rr)
+ drbd_err(device, "%d EEs in net list found!\n", rr);
+}
+
+/* caution. no locking. */
+void drbd_destroy_device(struct kref *kref)
+{
+ struct drbd_device *device = container_of(kref, struct drbd_device, kref);
+ struct drbd_resource *resource = device->resource;
+ struct drbd_peer_device *peer_device, *tmp_peer_device;
+
+ del_timer_sync(&device->request_timer);
+
+ /* paranoia asserts */
+ D_ASSERT(device, device->open_cnt == 0);
+ /* end paranoia asserts */
+
+ /* cleanup stuff that may have been allocated during
+ * device (re-)configuration or state changes */
+
+ if (device->this_bdev)
+ bdput(device->this_bdev);
+
+ drbd_free_ldev(device->ldev);
+ device->ldev = NULL;
+
+ drbd_release_all_peer_reqs(device);
+
+ lc_destroy(device->act_log);
+ lc_destroy(device->resync);
+
+ kfree(device->p_uuid);
+ /* device->p_uuid = NULL; */
+
+ if (device->bitmap) /* should no longer be there. */
+ drbd_bm_cleanup(device);
+ __free_page(device->md_io.page);
+ put_disk(device->vdisk);
+ blk_cleanup_queue(device->rq_queue);
+ kfree(device->rs_plan_s);
+
+ /* not for_each_connection(connection, resource):
+ * those may have been cleaned up and disassociated already.
+ */
+ for_each_peer_device_safe(peer_device, tmp_peer_device, device) {
+ kref_put(&peer_device->connection->kref, drbd_destroy_connection);
+ kfree(peer_device);
+ }
+ memset(device, 0xfd, sizeof(*device));
+ kfree(device);
+ kref_put(&resource->kref, drbd_destroy_resource);
+}
+
+/* One global retry thread, if we need to push back some bio and have it
+ * reinserted through our make request function.
+ */
+static struct retry_worker {
+ struct workqueue_struct *wq;
+ struct work_struct worker;
+
+ spinlock_t lock;
+ struct list_head writes;
+} retry;
+
+static void do_retry(struct work_struct *ws)
+{
+ struct retry_worker *retry = container_of(ws, struct retry_worker, worker);
+ LIST_HEAD(writes);
+ struct drbd_request *req, *tmp;
+
+ spin_lock_irq(&retry->lock);
+ list_splice_init(&retry->writes, &writes);
+ spin_unlock_irq(&retry->lock);
+
+ list_for_each_entry_safe(req, tmp, &writes, tl_requests) {
+ struct drbd_device *device = req->device;
+ struct bio *bio = req->master_bio;
+ unsigned long start_jif = req->start_jif;
+ bool expected;
+
+ expected =
+ expect(atomic_read(&req->completion_ref) == 0) &&
+ expect(req->rq_state & RQ_POSTPONED) &&
+ expect((req->rq_state & RQ_LOCAL_PENDING) == 0 ||
+ (req->rq_state & RQ_LOCAL_ABORTED) != 0);
+
+ if (!expected)
+ drbd_err(device, "req=%p completion_ref=%d rq_state=%x\n",
+ req, atomic_read(&req->completion_ref),
+ req->rq_state);
+
+ /* We still need to put one kref associated with the
+ * "completion_ref" going zero in the code path that queued it
+ * here. The request object may still be referenced by a
+ * frozen local req->private_bio, in case we force-detached.
+ */
+ kref_put(&req->kref, drbd_req_destroy);
+
+ /* A single suspended or otherwise blocking device may stall
+ * all others as well. Fortunately, this code path is to
+ * recover from a situation that "should not happen":
+ * concurrent writes in multi-primary setup.
+ * In a "normal" lifecycle, this workqueue is supposed to be
+ * destroyed without ever doing anything.
+ * If it turns out to be an issue anyways, we can do per
+ * resource (replication group) or per device (minor) retry
+ * workqueues instead.
+ */
+
+ /* We are not just doing generic_make_request(),
+ * as we want to keep the start_time information. */
+ inc_ap_bio(device);
+ __drbd_make_request(device, bio, start_jif);
+ }
+}
+
+/* called via drbd_req_put_completion_ref(),
+ * holds resource->req_lock */
+void drbd_restart_request(struct drbd_request *req)
+{
+ unsigned long flags;
+ spin_lock_irqsave(&retry.lock, flags);
+ list_move_tail(&req->tl_requests, &retry.writes);
+ spin_unlock_irqrestore(&retry.lock, flags);
+
+ /* Drop the extra reference that would otherwise
+ * have been dropped by complete_master_bio.
+ * do_retry() needs to grab a new one. */
+ dec_ap_bio(req->device);
+
+ queue_work(retry.wq, &retry.worker);
+}
+
+void drbd_destroy_resource(struct kref *kref)
+{
+ struct drbd_resource *resource =
+ container_of(kref, struct drbd_resource, kref);
+
+ idr_destroy(&resource->devices);
+ free_cpumask_var(resource->cpu_mask);
+ kfree(resource->name);
+ memset(resource, 0xf2, sizeof(*resource));
+ kfree(resource);
+}
+
+void drbd_free_resource(struct drbd_resource *resource)
+{
+ struct drbd_connection *connection, *tmp;
+
+ for_each_connection_safe(connection, tmp, resource) {
+ list_del(&connection->connections);
+ drbd_debugfs_connection_cleanup(connection);
+ kref_put(&connection->kref, drbd_destroy_connection);
+ }
+ drbd_debugfs_resource_cleanup(resource);
+ kref_put(&resource->kref, drbd_destroy_resource);
+}
+
+static void drbd_cleanup(void)
+{
+ unsigned int i;
+ struct drbd_device *device;
+ struct drbd_resource *resource, *tmp;
+
+ /* first remove proc,
+ * drbdsetup uses it's presence to detect
+ * whether DRBD is loaded.
+ * If we would get stuck in proc removal,
+ * but have netlink already deregistered,
+ * some drbdsetup commands may wait forever
+ * for an answer.
+ */
+ if (drbd_proc)
+ remove_proc_entry("drbd", NULL);
+
+ if (retry.wq)
+ destroy_workqueue(retry.wq);
+
+ drbd_genl_unregister();
+ drbd_debugfs_cleanup();
+
+ idr_for_each_entry(&drbd_devices, device, i)
+ drbd_delete_device(device);
+
+ /* not _rcu since, no other updater anymore. Genl already unregistered */
+ for_each_resource_safe(resource, tmp, &drbd_resources) {
+ list_del(&resource->resources);
+ drbd_free_resource(resource);
+ }
+
+ drbd_destroy_mempools();
+ unregister_blkdev(DRBD_MAJOR, "drbd");
+
+ idr_destroy(&drbd_devices);
+
+ pr_info("module cleanup done.\n");
+}
+
+/**
+ * drbd_congested() - Callback for the flusher thread
+ * @congested_data: User data
+ * @bdi_bits: Bits the BDI flusher thread is currently interested in
+ *
+ * Returns 1<<BDI_async_congested and/or 1<<BDI_sync_congested if we are congested.
+ */
+static int drbd_congested(void *congested_data, int bdi_bits)
+{
+ struct drbd_device *device = congested_data;
+ struct request_queue *q;
+ char reason = '-';
+ int r = 0;
+
+ if (!may_inc_ap_bio(device)) {
+ /* DRBD has frozen IO */
+ r = bdi_bits;
+ reason = 'd';
+ goto out;
+ }
+
+ if (test_bit(CALLBACK_PENDING, &first_peer_device(device)->connection->flags)) {
+ r |= (1 << BDI_async_congested);
+ /* Without good local data, we would need to read from remote,
+ * and that would need the worker thread as well, which is
+ * currently blocked waiting for that usermode helper to
+ * finish.
+ */
+ if (!get_ldev_if_state(device, D_UP_TO_DATE))
+ r |= (1 << BDI_sync_congested);
+ else
+ put_ldev(device);
+ r &= bdi_bits;
+ reason = 'c';
+ goto out;
+ }
+
+ if (get_ldev(device)) {
+ q = bdev_get_queue(device->ldev->backing_bdev);
+ r = bdi_congested(&q->backing_dev_info, bdi_bits);
+ put_ldev(device);
+ if (r)
+ reason = 'b';
+ }
+
+ if (bdi_bits & (1 << BDI_async_congested) &&
+ test_bit(NET_CONGESTED, &first_peer_device(device)->connection->flags)) {
+ r |= (1 << BDI_async_congested);
+ reason = reason == 'b' ? 'a' : 'n';
+ }
+
+out:
+ device->congestion_reason = reason;
+ return r;
+}
+
+static void drbd_init_workqueue(struct drbd_work_queue* wq)
+{
+ spin_lock_init(&wq->q_lock);
+ INIT_LIST_HEAD(&wq->q);
+ init_waitqueue_head(&wq->q_wait);
+}
+
+struct completion_work {
+ struct drbd_work w;
+ struct completion done;
+};
+
+static int w_complete(struct drbd_work *w, int cancel)
+{
+ struct completion_work *completion_work =
+ container_of(w, struct completion_work, w);
+
+ complete(&completion_work->done);
+ return 0;
+}
+
+void drbd_flush_workqueue(struct drbd_work_queue *work_queue)
+{
+ struct completion_work completion_work;
+
+ completion_work.w.cb = w_complete;
+ init_completion(&completion_work.done);
+ drbd_queue_work(work_queue, &completion_work.w);
+ wait_for_completion(&completion_work.done);
+}
+
+struct drbd_resource *drbd_find_resource(const char *name)
+{
+ struct drbd_resource *resource;
+
+ if (!name || !name[0])
+ return NULL;
+
+ rcu_read_lock();
+ for_each_resource_rcu(resource, &drbd_resources) {
+ if (!strcmp(resource->name, name)) {
+ kref_get(&resource->kref);
+ goto found;
+ }
+ }
+ resource = NULL;
+found:
+ rcu_read_unlock();
+ return resource;
+}
+
+struct drbd_connection *conn_get_by_addrs(void *my_addr, int my_addr_len,
+ void *peer_addr, int peer_addr_len)
+{
+ struct drbd_resource *resource;
+ struct drbd_connection *connection;
+
+ rcu_read_lock();
+ for_each_resource_rcu(resource, &drbd_resources) {
+ for_each_connection_rcu(connection, resource) {
+ if (connection->my_addr_len == my_addr_len &&
+ connection->peer_addr_len == peer_addr_len &&
+ !memcmp(&connection->my_addr, my_addr, my_addr_len) &&
+ !memcmp(&connection->peer_addr, peer_addr, peer_addr_len)) {
+ kref_get(&connection->kref);
+ goto found;
+ }
+ }
+ }
+ connection = NULL;
+found:
+ rcu_read_unlock();
+ return connection;
+}
+
+static int drbd_alloc_socket(struct drbd_socket *socket)
+{
+ socket->rbuf = (void *) __get_free_page(GFP_KERNEL);
+ if (!socket->rbuf)
+ return -ENOMEM;
+ socket->sbuf = (void *) __get_free_page(GFP_KERNEL);
+ if (!socket->sbuf)
+ return -ENOMEM;
+ return 0;
+}
+
+static void drbd_free_socket(struct drbd_socket *socket)
+{
+ free_page((unsigned long) socket->sbuf);
+ free_page((unsigned long) socket->rbuf);
+}
+
+void conn_free_crypto(struct drbd_connection *connection)
+{
+ drbd_free_sock(connection);
+
+ crypto_free_hash(connection->csums_tfm);
+ crypto_free_hash(connection->verify_tfm);
+ crypto_free_hash(connection->cram_hmac_tfm);
+ crypto_free_hash(connection->integrity_tfm);
+ crypto_free_hash(connection->peer_integrity_tfm);
+ kfree(connection->int_dig_in);
+ kfree(connection->int_dig_vv);
+
+ connection->csums_tfm = NULL;
+ connection->verify_tfm = NULL;
+ connection->cram_hmac_tfm = NULL;
+ connection->integrity_tfm = NULL;
+ connection->peer_integrity_tfm = NULL;
+ connection->int_dig_in = NULL;
+ connection->int_dig_vv = NULL;
+}
+
+int set_resource_options(struct drbd_resource *resource, struct res_opts *res_opts)
+{
+ struct drbd_connection *connection;
+ cpumask_var_t new_cpu_mask;
+ int err;
+
+ if (!zalloc_cpumask_var(&new_cpu_mask, GFP_KERNEL))
+ return -ENOMEM;
+
+ /* silently ignore cpu mask on UP kernel */
+ if (nr_cpu_ids > 1 && res_opts->cpu_mask[0] != 0) {
+ err = bitmap_parse(res_opts->cpu_mask, DRBD_CPU_MASK_SIZE,
+ cpumask_bits(new_cpu_mask), nr_cpu_ids);
+ if (err == -EOVERFLOW) {
+ /* So what. mask it out. */
+ cpumask_var_t tmp_cpu_mask;
+ if (zalloc_cpumask_var(&tmp_cpu_mask, GFP_KERNEL)) {
+ cpumask_setall(tmp_cpu_mask);
+ cpumask_and(new_cpu_mask, new_cpu_mask, tmp_cpu_mask);
+ drbd_warn(resource, "Overflow in bitmap_parse(%.12s%s), truncating to %u bits\n",
+ res_opts->cpu_mask,
+ strlen(res_opts->cpu_mask) > 12 ? "..." : "",
+ nr_cpu_ids);
+ free_cpumask_var(tmp_cpu_mask);
+ err = 0;
+ }
+ }
+ if (err) {
+ drbd_warn(resource, "bitmap_parse() failed with %d\n", err);
+ /* retcode = ERR_CPU_MASK_PARSE; */
+ goto fail;
+ }
+ }
+ resource->res_opts = *res_opts;
+ if (cpumask_empty(new_cpu_mask))
+ drbd_calc_cpu_mask(&new_cpu_mask);
+ if (!cpumask_equal(resource->cpu_mask, new_cpu_mask)) {
+ cpumask_copy(resource->cpu_mask, new_cpu_mask);
+ for_each_connection_rcu(connection, resource) {
+ connection->receiver.reset_cpu_mask = 1;
+ connection->asender.reset_cpu_mask = 1;
+ connection->worker.reset_cpu_mask = 1;
+ }
+ }
+ err = 0;
+
+fail:
+ free_cpumask_var(new_cpu_mask);
+ return err;
+
+}
+
+struct drbd_resource *drbd_create_resource(const char *name)
+{
+ struct drbd_resource *resource;
+
+ resource = kzalloc(sizeof(struct drbd_resource), GFP_KERNEL);
+ if (!resource)
+ goto fail;
+ resource->name = kstrdup(name, GFP_KERNEL);
+ if (!resource->name)
+ goto fail_free_resource;
+ if (!zalloc_cpumask_var(&resource->cpu_mask, GFP_KERNEL))
+ goto fail_free_name;
+ kref_init(&resource->kref);
+ idr_init(&resource->devices);
+ INIT_LIST_HEAD(&resource->connections);
+ resource->write_ordering = WO_bdev_flush;
+ list_add_tail_rcu(&resource->resources, &drbd_resources);
+ mutex_init(&resource->conf_update);
+ mutex_init(&resource->adm_mutex);
+ spin_lock_init(&resource->req_lock);
+ drbd_debugfs_resource_add(resource);
+ return resource;
+
+fail_free_name:
+ kfree(resource->name);
+fail_free_resource:
+ kfree(resource);
+fail:
+ return NULL;
+}
+
+/* caller must be under adm_mutex */
+struct drbd_connection *conn_create(const char *name, struct res_opts *res_opts)
+{
+ struct drbd_resource *resource;
+ struct drbd_connection *connection;
+
+ connection = kzalloc(sizeof(struct drbd_connection), GFP_KERNEL);
+ if (!connection)
+ return NULL;
+
+ if (drbd_alloc_socket(&connection->data))
+ goto fail;
+ if (drbd_alloc_socket(&connection->meta))
+ goto fail;
+
+ connection->current_epoch = kzalloc(sizeof(struct drbd_epoch), GFP_KERNEL);
+ if (!connection->current_epoch)
+ goto fail;
+
+ INIT_LIST_HEAD(&connection->transfer_log);
+
+ INIT_LIST_HEAD(&connection->current_epoch->list);
+ connection->epochs = 1;
+ spin_lock_init(&connection->epoch_lock);
+
+ connection->send.seen_any_write_yet = false;
+ connection->send.current_epoch_nr = 0;
+ connection->send.current_epoch_writes = 0;
+
+ resource = drbd_create_resource(name);
+ if (!resource)
+ goto fail;
+
+ connection->cstate = C_STANDALONE;
+ mutex_init(&connection->cstate_mutex);
+ init_waitqueue_head(&connection->ping_wait);
+ idr_init(&connection->peer_devices);
+
+ drbd_init_workqueue(&connection->sender_work);
+ mutex_init(&connection->data.mutex);
+ mutex_init(&connection->meta.mutex);
+
+ drbd_thread_init(resource, &connection->receiver, drbd_receiver, "receiver");
+ connection->receiver.connection = connection;
+ drbd_thread_init(resource, &connection->worker, drbd_worker, "worker");
+ connection->worker.connection = connection;
+ drbd_thread_init(resource, &connection->asender, drbd_asender, "asender");
+ connection->asender.connection = connection;
+
+ kref_init(&connection->kref);
+
+ connection->resource = resource;
+
+ if (set_resource_options(resource, res_opts))
+ goto fail_resource;
+
+ kref_get(&resource->kref);
+ list_add_tail_rcu(&connection->connections, &resource->connections);
+ drbd_debugfs_connection_add(connection);
+ return connection;
+
+fail_resource:
+ list_del(&resource->resources);
+ drbd_free_resource(resource);
+fail:
+ kfree(connection->current_epoch);
+ drbd_free_socket(&connection->meta);
+ drbd_free_socket(&connection->data);
+ kfree(connection);
+ return NULL;
+}
+
+void drbd_destroy_connection(struct kref *kref)
+{
+ struct drbd_connection *connection = container_of(kref, struct drbd_connection, kref);
+ struct drbd_resource *resource = connection->resource;
+
+ if (atomic_read(&connection->current_epoch->epoch_size) != 0)
+ drbd_err(connection, "epoch_size:%d\n", atomic_read(&connection->current_epoch->epoch_size));
+ kfree(connection->current_epoch);
+
+ idr_destroy(&connection->peer_devices);
+
+ drbd_free_socket(&connection->meta);
+ drbd_free_socket(&connection->data);
+ kfree(connection->int_dig_in);
+ kfree(connection->int_dig_vv);
+ memset(connection, 0xfc, sizeof(*connection));
+ kfree(connection);
+ kref_put(&resource->kref, drbd_destroy_resource);
+}
+
+static int init_submitter(struct drbd_device *device)
+{
+ /* opencoded create_singlethread_workqueue(),
+ * to be able to say "drbd%d", ..., minor */
+ device->submit.wq = alloc_workqueue("drbd%u_submit",
+ WQ_UNBOUND | WQ_MEM_RECLAIM, 1, device->minor);
+ if (!device->submit.wq)
+ return -ENOMEM;
+
+ INIT_WORK(&device->submit.worker, do_submit);
+ INIT_LIST_HEAD(&device->submit.writes);
+ return 0;
+}
+
+enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsigned int minor)
+{
+ struct drbd_resource *resource = adm_ctx->resource;
+ struct drbd_connection *connection;
+ struct drbd_device *device;
+ struct drbd_peer_device *peer_device, *tmp_peer_device;
+ struct gendisk *disk;
+ struct request_queue *q;
+ int id;
+ int vnr = adm_ctx->volume;
+ enum drbd_ret_code err = ERR_NOMEM;
+
+ device = minor_to_device(minor);
+ if (device)
+ return ERR_MINOR_OR_VOLUME_EXISTS;
+
+ /* GFP_KERNEL, we are outside of all write-out paths */
+ device = kzalloc(sizeof(struct drbd_device), GFP_KERNEL);
+ if (!device)
+ return ERR_NOMEM;
+ kref_init(&device->kref);
+
+ kref_get(&resource->kref);
+ device->resource = resource;
+ device->minor = minor;
+ device->vnr = vnr;
+
+ drbd_init_set_defaults(device);
+
+ q = blk_alloc_queue(GFP_KERNEL);
+ if (!q)
+ goto out_no_q;
+ device->rq_queue = q;
+ q->queuedata = device;
+
+ disk = alloc_disk(1);
+ if (!disk)
+ goto out_no_disk;
+ device->vdisk = disk;
+
+ set_disk_ro(disk, true);
+
+ disk->queue = q;
+ disk->major = DRBD_MAJOR;
+ disk->first_minor = minor;
+ disk->fops = &drbd_ops;
+ sprintf(disk->disk_name, "drbd%d", minor);
+ disk->private_data = device;
+
+ device->this_bdev = bdget(MKDEV(DRBD_MAJOR, minor));
+ /* we have no partitions. we contain only ourselves. */
+ device->this_bdev->bd_contains = device->this_bdev;
+
+ q->backing_dev_info.congested_fn = drbd_congested;
+ q->backing_dev_info.congested_data = device;
+
+ blk_queue_make_request(q, drbd_make_request);
+ blk_queue_flush(q, REQ_FLUSH | REQ_FUA);
+ /* Setting the max_hw_sectors to an odd value of 8kibyte here
+ This triggers a max_bio_size message upon first attach or connect */
+ blk_queue_max_hw_sectors(q, DRBD_MAX_BIO_SIZE_SAFE >> 8);
+ blk_queue_bounce_limit(q, BLK_BOUNCE_ANY);
+ blk_queue_merge_bvec(q, drbd_merge_bvec);
+ q->queue_lock = &resource->req_lock;
+
+ device->md_io.page = alloc_page(GFP_KERNEL);
+ if (!device->md_io.page)
+ goto out_no_io_page;
+
+ if (drbd_bm_init(device))
+ goto out_no_bitmap;
+ device->read_requests = RB_ROOT;
+ device->write_requests = RB_ROOT;
+
+ id = idr_alloc(&drbd_devices, device, minor, minor + 1, GFP_KERNEL);
+ if (id < 0) {
+ if (id == -ENOSPC)
+ err = ERR_MINOR_OR_VOLUME_EXISTS;
+ goto out_no_minor_idr;
+ }
+ kref_get(&device->kref);
+
+ id = idr_alloc(&resource->devices, device, vnr, vnr + 1, GFP_KERNEL);
+ if (id < 0) {
+ if (id == -ENOSPC)
+ err = ERR_MINOR_OR_VOLUME_EXISTS;
+ goto out_idr_remove_minor;
+ }
+ kref_get(&device->kref);
+
+ INIT_LIST_HEAD(&device->peer_devices);
+ INIT_LIST_HEAD(&device->pending_bitmap_io);
+ for_each_connection(connection, resource) {
+ peer_device = kzalloc(sizeof(struct drbd_peer_device), GFP_KERNEL);
+ if (!peer_device)
+ goto out_idr_remove_from_resource;
+ peer_device->connection = connection;
+ peer_device->device = device;
+
+ list_add(&peer_device->peer_devices, &device->peer_devices);
+ kref_get(&device->kref);
+
+ id = idr_alloc(&connection->peer_devices, peer_device, vnr, vnr + 1, GFP_KERNEL);
+ if (id < 0) {
+ if (id == -ENOSPC)
+ err = ERR_INVALID_REQUEST;
+ goto out_idr_remove_from_resource;
+ }
+ kref_get(&connection->kref);
+ }
+
+ if (init_submitter(device)) {
+ err = ERR_NOMEM;
+ goto out_idr_remove_vol;
+ }
+
+ add_disk(disk);
+
+ /* inherit the connection state */
+ device->state.conn = first_connection(resource)->cstate;
+ if (device->state.conn == C_WF_REPORT_PARAMS) {
+ for_each_peer_device(peer_device, device)
+ drbd_connected(peer_device);
+ }
+ /* move to create_peer_device() */
+ for_each_peer_device(peer_device, device)
+ drbd_debugfs_peer_device_add(peer_device);
+ drbd_debugfs_device_add(device);
+ return NO_ERROR;
+
+out_idr_remove_vol:
+ idr_remove(&connection->peer_devices, vnr);
+out_idr_remove_from_resource:
+ for_each_connection(connection, resource) {
+ peer_device = idr_find(&connection->peer_devices, vnr);
+ if (peer_device) {
+ idr_remove(&connection->peer_devices, vnr);
+ kref_put(&connection->kref, drbd_destroy_connection);
+ }
+ }
+ for_each_peer_device_safe(peer_device, tmp_peer_device, device) {
+ list_del(&peer_device->peer_devices);
+ kfree(peer_device);
+ }
+ idr_remove(&resource->devices, vnr);
+out_idr_remove_minor:
+ idr_remove(&drbd_devices, minor);
+ synchronize_rcu();
+out_no_minor_idr:
+ drbd_bm_cleanup(device);
+out_no_bitmap:
+ __free_page(device->md_io.page);
+out_no_io_page:
+ put_disk(disk);
+out_no_disk:
+ blk_cleanup_queue(q);
+out_no_q:
+ kref_put(&resource->kref, drbd_destroy_resource);
+ kfree(device);
+ return err;
+}
+
+void drbd_delete_device(struct drbd_device *device)
+{
+ struct drbd_resource *resource = device->resource;
+ struct drbd_connection *connection;
+ struct drbd_peer_device *peer_device;
+ int refs = 3;
+
+ /* move to free_peer_device() */
+ for_each_peer_device(peer_device, device)
+ drbd_debugfs_peer_device_cleanup(peer_device);
+ drbd_debugfs_device_cleanup(device);
+ for_each_connection(connection, resource) {
+ idr_remove(&connection->peer_devices, device->vnr);
+ refs++;
+ }
+ idr_remove(&resource->devices, device->vnr);
+ idr_remove(&drbd_devices, device_to_minor(device));
+ del_gendisk(device->vdisk);
+ synchronize_rcu();
+ kref_sub(&device->kref, refs, drbd_destroy_device);
+}
+
+static int __init drbd_init(void)
+{
+ int err;
+
+ if (minor_count < DRBD_MINOR_COUNT_MIN || minor_count > DRBD_MINOR_COUNT_MAX) {
+ pr_err("invalid minor_count (%d)\n", minor_count);
+#ifdef MODULE
+ return -EINVAL;
+#else
+ minor_count = DRBD_MINOR_COUNT_DEF;
+#endif
+ }
+
+ err = register_blkdev(DRBD_MAJOR, "drbd");
+ if (err) {
+ pr_err("unable to register block device major %d\n",
+ DRBD_MAJOR);
+ return err;
+ }
+
+ /*
+ * allocate all necessary structs
+ */
+ init_waitqueue_head(&drbd_pp_wait);
+
+ drbd_proc = NULL; /* play safe for drbd_cleanup */
+ idr_init(&drbd_devices);
+
+ rwlock_init(&global_state_lock);
+ INIT_LIST_HEAD(&drbd_resources);
+
+ err = drbd_genl_register();
+ if (err) {
+ pr_err("unable to register generic netlink family\n");
+ goto fail;
+ }
+
+ err = drbd_create_mempools();
+ if (err)
+ goto fail;
+
+ err = -ENOMEM;
+ drbd_proc = proc_create_data("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops, NULL);
+ if (!drbd_proc) {
+ pr_err("unable to register proc file\n");
+ goto fail;
+ }
+
+ retry.wq = create_singlethread_workqueue("drbd-reissue");
+ if (!retry.wq) {
+ pr_err("unable to create retry workqueue\n");
+ goto fail;
+ }
+ INIT_WORK(&retry.worker, do_retry);
+ spin_lock_init(&retry.lock);
+ INIT_LIST_HEAD(&retry.writes);
+
+ if (drbd_debugfs_init())
+ pr_notice("failed to initialize debugfs -- will not be available\n");
+
+ pr_info("initialized. "
+ "Version: " REL_VERSION " (api:%d/proto:%d-%d)\n",
+ API_VERSION, PRO_VERSION_MIN, PRO_VERSION_MAX);
+ pr_info("%s\n", drbd_buildtag());
+ pr_info("registered as block device major %d\n", DRBD_MAJOR);
+ return 0; /* Success! */
+
+fail:
+ drbd_cleanup();
+ if (err == -ENOMEM)
+ pr_err("ran out of memory\n");
+ else
+ pr_err("initialization failure\n");
+ return err;
+}
+
+void drbd_free_ldev(struct drbd_backing_dev *ldev)
+{
+ if (ldev == NULL)
+ return;
+
+ blkdev_put(ldev->backing_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
+ blkdev_put(ldev->md_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
+
+ kfree(ldev->disk_conf);
+ kfree(ldev);
+}
+
+static void drbd_free_one_sock(struct drbd_socket *ds)
+{
+ struct socket *s;
+ mutex_lock(&ds->mutex);
+ s = ds->socket;
+ ds->socket = NULL;
+ mutex_unlock(&ds->mutex);
+ if (s) {
+ /* so debugfs does not need to mutex_lock() */
+ synchronize_rcu();
+ kernel_sock_shutdown(s, SHUT_RDWR);
+ sock_release(s);
+ }
+}
+
+void drbd_free_sock(struct drbd_connection *connection)
+{
+ if (connection->data.socket)
+ drbd_free_one_sock(&connection->data);
+ if (connection->meta.socket)
+ drbd_free_one_sock(&connection->meta);
+}
+
+/* meta data management */
+
+void conn_md_sync(struct drbd_connection *connection)
+{
+ struct drbd_peer_device *peer_device;
+ int vnr;
+
+ rcu_read_lock();
+ idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+ struct drbd_device *device = peer_device->device;
+
+ kref_get(&device->kref);
+ rcu_read_unlock();
+ drbd_md_sync(device);
+ kref_put(&device->kref, drbd_destroy_device);
+ rcu_read_lock();
+ }
+ rcu_read_unlock();
+}
+
+/* aligned 4kByte */
+struct meta_data_on_disk {
+ u64 la_size_sect; /* last agreed size. */
+ u64 uuid[UI_SIZE]; /* UUIDs. */
+ u64 device_uuid;
+ u64 reserved_u64_1;
+ u32 flags; /* MDF */
+ u32 magic;
+ u32 md_size_sect;
+ u32 al_offset; /* offset to this block */
+ u32 al_nr_extents; /* important for restoring the AL (userspace) */
+ /* `-- act_log->nr_elements <-- ldev->dc.al_extents */
+ u32 bm_offset; /* offset to the bitmap, from here */
+ u32 bm_bytes_per_bit; /* BM_BLOCK_SIZE */
+ u32 la_peer_max_bio_size; /* last peer max_bio_size */
+
+ /* see al_tr_number_to_on_disk_sector() */
+ u32 al_stripes;
+ u32 al_stripe_size_4k;
+
+ u8 reserved_u8[4096 - (7*8 + 10*4)];
+} __packed;
+
+
+
+void drbd_md_write(struct drbd_device *device, void *b)
+{
+ struct meta_data_on_disk *buffer = b;
+ sector_t sector;
+ int i;
+
+ memset(buffer, 0, sizeof(*buffer));
+
+ buffer->la_size_sect = cpu_to_be64(drbd_get_capacity(device->this_bdev));
+ for (i = UI_CURRENT; i < UI_SIZE; i++)
+ buffer->uuid[i] = cpu_to_be64(device->ldev->md.uuid[i]);
+ buffer->flags = cpu_to_be32(device->ldev->md.flags);
+ buffer->magic = cpu_to_be32(DRBD_MD_MAGIC_84_UNCLEAN);
+
+ buffer->md_size_sect = cpu_to_be32(device->ldev->md.md_size_sect);
+ buffer->al_offset = cpu_to_be32(device->ldev->md.al_offset);
+ buffer->al_nr_extents = cpu_to_be32(device->act_log->nr_elements);
+ buffer->bm_bytes_per_bit = cpu_to_be32(BM_BLOCK_SIZE);
+ buffer->device_uuid = cpu_to_be64(device->ldev->md.device_uuid);
+
+ buffer->bm_offset = cpu_to_be32(device->ldev->md.bm_offset);
+ buffer->la_peer_max_bio_size = cpu_to_be32(device->peer_max_bio_size);
+
+ buffer->al_stripes = cpu_to_be32(device->ldev->md.al_stripes);
+ buffer->al_stripe_size_4k = cpu_to_be32(device->ldev->md.al_stripe_size_4k);
+
+ D_ASSERT(device, drbd_md_ss(device->ldev) == device->ldev->md.md_offset);
+ sector = device->ldev->md.md_offset;
+
+ if (drbd_md_sync_page_io(device, device->ldev, sector, WRITE)) {
+ /* this was a try anyways ... */
+ drbd_err(device, "meta data update failed!\n");
+ drbd_chk_io_error(device, 1, DRBD_META_IO_ERROR);
+ }
+}
+
+/**
+ * drbd_md_sync() - Writes the meta data super block if the MD_DIRTY flag bit is set
+ * @device: DRBD device.
+ */
+void drbd_md_sync(struct drbd_device *device)
+{
+ struct meta_data_on_disk *buffer;
+
+ /* Don't accidentally change the DRBD meta data layout. */
+ BUILD_BUG_ON(UI_SIZE != 4);
+ BUILD_BUG_ON(sizeof(struct meta_data_on_disk) != 4096);
+
+ del_timer(&device->md_sync_timer);
+ /* timer may be rearmed by drbd_md_mark_dirty() now. */
+ if (!test_and_clear_bit(MD_DIRTY, &device->flags))
+ return;
+
+ /* We use here D_FAILED and not D_ATTACHING because we try to write
+ * metadata even if we detach due to a disk failure! */
+ if (!get_ldev_if_state(device, D_FAILED))
+ return;
+
+ buffer = drbd_md_get_buffer(device, __func__);
+ if (!buffer)
+ goto out;
+
+ drbd_md_write(device, buffer);
+
+ /* Update device->ldev->md.la_size_sect,
+ * since we updated it on metadata. */
+ device->ldev->md.la_size_sect = drbd_get_capacity(device->this_bdev);
+
+ drbd_md_put_buffer(device);
+out:
+ put_ldev(device);
+}
+
+static int check_activity_log_stripe_size(struct drbd_device *device,
+ struct meta_data_on_disk *on_disk,
+ struct drbd_md *in_core)
+{
+ u32 al_stripes = be32_to_cpu(on_disk->al_stripes);
+ u32 al_stripe_size_4k = be32_to_cpu(on_disk->al_stripe_size_4k);
+ u64 al_size_4k;
+
+ /* both not set: default to old fixed size activity log */
+ if (al_stripes == 0 && al_stripe_size_4k == 0) {
+ al_stripes = 1;
+ al_stripe_size_4k = MD_32kB_SECT/8;
+ }
+
+ /* some paranoia plausibility checks */
+
+ /* we need both values to be set */
+ if (al_stripes == 0 || al_stripe_size_4k == 0)
+ goto err;
+
+ al_size_4k = (u64)al_stripes * al_stripe_size_4k;
+
+ /* Upper limit of activity log area, to avoid potential overflow
+ * problems in al_tr_number_to_on_disk_sector(). As right now, more
+ * than 72 * 4k blocks total only increases the amount of history,
+ * limiting this arbitrarily to 16 GB is not a real limitation ;-) */
+ if (al_size_4k > (16 * 1024 * 1024/4))
+ goto err;
+
+ /* Lower limit: we need at least 8 transaction slots (32kB)
+ * to not break existing setups */
+ if (al_size_4k < MD_32kB_SECT/8)
+ goto err;
+
+ in_core->al_stripe_size_4k = al_stripe_size_4k;
+ in_core->al_stripes = al_stripes;
+ in_core->al_size_4k = al_size_4k;
+
+ return 0;
+err:
+ drbd_err(device, "invalid activity log striping: al_stripes=%u, al_stripe_size_4k=%u\n",
+ al_stripes, al_stripe_size_4k);
+ return -EINVAL;
+}
+
+static int check_offsets_and_sizes(struct drbd_device *device, struct drbd_backing_dev *bdev)
+{
+ sector_t capacity = drbd_get_capacity(bdev->md_bdev);
+ struct drbd_md *in_core = &bdev->md;
+ s32 on_disk_al_sect;
+ s32 on_disk_bm_sect;
+
+ /* The on-disk size of the activity log, calculated from offsets, and
+ * the size of the activity log calculated from the stripe settings,
+ * should match.
+ * Though we could relax this a bit: it is ok, if the striped activity log
+ * fits in the available on-disk activity log size.
+ * Right now, that would break how resize is implemented.
+ * TODO: make drbd_determine_dev_size() (and the drbdmeta tool) aware
+ * of possible unused padding space in the on disk layout. */
+ if (in_core->al_offset < 0) {
+ if (in_core->bm_offset > in_core->al_offset)
+ goto err;
+ on_disk_al_sect = -in_core->al_offset;
+ on_disk_bm_sect = in_core->al_offset - in_core->bm_offset;
+ } else {
+ if (in_core->al_offset != MD_4kB_SECT)
+ goto err;
+ if (in_core->bm_offset < in_core->al_offset + in_core->al_size_4k * MD_4kB_SECT)
+ goto err;
+
+ on_disk_al_sect = in_core->bm_offset - MD_4kB_SECT;
+ on_disk_bm_sect = in_core->md_size_sect - in_core->bm_offset;
+ }
+
+ /* old fixed size meta data is exactly that: fixed. */
+ if (in_core->meta_dev_idx >= 0) {
+ if (in_core->md_size_sect != MD_128MB_SECT
+ || in_core->al_offset != MD_4kB_SECT
+ || in_core->bm_offset != MD_4kB_SECT + MD_32kB_SECT
+ || in_core->al_stripes != 1
+ || in_core->al_stripe_size_4k != MD_32kB_SECT/8)
+ goto err;
+ }
+
+ if (capacity < in_core->md_size_sect)
+ goto err;
+ if (capacity - in_core->md_size_sect < drbd_md_first_sector(bdev))
+ goto err;
+
+ /* should be aligned, and at least 32k */
+ if ((on_disk_al_sect & 7) || (on_disk_al_sect < MD_32kB_SECT))
+ goto err;
+
+ /* should fit (for now: exactly) into the available on-disk space;
+ * overflow prevention is in check_activity_log_stripe_size() above. */
+ if (on_disk_al_sect != in_core->al_size_4k * MD_4kB_SECT)
+ goto err;
+
+ /* again, should be aligned */
+ if (in_core->bm_offset & 7)
+ goto err;
+
+ /* FIXME check for device grow with flex external meta data? */
+
+ /* can the available bitmap space cover the last agreed device size? */
+ if (on_disk_bm_sect < (in_core->la_size_sect+7)/MD_4kB_SECT/8/512)
+ goto err;
+
+ return 0;
+
+err:
+ drbd_err(device, "meta data offsets don't make sense: idx=%d "
+ "al_s=%u, al_sz4k=%u, al_offset=%d, bm_offset=%d, "
+ "md_size_sect=%u, la_size=%llu, md_capacity=%llu\n",
+ in_core->meta_dev_idx,
+ in_core->al_stripes, in_core->al_stripe_size_4k,
+ in_core->al_offset, in_core->bm_offset, in_core->md_size_sect,
+ (unsigned long long)in_core->la_size_sect,
+ (unsigned long long)capacity);
+
+ return -EINVAL;
+}
+
+
+/**
+ * drbd_md_read() - Reads in the meta data super block
+ * @device: DRBD device.
+ * @bdev: Device from which the meta data should be read in.
+ *
+ * Return NO_ERROR on success, and an enum drbd_ret_code in case
+ * something goes wrong.
+ *
+ * Called exactly once during drbd_adm_attach(), while still being D_DISKLESS,
+ * even before @bdev is assigned to @device->ldev.
+ */
+int drbd_md_read(struct drbd_device *device, struct drbd_backing_dev *bdev)
+{
+ struct meta_data_on_disk *buffer;
+ u32 magic, flags;
+ int i, rv = NO_ERROR;
+
+ if (device->state.disk != D_DISKLESS)
+ return ERR_DISK_CONFIGURED;
+
+ buffer = drbd_md_get_buffer(device, __func__);
+ if (!buffer)
+ return ERR_NOMEM;
+
+ /* First, figure out where our meta data superblock is located,
+ * and read it. */
+ bdev->md.meta_dev_idx = bdev->disk_conf->meta_dev_idx;
+ bdev->md.md_offset = drbd_md_ss(bdev);
+
+ if (drbd_md_sync_page_io(device, bdev, bdev->md.md_offset, READ)) {
+ /* NOTE: can't do normal error processing here as this is
+ called BEFORE disk is attached */
+ drbd_err(device, "Error while reading metadata.\n");
+ rv = ERR_IO_MD_DISK;
+ goto err;
+ }
+
+ magic = be32_to_cpu(buffer->magic);
+ flags = be32_to_cpu(buffer->flags);
+ if (magic == DRBD_MD_MAGIC_84_UNCLEAN ||
+ (magic == DRBD_MD_MAGIC_08 && !(flags & MDF_AL_CLEAN))) {
+ /* btw: that's Activity Log clean, not "all" clean. */
+ drbd_err(device, "Found unclean meta data. Did you \"drbdadm apply-al\"?\n");
+ rv = ERR_MD_UNCLEAN;
+ goto err;
+ }
+
+ rv = ERR_MD_INVALID;
+ if (magic != DRBD_MD_MAGIC_08) {
+ if (magic == DRBD_MD_MAGIC_07)
+ drbd_err(device, "Found old (0.7) meta data magic. Did you \"drbdadm create-md\"?\n");
+ else
+ drbd_err(device, "Meta data magic not found. Did you \"drbdadm create-md\"?\n");
+ goto err;
+ }
+
+ if (be32_to_cpu(buffer->bm_bytes_per_bit) != BM_BLOCK_SIZE) {
+ drbd_err(device, "unexpected bm_bytes_per_bit: %u (expected %u)\n",
+ be32_to_cpu(buffer->bm_bytes_per_bit), BM_BLOCK_SIZE);
+ goto err;
+ }
+
+
+ /* convert to in_core endian */
+ bdev->md.la_size_sect = be64_to_cpu(buffer->la_size_sect);
+ for (i = UI_CURRENT; i < UI_SIZE; i++)
+ bdev->md.uuid[i] = be64_to_cpu(buffer->uuid[i]);
+ bdev->md.flags = be32_to_cpu(buffer->flags);
+ bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid);
+
+ bdev->md.md_size_sect = be32_to_cpu(buffer->md_size_sect);
+ bdev->md.al_offset = be32_to_cpu(buffer->al_offset);
+ bdev->md.bm_offset = be32_to_cpu(buffer->bm_offset);
+
+ if (check_activity_log_stripe_size(device, buffer, &bdev->md))
+ goto err;
+ if (check_offsets_and_sizes(device, bdev))
+ goto err;
+
+ if (be32_to_cpu(buffer->bm_offset) != bdev->md.bm_offset) {
+ drbd_err(device, "unexpected bm_offset: %d (expected %d)\n",
+ be32_to_cpu(buffer->bm_offset), bdev->md.bm_offset);
+ goto err;
+ }
+ if (be32_to_cpu(buffer->md_size_sect) != bdev->md.md_size_sect) {
+ drbd_err(device, "unexpected md_size: %u (expected %u)\n",
+ be32_to_cpu(buffer->md_size_sect), bdev->md.md_size_sect);
+ goto err;
+ }
+
+ rv = NO_ERROR;
+
+ spin_lock_irq(&device->resource->req_lock);
+ if (device->state.conn < C_CONNECTED) {
+ unsigned int peer;
+ peer = be32_to_cpu(buffer->la_peer_max_bio_size);
+ peer = max(peer, DRBD_MAX_BIO_SIZE_SAFE);
+ device->peer_max_bio_size = peer;
+ }
+ spin_unlock_irq(&device->resource->req_lock);
+
+ err:
+ drbd_md_put_buffer(device);
+
+ return rv;
+}
+
+/**
+ * drbd_md_mark_dirty() - Mark meta data super block as dirty
+ * @device: DRBD device.
+ *
+ * Call this function if you change anything that should be written to
+ * the meta-data super block. This function sets MD_DIRTY, and starts a
+ * timer that ensures that within five seconds you have to call drbd_md_sync().
+ */
+#ifdef DEBUG
+void drbd_md_mark_dirty_(struct drbd_device *device, unsigned int line, const char *func)
+{
+ if (!test_and_set_bit(MD_DIRTY, &device->flags)) {
+ mod_timer(&device->md_sync_timer, jiffies + HZ);
+ device->last_md_mark_dirty.line = line;
+ device->last_md_mark_dirty.func = func;
+ }
+}
+#else
+void drbd_md_mark_dirty(struct drbd_device *device)
+{
+ if (!test_and_set_bit(MD_DIRTY, &device->flags))
+ mod_timer(&device->md_sync_timer, jiffies + 5*HZ);
+}
+#endif
+
+void drbd_uuid_move_history(struct drbd_device *device) __must_hold(local)
+{
+ int i;
+
+ for (i = UI_HISTORY_START; i < UI_HISTORY_END; i++)
+ device->ldev->md.uuid[i+1] = device->ldev->md.uuid[i];
+}
+
+void __drbd_uuid_set(struct drbd_device *device, int idx, u64 val) __must_hold(local)
+{
+ if (idx == UI_CURRENT) {
+ if (device->state.role == R_PRIMARY)
+ val |= 1;
+ else
+ val &= ~((u64)1);
+
+ drbd_set_ed_uuid(device, val);
+ }
+
+ device->ldev->md.uuid[idx] = val;
+ drbd_md_mark_dirty(device);
+}
+
+void _drbd_uuid_set(struct drbd_device *device, int idx, u64 val) __must_hold(local)
+{
+ unsigned long flags;
+ spin_lock_irqsave(&device->ldev->md.uuid_lock, flags);
+ __drbd_uuid_set(device, idx, val);
+ spin_unlock_irqrestore(&device->ldev->md.uuid_lock, flags);
+}
+
+void drbd_uuid_set(struct drbd_device *device, int idx, u64 val) __must_hold(local)
+{
+ unsigned long flags;
+ spin_lock_irqsave(&device->ldev->md.uuid_lock, flags);
+ if (device->ldev->md.uuid[idx]) {
+ drbd_uuid_move_history(device);
+ device->ldev->md.uuid[UI_HISTORY_START] = device->ldev->md.uuid[idx];
+ }
+ __drbd_uuid_set(device, idx, val);
+ spin_unlock_irqrestore(&device->ldev->md.uuid_lock, flags);
+}
+
+/**
+ * drbd_uuid_new_current() - Creates a new current UUID
+ * @device: DRBD device.
+ *
+ * Creates a new current UUID, and rotates the old current UUID into
+ * the bitmap slot. Causes an incremental resync upon next connect.
+ */
+void drbd_uuid_new_current(struct drbd_device *device) __must_hold(local)
+{
+ u64 val;
+ unsigned long long bm_uuid;
+
+ get_random_bytes(&val, sizeof(u64));
+
+ spin_lock_irq(&device->ldev->md.uuid_lock);
+ bm_uuid = device->ldev->md.uuid[UI_BITMAP];
+
+ if (bm_uuid)
+ drbd_warn(device, "bm UUID was already set: %llX\n", bm_uuid);
+
+ device->ldev->md.uuid[UI_BITMAP] = device->ldev->md.uuid[UI_CURRENT];
+ __drbd_uuid_set(device, UI_CURRENT, val);
+ spin_unlock_irq(&device->ldev->md.uuid_lock);
+
+ drbd_print_uuids(device, "new current UUID");
+ /* get it to stable storage _now_ */
+ drbd_md_sync(device);
+}
+
+void drbd_uuid_set_bm(struct drbd_device *device, u64 val) __must_hold(local)
+{
+ unsigned long flags;
+ if (device->ldev->md.uuid[UI_BITMAP] == 0 && val == 0)
+ return;
+
+ spin_lock_irqsave(&device->ldev->md.uuid_lock, flags);
+ if (val == 0) {
+ drbd_uuid_move_history(device);
+ device->ldev->md.uuid[UI_HISTORY_START] = device->ldev->md.uuid[UI_BITMAP];
+ device->ldev->md.uuid[UI_BITMAP] = 0;
+ } else {
+ unsigned long long bm_uuid = device->ldev->md.uuid[UI_BITMAP];
+ if (bm_uuid)
+ drbd_warn(device, "bm UUID was already set: %llX\n", bm_uuid);
+
+ device->ldev->md.uuid[UI_BITMAP] = val & ~((u64)1);
+ }
+ spin_unlock_irqrestore(&device->ldev->md.uuid_lock, flags);
+
+ drbd_md_mark_dirty(device);
+}
+
+/**
+ * drbd_bmio_set_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
+ * @device: DRBD device.
+ *
+ * Sets all bits in the bitmap and writes the whole bitmap to stable storage.
+ */
+int drbd_bmio_set_n_write(struct drbd_device *device) __must_hold(local)
+{
+ int rv = -EIO;
+
+ drbd_md_set_flag(device, MDF_FULL_SYNC);
+ drbd_md_sync(device);
+ drbd_bm_set_all(device);
+
+ rv = drbd_bm_write(device);
+
+ if (!rv) {
+ drbd_md_clear_flag(device, MDF_FULL_SYNC);
+ drbd_md_sync(device);
+ }
+
+ return rv;
+}
+
+/**
+ * drbd_bmio_clear_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
+ * @device: DRBD device.
+ *
+ * Clears all bits in the bitmap and writes the whole bitmap to stable storage.
+ */
+int drbd_bmio_clear_n_write(struct drbd_device *device) __must_hold(local)
+{
+ drbd_resume_al(device);
+ drbd_bm_clear_all(device);
+ return drbd_bm_write(device);
+}
+
+static int w_bitmap_io(struct drbd_work *w, int unused)
+{
+ struct drbd_device *device =
+ container_of(w, struct drbd_device, bm_io_work.w);
+ struct bm_io_work *work = &device->bm_io_work;
+ int rv = -EIO;
+
+ D_ASSERT(device, atomic_read(&device->ap_bio_cnt) == 0);
+
+ if (get_ldev(device)) {
+ drbd_bm_lock(device, work->why, work->flags);
+ rv = work->io_fn(device);
+ drbd_bm_unlock(device);
+ put_ldev(device);
+ }
+
+ clear_bit_unlock(BITMAP_IO, &device->flags);
+ wake_up(&device->misc_wait);
+
+ if (work->done)
+ work->done(device, rv);
+
+ clear_bit(BITMAP_IO_QUEUED, &device->flags);
+ work->why = NULL;
+ work->flags = 0;
+
+ return 0;
+}
+
+/**
+ * drbd_queue_bitmap_io() - Queues an IO operation on the whole bitmap
+ * @device: DRBD device.
+ * @io_fn: IO callback to be called when bitmap IO is possible
+ * @done: callback to be called after the bitmap IO was performed
+ * @why: Descriptive text of the reason for doing the IO
+ *
+ * While IO on the bitmap happens we freeze application IO thus we ensure
+ * that drbd_set_out_of_sync() can not be called. This function MAY ONLY be
+ * called from worker context. It MUST NOT be used while a previous such
+ * work is still pending!
+ *
+ * Its worker function encloses the call of io_fn() by get_ldev() and
+ * put_ldev().
+ */
+void drbd_queue_bitmap_io(struct drbd_device *device,
+ int (*io_fn)(struct drbd_device *),
+ void (*done)(struct drbd_device *, int),
+ char *why, enum bm_flag flags)
+{
+ D_ASSERT(device, current == first_peer_device(device)->connection->worker.task);
+
+ D_ASSERT(device, !test_bit(BITMAP_IO_QUEUED, &device->flags));
+ D_ASSERT(device, !test_bit(BITMAP_IO, &device->flags));
+ D_ASSERT(device, list_empty(&device->bm_io_work.w.list));
+ if (device->bm_io_work.why)
+ drbd_err(device, "FIXME going to queue '%s' but '%s' still pending?\n",
+ why, device->bm_io_work.why);
+
+ device->bm_io_work.io_fn = io_fn;
+ device->bm_io_work.done = done;
+ device->bm_io_work.why = why;
+ device->bm_io_work.flags = flags;
+
+ spin_lock_irq(&device->resource->req_lock);
+ set_bit(BITMAP_IO, &device->flags);
+ if (atomic_read(&device->ap_bio_cnt) == 0) {
+ if (!test_and_set_bit(BITMAP_IO_QUEUED, &device->flags))
+ drbd_queue_work(&first_peer_device(device)->connection->sender_work,
+ &device->bm_io_work.w);
+ }
+ spin_unlock_irq(&device->resource->req_lock);
+}
+
+/**
+ * drbd_bitmap_io() - Does an IO operation on the whole bitmap
+ * @device: DRBD device.
+ * @io_fn: IO callback to be called when bitmap IO is possible
+ * @why: Descriptive text of the reason for doing the IO
+ *
+ * freezes application IO while that the actual IO operations runs. This
+ * functions MAY NOT be called from worker context.
+ */
+int drbd_bitmap_io(struct drbd_device *device, int (*io_fn)(struct drbd_device *),
+ char *why, enum bm_flag flags)
+{
+ int rv;
+
+ D_ASSERT(device, current != first_peer_device(device)->connection->worker.task);
+
+ if ((flags & BM_LOCKED_SET_ALLOWED) == 0)
+ drbd_suspend_io(device);
+
+ drbd_bm_lock(device, why, flags);
+ rv = io_fn(device);
+ drbd_bm_unlock(device);
+
+ if ((flags & BM_LOCKED_SET_ALLOWED) == 0)
+ drbd_resume_io(device);
+
+ return rv;
+}
+
+void drbd_md_set_flag(struct drbd_device *device, int flag) __must_hold(local)
+{
+ if ((device->ldev->md.flags & flag) != flag) {
+ drbd_md_mark_dirty(device);
+ device->ldev->md.flags |= flag;
+ }
+}
+
+void drbd_md_clear_flag(struct drbd_device *device, int flag) __must_hold(local)
+{
+ if ((device->ldev->md.flags & flag) != 0) {
+ drbd_md_mark_dirty(device);
+ device->ldev->md.flags &= ~flag;
+ }
+}
+int drbd_md_test_flag(struct drbd_backing_dev *bdev, int flag)
+{
+ return (bdev->md.flags & flag) != 0;
+}
+
+static void md_sync_timer_fn(unsigned long data)
+{
+ struct drbd_device *device = (struct drbd_device *) data;
+ drbd_device_post_work(device, MD_SYNC);
+}
+
+const char *cmdname(enum drbd_packet cmd)
+{
+ /* THINK may need to become several global tables
+ * when we want to support more than
+ * one PRO_VERSION */
+ static const char *cmdnames[] = {
+ [P_DATA] = "Data",
+ [P_DATA_REPLY] = "DataReply",
+ [P_RS_DATA_REPLY] = "RSDataReply",
+ [P_BARRIER] = "Barrier",
+ [P_BITMAP] = "ReportBitMap",
+ [P_BECOME_SYNC_TARGET] = "BecomeSyncTarget",
+ [P_BECOME_SYNC_SOURCE] = "BecomeSyncSource",
+ [P_UNPLUG_REMOTE] = "UnplugRemote",
+ [P_DATA_REQUEST] = "DataRequest",
+ [P_RS_DATA_REQUEST] = "RSDataRequest",
+ [P_SYNC_PARAM] = "SyncParam",
+ [P_SYNC_PARAM89] = "SyncParam89",
+ [P_PROTOCOL] = "ReportProtocol",
+ [P_UUIDS] = "ReportUUIDs",
+ [P_SIZES] = "ReportSizes",
+ [P_STATE] = "ReportState",
+ [P_SYNC_UUID] = "ReportSyncUUID",
+ [P_AUTH_CHALLENGE] = "AuthChallenge",
+ [P_AUTH_RESPONSE] = "AuthResponse",
+ [P_PING] = "Ping",
+ [P_PING_ACK] = "PingAck",
+ [P_RECV_ACK] = "RecvAck",
+ [P_WRITE_ACK] = "WriteAck",
+ [P_RS_WRITE_ACK] = "RSWriteAck",
+ [P_SUPERSEDED] = "Superseded",
+ [P_NEG_ACK] = "NegAck",
+ [P_NEG_DREPLY] = "NegDReply",
+ [P_NEG_RS_DREPLY] = "NegRSDReply",
+ [P_BARRIER_ACK] = "BarrierAck",
+ [P_STATE_CHG_REQ] = "StateChgRequest",
+ [P_STATE_CHG_REPLY] = "StateChgReply",
+ [P_OV_REQUEST] = "OVRequest",
+ [P_OV_REPLY] = "OVReply",
+ [P_OV_RESULT] = "OVResult",
+ [P_CSUM_RS_REQUEST] = "CsumRSRequest",
+ [P_RS_IS_IN_SYNC] = "CsumRSIsInSync",
+ [P_COMPRESSED_BITMAP] = "CBitmap",
+ [P_DELAY_PROBE] = "DelayProbe",
+ [P_OUT_OF_SYNC] = "OutOfSync",
+ [P_RETRY_WRITE] = "RetryWrite",
+ [P_RS_CANCEL] = "RSCancel",
+ [P_CONN_ST_CHG_REQ] = "conn_st_chg_req",
+ [P_CONN_ST_CHG_REPLY] = "conn_st_chg_reply",
+ [P_RETRY_WRITE] = "retry_write",
+ [P_PROTOCOL_UPDATE] = "protocol_update",
+
+ /* enum drbd_packet, but not commands - obsoleted flags:
+ * P_MAY_IGNORE
+ * P_MAX_OPT_CMD
+ */
+ };
+
+ /* too big for the array: 0xfffX */
+ if (cmd == P_INITIAL_META)
+ return "InitialMeta";
+ if (cmd == P_INITIAL_DATA)
+ return "InitialData";
+ if (cmd == P_CONNECTION_FEATURES)
+ return "ConnectionFeatures";
+ if (cmd >= ARRAY_SIZE(cmdnames))
+ return "Unknown";
+ return cmdnames[cmd];
+}
+
+/**
+ * drbd_wait_misc - wait for a request to make progress
+ * @device: device associated with the request
+ * @i: the struct drbd_interval embedded in struct drbd_request or
+ * struct drbd_peer_request
+ */
+int drbd_wait_misc(struct drbd_device *device, struct drbd_interval *i)
+{
+ struct net_conf *nc;
+ DEFINE_WAIT(wait);
+ long timeout;
+
+ rcu_read_lock();
+ nc = rcu_dereference(first_peer_device(device)->connection->net_conf);
+ if (!nc) {
+ rcu_read_unlock();
+ return -ETIMEDOUT;
+ }
+ timeout = nc->ko_count ? nc->timeout * HZ / 10 * nc->ko_count : MAX_SCHEDULE_TIMEOUT;
+ rcu_read_unlock();
+
+ /* Indicate to wake up device->misc_wait on progress. */
+ i->waiting = true;
+ prepare_to_wait(&device->misc_wait, &wait, TASK_INTERRUPTIBLE);
+ spin_unlock_irq(&device->resource->req_lock);
+ timeout = schedule_timeout(timeout);
+ finish_wait(&device->misc_wait, &wait);
+ spin_lock_irq(&device->resource->req_lock);
+ if (!timeout || device->state.conn < C_CONNECTED)
+ return -ETIMEDOUT;
+ if (signal_pending(current))
+ return -ERESTARTSYS;
+ return 0;
+}
+
+#ifdef CONFIG_DRBD_FAULT_INJECTION
+/* Fault insertion support including random number generator shamelessly
+ * stolen from kernel/rcutorture.c */
+struct fault_random_state {
+ unsigned long state;
+ unsigned long count;
+};
+
+#define FAULT_RANDOM_MULT 39916801 /* prime */
+#define FAULT_RANDOM_ADD 479001701 /* prime */
+#define FAULT_RANDOM_REFRESH 10000
+
+/*
+ * Crude but fast random-number generator. Uses a linear congruential
+ * generator, with occasional help from get_random_bytes().
+ */
+static unsigned long
+_drbd_fault_random(struct fault_random_state *rsp)
+{
+ long refresh;
+
+ if (!rsp->count--) {
+ get_random_bytes(&refresh, sizeof(refresh));
+ rsp->state += refresh;
+ rsp->count = FAULT_RANDOM_REFRESH;
+ }
+ rsp->state = rsp->state * FAULT_RANDOM_MULT + FAULT_RANDOM_ADD;
+ return swahw32(rsp->state);
+}
+
+static char *
+_drbd_fault_str(unsigned int type) {
+ static char *_faults[] = {
+ [DRBD_FAULT_MD_WR] = "Meta-data write",
+ [DRBD_FAULT_MD_RD] = "Meta-data read",
+ [DRBD_FAULT_RS_WR] = "Resync write",
+ [DRBD_FAULT_RS_RD] = "Resync read",
+ [DRBD_FAULT_DT_WR] = "Data write",
+ [DRBD_FAULT_DT_RD] = "Data read",
+ [DRBD_FAULT_DT_RA] = "Data read ahead",
+ [DRBD_FAULT_BM_ALLOC] = "BM allocation",
+ [DRBD_FAULT_AL_EE] = "EE allocation",
+ [DRBD_FAULT_RECEIVE] = "receive data corruption",
+ };
+
+ return (type < DRBD_FAULT_MAX) ? _faults[type] : "**Unknown**";
+}
+
+unsigned int
+_drbd_insert_fault(struct drbd_device *device, unsigned int type)
+{
+ static struct fault_random_state rrs = {0, 0};
+
+ unsigned int ret = (
+ (fault_devs == 0 ||
+ ((1 << device_to_minor(device)) & fault_devs) != 0) &&
+ (((_drbd_fault_random(&rrs) % 100) + 1) <= fault_rate));
+
+ if (ret) {
+ fault_count++;
+
+ if (__ratelimit(&drbd_ratelimit_state))
+ drbd_warn(device, "***Simulating %s failure\n",
+ _drbd_fault_str(type));
+ }
+
+ return ret;
+}
+#endif
+
+const char *drbd_buildtag(void)
+{
+ /* DRBD built from external sources has here a reference to the
+ git hash of the source code. */
+
+ static char buildtag[38] = "\0uilt-in";
+
+ if (buildtag[0] == 0) {
+#ifdef MODULE
+ sprintf(buildtag, "srcversion: %-24s", THIS_MODULE->srcversion);
+#else
+ buildtag[0] = 'b';
+#endif
+ }
+
+ return buildtag;
+}
+
+module_init(drbd_init)
+module_exit(drbd_cleanup)
+
+EXPORT_SYMBOL(drbd_conn_str);
+EXPORT_SYMBOL(drbd_role_str);
+EXPORT_SYMBOL(drbd_disk_str);
+EXPORT_SYMBOL(drbd_set_st_err_str);
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
new file mode 100644
index 000000000..74df8cfad
--- /dev/null
+++ b/drivers/block/drbd/drbd_nl.c
@@ -0,0 +1,3674 @@
+/*
+ drbd_nl.c
+
+ This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+ Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
+ Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+ Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+ drbd is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2, or (at your option)
+ any later version.
+
+ drbd is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with drbd; see the file COPYING. If not, write to
+ the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/drbd.h>
+#include <linux/in.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/slab.h>
+#include <linux/blkpg.h>
+#include <linux/cpumask.h>
+#include "drbd_int.h"
+#include "drbd_protocol.h"
+#include "drbd_req.h"
+#include <asm/unaligned.h>
+#include <linux/drbd_limits.h>
+#include <linux/kthread.h>
+
+#include <net/genetlink.h>
+
+/* .doit */
+// int drbd_adm_create_resource(struct sk_buff *skb, struct genl_info *info);
+// int drbd_adm_delete_resource(struct sk_buff *skb, struct genl_info *info);
+
+int drbd_adm_new_minor(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_del_minor(struct sk_buff *skb, struct genl_info *info);
+
+int drbd_adm_new_resource(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_del_resource(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_down(struct sk_buff *skb, struct genl_info *info);
+
+int drbd_adm_set_role(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_detach(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_net_opts(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_start_ov(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_new_c_uuid(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_disconnect(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_invalidate(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_invalidate_peer(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_pause_sync(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_resume_sync(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_suspend_io(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_resume_io(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_outdate(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_resource_opts(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_get_status(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_get_timeout_type(struct sk_buff *skb, struct genl_info *info);
+/* .dumpit */
+int drbd_adm_get_status_all(struct sk_buff *skb, struct netlink_callback *cb);
+
+#include <linux/drbd_genl_api.h>
+#include "drbd_nla.h"
+#include <linux/genl_magic_func.h>
+
+/* used blkdev_get_by_path, to claim our meta data device(s) */
+static char *drbd_m_holder = "Hands off! this is DRBD's meta data device.";
+
+static void drbd_adm_send_reply(struct sk_buff *skb, struct genl_info *info)
+{
+ genlmsg_end(skb, genlmsg_data(nlmsg_data(nlmsg_hdr(skb))));
+ if (genlmsg_reply(skb, info))
+ pr_err("error sending genl reply\n");
+}
+
+/* Used on a fresh "drbd_adm_prepare"d reply_skb, this cannot fail: The only
+ * reason it could fail was no space in skb, and there are 4k available. */
+static int drbd_msg_put_info(struct sk_buff *skb, const char *info)
+{
+ struct nlattr *nla;
+ int err = -EMSGSIZE;
+
+ if (!info || !info[0])
+ return 0;
+
+ nla = nla_nest_start(skb, DRBD_NLA_CFG_REPLY);
+ if (!nla)
+ return err;
+
+ err = nla_put_string(skb, T_info_text, info);
+ if (err) {
+ nla_nest_cancel(skb, nla);
+ return err;
+ } else
+ nla_nest_end(skb, nla);
+ return 0;
+}
+
+/* This would be a good candidate for a "pre_doit" hook,
+ * and per-family private info->pointers.
+ * But we need to stay compatible with older kernels.
+ * If it returns successfully, adm_ctx members are valid.
+ *
+ * At this point, we still rely on the global genl_lock().
+ * If we want to avoid that, and allow "genl_family.parallel_ops", we may need
+ * to add additional synchronization against object destruction/modification.
+ */
+#define DRBD_ADM_NEED_MINOR 1
+#define DRBD_ADM_NEED_RESOURCE 2
+#define DRBD_ADM_NEED_CONNECTION 4
+static int drbd_adm_prepare(struct drbd_config_context *adm_ctx,
+ struct sk_buff *skb, struct genl_info *info, unsigned flags)
+{
+ struct drbd_genlmsghdr *d_in = info->userhdr;
+ const u8 cmd = info->genlhdr->cmd;
+ int err;
+
+ memset(adm_ctx, 0, sizeof(*adm_ctx));
+
+ /* genl_rcv_msg only checks for CAP_NET_ADMIN on "GENL_ADMIN_PERM" :( */
+ if (cmd != DRBD_ADM_GET_STATUS && !capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ adm_ctx->reply_skb = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
+ if (!adm_ctx->reply_skb) {
+ err = -ENOMEM;
+ goto fail;
+ }
+
+ adm_ctx->reply_dh = genlmsg_put_reply(adm_ctx->reply_skb,
+ info, &drbd_genl_family, 0, cmd);
+ /* put of a few bytes into a fresh skb of >= 4k will always succeed.
+ * but anyways */
+ if (!adm_ctx->reply_dh) {
+ err = -ENOMEM;
+ goto fail;
+ }
+
+ adm_ctx->reply_dh->minor = d_in->minor;
+ adm_ctx->reply_dh->ret_code = NO_ERROR;
+
+ adm_ctx->volume = VOLUME_UNSPECIFIED;
+ if (info->attrs[DRBD_NLA_CFG_CONTEXT]) {
+ struct nlattr *nla;
+ /* parse and validate only */
+ err = drbd_cfg_context_from_attrs(NULL, info);
+ if (err)
+ goto fail;
+
+ /* It was present, and valid,
+ * copy it over to the reply skb. */
+ err = nla_put_nohdr(adm_ctx->reply_skb,
+ info->attrs[DRBD_NLA_CFG_CONTEXT]->nla_len,
+ info->attrs[DRBD_NLA_CFG_CONTEXT]);
+ if (err)
+ goto fail;
+
+ /* and assign stuff to the adm_ctx */
+ nla = nested_attr_tb[__nla_type(T_ctx_volume)];
+ if (nla)
+ adm_ctx->volume = nla_get_u32(nla);
+ nla = nested_attr_tb[__nla_type(T_ctx_resource_name)];
+ if (nla)
+ adm_ctx->resource_name = nla_data(nla);
+ adm_ctx->my_addr = nested_attr_tb[__nla_type(T_ctx_my_addr)];
+ adm_ctx->peer_addr = nested_attr_tb[__nla_type(T_ctx_peer_addr)];
+ if ((adm_ctx->my_addr &&
+ nla_len(adm_ctx->my_addr) > sizeof(adm_ctx->connection->my_addr)) ||
+ (adm_ctx->peer_addr &&
+ nla_len(adm_ctx->peer_addr) > sizeof(adm_ctx->connection->peer_addr))) {
+ err = -EINVAL;
+ goto fail;
+ }
+ }
+
+ adm_ctx->minor = d_in->minor;
+ adm_ctx->device = minor_to_device(d_in->minor);
+
+ /* We are protected by the global genl_lock().
+ * But we may explicitly drop it/retake it in drbd_adm_set_role(),
+ * so make sure this object stays around. */
+ if (adm_ctx->device)
+ kref_get(&adm_ctx->device->kref);
+
+ if (adm_ctx->resource_name) {
+ adm_ctx->resource = drbd_find_resource(adm_ctx->resource_name);
+ }
+
+ if (!adm_ctx->device && (flags & DRBD_ADM_NEED_MINOR)) {
+ drbd_msg_put_info(adm_ctx->reply_skb, "unknown minor");
+ return ERR_MINOR_INVALID;
+ }
+ if (!adm_ctx->resource && (flags & DRBD_ADM_NEED_RESOURCE)) {
+ drbd_msg_put_info(adm_ctx->reply_skb, "unknown resource");
+ if (adm_ctx->resource_name)
+ return ERR_RES_NOT_KNOWN;
+ return ERR_INVALID_REQUEST;
+ }
+
+ if (flags & DRBD_ADM_NEED_CONNECTION) {
+ if (adm_ctx->resource) {
+ drbd_msg_put_info(adm_ctx->reply_skb, "no resource name expected");
+ return ERR_INVALID_REQUEST;
+ }
+ if (adm_ctx->device) {
+ drbd_msg_put_info(adm_ctx->reply_skb, "no minor number expected");
+ return ERR_INVALID_REQUEST;
+ }
+ if (adm_ctx->my_addr && adm_ctx->peer_addr)
+ adm_ctx->connection = conn_get_by_addrs(nla_data(adm_ctx->my_addr),
+ nla_len(adm_ctx->my_addr),
+ nla_data(adm_ctx->peer_addr),
+ nla_len(adm_ctx->peer_addr));
+ if (!adm_ctx->connection) {
+ drbd_msg_put_info(adm_ctx->reply_skb, "unknown connection");
+ return ERR_INVALID_REQUEST;
+ }
+ }
+
+ /* some more paranoia, if the request was over-determined */
+ if (adm_ctx->device && adm_ctx->resource &&
+ adm_ctx->device->resource != adm_ctx->resource) {
+ pr_warning("request: minor=%u, resource=%s; but that minor belongs to resource %s\n",
+ adm_ctx->minor, adm_ctx->resource->name,
+ adm_ctx->device->resource->name);
+ drbd_msg_put_info(adm_ctx->reply_skb, "minor exists in different resource");
+ return ERR_INVALID_REQUEST;
+ }
+ if (adm_ctx->device &&
+ adm_ctx->volume != VOLUME_UNSPECIFIED &&
+ adm_ctx->volume != adm_ctx->device->vnr) {
+ pr_warning("request: minor=%u, volume=%u; but that minor is volume %u in %s\n",
+ adm_ctx->minor, adm_ctx->volume,
+ adm_ctx->device->vnr,
+ adm_ctx->device->resource->name);
+ drbd_msg_put_info(adm_ctx->reply_skb, "minor exists as different volume");
+ return ERR_INVALID_REQUEST;
+ }
+
+ /* still, provide adm_ctx->resource always, if possible. */
+ if (!adm_ctx->resource) {
+ adm_ctx->resource = adm_ctx->device ? adm_ctx->device->resource
+ : adm_ctx->connection ? adm_ctx->connection->resource : NULL;
+ if (adm_ctx->resource)
+ kref_get(&adm_ctx->resource->kref);
+ }
+
+ return NO_ERROR;
+
+fail:
+ nlmsg_free(adm_ctx->reply_skb);
+ adm_ctx->reply_skb = NULL;
+ return err;
+}
+
+static int drbd_adm_finish(struct drbd_config_context *adm_ctx,
+ struct genl_info *info, int retcode)
+{
+ if (adm_ctx->device) {
+ kref_put(&adm_ctx->device->kref, drbd_destroy_device);
+ adm_ctx->device = NULL;
+ }
+ if (adm_ctx->connection) {
+ kref_put(&adm_ctx->connection->kref, &drbd_destroy_connection);
+ adm_ctx->connection = NULL;
+ }
+ if (adm_ctx->resource) {
+ kref_put(&adm_ctx->resource->kref, drbd_destroy_resource);
+ adm_ctx->resource = NULL;
+ }
+
+ if (!adm_ctx->reply_skb)
+ return -ENOMEM;
+
+ adm_ctx->reply_dh->ret_code = retcode;
+ drbd_adm_send_reply(adm_ctx->reply_skb, info);
+ return 0;
+}
+
+static void setup_khelper_env(struct drbd_connection *connection, char **envp)
+{
+ char *afs;
+
+ /* FIXME: A future version will not allow this case. */
+ if (connection->my_addr_len == 0 || connection->peer_addr_len == 0)
+ return;
+
+ switch (((struct sockaddr *)&connection->peer_addr)->sa_family) {
+ case AF_INET6:
+ afs = "ipv6";
+ snprintf(envp[4], 60, "DRBD_PEER_ADDRESS=%pI6",
+ &((struct sockaddr_in6 *)&connection->peer_addr)->sin6_addr);
+ break;
+ case AF_INET:
+ afs = "ipv4";
+ snprintf(envp[4], 60, "DRBD_PEER_ADDRESS=%pI4",
+ &((struct sockaddr_in *)&connection->peer_addr)->sin_addr);
+ break;
+ default:
+ afs = "ssocks";
+ snprintf(envp[4], 60, "DRBD_PEER_ADDRESS=%pI4",
+ &((struct sockaddr_in *)&connection->peer_addr)->sin_addr);
+ }
+ snprintf(envp[3], 20, "DRBD_PEER_AF=%s", afs);
+}
+
+int drbd_khelper(struct drbd_device *device, char *cmd)
+{
+ char *envp[] = { "HOME=/",
+ "TERM=linux",
+ "PATH=/sbin:/usr/sbin:/bin:/usr/bin",
+ (char[20]) { }, /* address family */
+ (char[60]) { }, /* address */
+ NULL };
+ char mb[12];
+ char *argv[] = {usermode_helper, cmd, mb, NULL };
+ struct drbd_connection *connection = first_peer_device(device)->connection;
+ struct sib_info sib;
+ int ret;
+
+ if (current == connection->worker.task)
+ set_bit(CALLBACK_PENDING, &connection->flags);
+
+ snprintf(mb, 12, "minor-%d", device_to_minor(device));
+ setup_khelper_env(connection, envp);
+
+ /* The helper may take some time.
+ * write out any unsynced meta data changes now */
+ drbd_md_sync(device);
+
+ drbd_info(device, "helper command: %s %s %s\n", usermode_helper, cmd, mb);
+ sib.sib_reason = SIB_HELPER_PRE;
+ sib.helper_name = cmd;
+ drbd_bcast_event(device, &sib);
+ ret = call_usermodehelper(usermode_helper, argv, envp, UMH_WAIT_PROC);
+ if (ret)
+ drbd_warn(device, "helper command: %s %s %s exit code %u (0x%x)\n",
+ usermode_helper, cmd, mb,
+ (ret >> 8) & 0xff, ret);
+ else
+ drbd_info(device, "helper command: %s %s %s exit code %u (0x%x)\n",
+ usermode_helper, cmd, mb,
+ (ret >> 8) & 0xff, ret);
+ sib.sib_reason = SIB_HELPER_POST;
+ sib.helper_exit_code = ret;
+ drbd_bcast_event(device, &sib);
+
+ if (current == connection->worker.task)
+ clear_bit(CALLBACK_PENDING, &connection->flags);
+
+ if (ret < 0) /* Ignore any ERRNOs we got. */
+ ret = 0;
+
+ return ret;
+}
+
+static int conn_khelper(struct drbd_connection *connection, char *cmd)
+{
+ char *envp[] = { "HOME=/",
+ "TERM=linux",
+ "PATH=/sbin:/usr/sbin:/bin:/usr/bin",
+ (char[20]) { }, /* address family */
+ (char[60]) { }, /* address */
+ NULL };
+ char *resource_name = connection->resource->name;
+ char *argv[] = {usermode_helper, cmd, resource_name, NULL };
+ int ret;
+
+ setup_khelper_env(connection, envp);
+ conn_md_sync(connection);
+
+ drbd_info(connection, "helper command: %s %s %s\n", usermode_helper, cmd, resource_name);
+ /* TODO: conn_bcast_event() ?? */
+
+ ret = call_usermodehelper(usermode_helper, argv, envp, UMH_WAIT_PROC);
+ if (ret)
+ drbd_warn(connection, "helper command: %s %s %s exit code %u (0x%x)\n",
+ usermode_helper, cmd, resource_name,
+ (ret >> 8) & 0xff, ret);
+ else
+ drbd_info(connection, "helper command: %s %s %s exit code %u (0x%x)\n",
+ usermode_helper, cmd, resource_name,
+ (ret >> 8) & 0xff, ret);
+ /* TODO: conn_bcast_event() ?? */
+
+ if (ret < 0) /* Ignore any ERRNOs we got. */
+ ret = 0;
+
+ return ret;
+}
+
+static enum drbd_fencing_p highest_fencing_policy(struct drbd_connection *connection)
+{
+ enum drbd_fencing_p fp = FP_NOT_AVAIL;
+ struct drbd_peer_device *peer_device;
+ int vnr;
+
+ rcu_read_lock();
+ idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+ struct drbd_device *device = peer_device->device;
+ if (get_ldev_if_state(device, D_CONSISTENT)) {
+ struct disk_conf *disk_conf =
+ rcu_dereference(peer_device->device->ldev->disk_conf);
+ fp = max_t(enum drbd_fencing_p, fp, disk_conf->fencing);
+ put_ldev(device);
+ }
+ }
+ rcu_read_unlock();
+
+ if (fp == FP_NOT_AVAIL) {
+ /* IO Suspending works on the whole resource.
+ Do it only for one device. */
+ vnr = 0;
+ peer_device = idr_get_next(&connection->peer_devices, &vnr);
+ drbd_change_state(peer_device->device, CS_VERBOSE | CS_HARD, NS(susp_fen, 0));
+ }
+
+ return fp;
+}
+
+bool conn_try_outdate_peer(struct drbd_connection *connection)
+{
+ unsigned int connect_cnt;
+ union drbd_state mask = { };
+ union drbd_state val = { };
+ enum drbd_fencing_p fp;
+ char *ex_to_string;
+ int r;
+
+ spin_lock_irq(&connection->resource->req_lock);
+ if (connection->cstate >= C_WF_REPORT_PARAMS) {
+ drbd_err(connection, "Expected cstate < C_WF_REPORT_PARAMS\n");
+ spin_unlock_irq(&connection->resource->req_lock);
+ return false;
+ }
+
+ connect_cnt = connection->connect_cnt;
+ spin_unlock_irq(&connection->resource->req_lock);
+
+ fp = highest_fencing_policy(connection);
+ switch (fp) {
+ case FP_NOT_AVAIL:
+ drbd_warn(connection, "Not fencing peer, I'm not even Consistent myself.\n");
+ goto out;
+ case FP_DONT_CARE:
+ return true;
+ default: ;
+ }
+
+ r = conn_khelper(connection, "fence-peer");
+
+ switch ((r>>8) & 0xff) {
+ case 3: /* peer is inconsistent */
+ ex_to_string = "peer is inconsistent or worse";
+ mask.pdsk = D_MASK;
+ val.pdsk = D_INCONSISTENT;
+ break;
+ case 4: /* peer got outdated, or was already outdated */
+ ex_to_string = "peer was fenced";
+ mask.pdsk = D_MASK;
+ val.pdsk = D_OUTDATED;
+ break;
+ case 5: /* peer was down */
+ if (conn_highest_disk(connection) == D_UP_TO_DATE) {
+ /* we will(have) create(d) a new UUID anyways... */
+ ex_to_string = "peer is unreachable, assumed to be dead";
+ mask.pdsk = D_MASK;
+ val.pdsk = D_OUTDATED;
+ } else {
+ ex_to_string = "peer unreachable, doing nothing since disk != UpToDate";
+ }
+ break;
+ case 6: /* Peer is primary, voluntarily outdate myself.
+ * This is useful when an unconnected R_SECONDARY is asked to
+ * become R_PRIMARY, but finds the other peer being active. */
+ ex_to_string = "peer is active";
+ drbd_warn(connection, "Peer is primary, outdating myself.\n");
+ mask.disk = D_MASK;
+ val.disk = D_OUTDATED;
+ break;
+ case 7:
+ if (fp != FP_STONITH)
+ drbd_err(connection, "fence-peer() = 7 && fencing != Stonith !!!\n");
+ ex_to_string = "peer was stonithed";
+ mask.pdsk = D_MASK;
+ val.pdsk = D_OUTDATED;
+ break;
+ default:
+ /* The script is broken ... */
+ drbd_err(connection, "fence-peer helper broken, returned %d\n", (r>>8)&0xff);
+ return false; /* Eventually leave IO frozen */
+ }
+
+ drbd_info(connection, "fence-peer helper returned %d (%s)\n",
+ (r>>8) & 0xff, ex_to_string);
+
+ out:
+
+ /* Not using
+ conn_request_state(connection, mask, val, CS_VERBOSE);
+ here, because we might were able to re-establish the connection in the
+ meantime. */
+ spin_lock_irq(&connection->resource->req_lock);
+ if (connection->cstate < C_WF_REPORT_PARAMS && !test_bit(STATE_SENT, &connection->flags)) {
+ if (connection->connect_cnt != connect_cnt)
+ /* In case the connection was established and droped
+ while the fence-peer handler was running, ignore it */
+ drbd_info(connection, "Ignoring fence-peer exit code\n");
+ else
+ _conn_request_state(connection, mask, val, CS_VERBOSE);
+ }
+ spin_unlock_irq(&connection->resource->req_lock);
+
+ return conn_highest_pdsk(connection) <= D_OUTDATED;
+}
+
+static int _try_outdate_peer_async(void *data)
+{
+ struct drbd_connection *connection = (struct drbd_connection *)data;
+
+ conn_try_outdate_peer(connection);
+
+ kref_put(&connection->kref, drbd_destroy_connection);
+ return 0;
+}
+
+void conn_try_outdate_peer_async(struct drbd_connection *connection)
+{
+ struct task_struct *opa;
+
+ kref_get(&connection->kref);
+ /* We may just have force_sig()'ed this thread
+ * to get it out of some blocking network function.
+ * Clear signals; otherwise kthread_run(), which internally uses
+ * wait_on_completion_killable(), will mistake our pending signal
+ * for a new fatal signal and fail. */
+ flush_signals(current);
+ opa = kthread_run(_try_outdate_peer_async, connection, "drbd_async_h");
+ if (IS_ERR(opa)) {
+ drbd_err(connection, "out of mem, failed to invoke fence-peer helper\n");
+ kref_put(&connection->kref, drbd_destroy_connection);
+ }
+}
+
+enum drbd_state_rv
+drbd_set_role(struct drbd_device *const device, enum drbd_role new_role, int force)
+{
+ struct drbd_peer_device *const peer_device = first_peer_device(device);
+ struct drbd_connection *const connection = peer_device ? peer_device->connection : NULL;
+ const int max_tries = 4;
+ enum drbd_state_rv rv = SS_UNKNOWN_ERROR;
+ struct net_conf *nc;
+ int try = 0;
+ int forced = 0;
+ union drbd_state mask, val;
+
+ if (new_role == R_PRIMARY) {
+ struct drbd_connection *connection;
+
+ /* Detect dead peers as soon as possible. */
+
+ rcu_read_lock();
+ for_each_connection(connection, device->resource)
+ request_ping(connection);
+ rcu_read_unlock();
+ }
+
+ mutex_lock(device->state_mutex);
+
+ mask.i = 0; mask.role = R_MASK;
+ val.i = 0; val.role = new_role;
+
+ while (try++ < max_tries) {
+ rv = _drbd_request_state_holding_state_mutex(device, mask, val, CS_WAIT_COMPLETE);
+
+ /* in case we first succeeded to outdate,
+ * but now suddenly could establish a connection */
+ if (rv == SS_CW_FAILED_BY_PEER && mask.pdsk != 0) {
+ val.pdsk = 0;
+ mask.pdsk = 0;
+ continue;
+ }
+
+ if (rv == SS_NO_UP_TO_DATE_DISK && force &&
+ (device->state.disk < D_UP_TO_DATE &&
+ device->state.disk >= D_INCONSISTENT)) {
+ mask.disk = D_MASK;
+ val.disk = D_UP_TO_DATE;
+ forced = 1;
+ continue;
+ }
+
+ if (rv == SS_NO_UP_TO_DATE_DISK &&
+ device->state.disk == D_CONSISTENT && mask.pdsk == 0) {
+ D_ASSERT(device, device->state.pdsk == D_UNKNOWN);
+
+ if (conn_try_outdate_peer(connection)) {
+ val.disk = D_UP_TO_DATE;
+ mask.disk = D_MASK;
+ }
+ continue;
+ }
+
+ if (rv == SS_NOTHING_TO_DO)
+ goto out;
+ if (rv == SS_PRIMARY_NOP && mask.pdsk == 0) {
+ if (!conn_try_outdate_peer(connection) && force) {
+ drbd_warn(device, "Forced into split brain situation!\n");
+ mask.pdsk = D_MASK;
+ val.pdsk = D_OUTDATED;
+
+ }
+ continue;
+ }
+ if (rv == SS_TWO_PRIMARIES) {
+ /* Maybe the peer is detected as dead very soon...
+ retry at most once more in this case. */
+ int timeo;
+ rcu_read_lock();
+ nc = rcu_dereference(connection->net_conf);
+ timeo = nc ? (nc->ping_timeo + 1) * HZ / 10 : 1;
+ rcu_read_unlock();
+ schedule_timeout_interruptible(timeo);
+ if (try < max_tries)
+ try = max_tries - 1;
+ continue;
+ }
+ if (rv < SS_SUCCESS) {
+ rv = _drbd_request_state(device, mask, val,
+ CS_VERBOSE + CS_WAIT_COMPLETE);
+ if (rv < SS_SUCCESS)
+ goto out;
+ }
+ break;
+ }
+
+ if (rv < SS_SUCCESS)
+ goto out;
+
+ if (forced)
+ drbd_warn(device, "Forced to consider local data as UpToDate!\n");
+
+ /* Wait until nothing is on the fly :) */
+ wait_event(device->misc_wait, atomic_read(&device->ap_pending_cnt) == 0);
+
+ /* FIXME also wait for all pending P_BARRIER_ACK? */
+
+ if (new_role == R_SECONDARY) {
+ if (get_ldev(device)) {
+ device->ldev->md.uuid[UI_CURRENT] &= ~(u64)1;
+ put_ldev(device);
+ }
+ } else {
+ mutex_lock(&device->resource->conf_update);
+ nc = connection->net_conf;
+ if (nc)
+ nc->discard_my_data = 0; /* without copy; single bit op is atomic */
+ mutex_unlock(&device->resource->conf_update);
+
+ if (get_ldev(device)) {
+ if (((device->state.conn < C_CONNECTED ||
+ device->state.pdsk <= D_FAILED)
+ && device->ldev->md.uuid[UI_BITMAP] == 0) || forced)
+ drbd_uuid_new_current(device);
+
+ device->ldev->md.uuid[UI_CURRENT] |= (u64)1;
+ put_ldev(device);
+ }
+ }
+
+ /* writeout of activity log covered areas of the bitmap
+ * to stable storage done in after state change already */
+
+ if (device->state.conn >= C_WF_REPORT_PARAMS) {
+ /* if this was forced, we should consider sync */
+ if (forced)
+ drbd_send_uuids(peer_device);
+ drbd_send_current_state(peer_device);
+ }
+
+ drbd_md_sync(device);
+ set_disk_ro(device->vdisk, new_role == R_SECONDARY);
+ kobject_uevent(&disk_to_dev(device->vdisk)->kobj, KOBJ_CHANGE);
+out:
+ mutex_unlock(device->state_mutex);
+ return rv;
+}
+
+static const char *from_attrs_err_to_txt(int err)
+{
+ return err == -ENOMSG ? "required attribute missing" :
+ err == -EOPNOTSUPP ? "unknown mandatory attribute" :
+ err == -EEXIST ? "can not change invariant setting" :
+ "invalid attribute value";
+}
+
+int drbd_adm_set_role(struct sk_buff *skb, struct genl_info *info)
+{
+ struct drbd_config_context adm_ctx;
+ struct set_role_parms parms;
+ int err;
+ enum drbd_ret_code retcode;
+
+ retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto out;
+
+ memset(&parms, 0, sizeof(parms));
+ if (info->attrs[DRBD_NLA_SET_ROLE_PARMS]) {
+ err = set_role_parms_from_attrs(&parms, info);
+ if (err) {
+ retcode = ERR_MANDATORY_TAG;
+ drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err));
+ goto out;
+ }
+ }
+ genl_unlock();
+ mutex_lock(&adm_ctx.resource->adm_mutex);
+
+ if (info->genlhdr->cmd == DRBD_ADM_PRIMARY)
+ retcode = drbd_set_role(adm_ctx.device, R_PRIMARY, parms.assume_uptodate);
+ else
+ retcode = drbd_set_role(adm_ctx.device, R_SECONDARY, 0);
+
+ mutex_unlock(&adm_ctx.resource->adm_mutex);
+ genl_lock();
+out:
+ drbd_adm_finish(&adm_ctx, info, retcode);
+ return 0;
+}
+
+/* Initializes the md.*_offset members, so we are able to find
+ * the on disk meta data.
+ *
+ * We currently have two possible layouts:
+ * external:
+ * |----------- md_size_sect ------------------|
+ * [ 4k superblock ][ activity log ][ Bitmap ]
+ * | al_offset == 8 |
+ * | bm_offset = al_offset + X |
+ * ==> bitmap sectors = md_size_sect - bm_offset
+ *
+ * internal:
+ * |----------- md_size_sect ------------------|
+ * [data.....][ Bitmap ][ activity log ][ 4k superblock ]
+ * | al_offset < 0 |
+ * | bm_offset = al_offset - Y |
+ * ==> bitmap sectors = Y = al_offset - bm_offset
+ *
+ * Activity log size used to be fixed 32kB,
+ * but is about to become configurable.
+ */
+static void drbd_md_set_sector_offsets(struct drbd_device *device,
+ struct drbd_backing_dev *bdev)
+{
+ sector_t md_size_sect = 0;
+ unsigned int al_size_sect = bdev->md.al_size_4k * 8;
+
+ bdev->md.md_offset = drbd_md_ss(bdev);
+
+ switch (bdev->md.meta_dev_idx) {
+ default:
+ /* v07 style fixed size indexed meta data */
+ bdev->md.md_size_sect = MD_128MB_SECT;
+ bdev->md.al_offset = MD_4kB_SECT;
+ bdev->md.bm_offset = MD_4kB_SECT + al_size_sect;
+ break;
+ case DRBD_MD_INDEX_FLEX_EXT:
+ /* just occupy the full device; unit: sectors */
+ bdev->md.md_size_sect = drbd_get_capacity(bdev->md_bdev);
+ bdev->md.al_offset = MD_4kB_SECT;
+ bdev->md.bm_offset = MD_4kB_SECT + al_size_sect;
+ break;
+ case DRBD_MD_INDEX_INTERNAL:
+ case DRBD_MD_INDEX_FLEX_INT:
+ /* al size is still fixed */
+ bdev->md.al_offset = -al_size_sect;
+ /* we need (slightly less than) ~ this much bitmap sectors: */
+ md_size_sect = drbd_get_capacity(bdev->backing_bdev);
+ md_size_sect = ALIGN(md_size_sect, BM_SECT_PER_EXT);
+ md_size_sect = BM_SECT_TO_EXT(md_size_sect);
+ md_size_sect = ALIGN(md_size_sect, 8);
+
+ /* plus the "drbd meta data super block",
+ * and the activity log; */
+ md_size_sect += MD_4kB_SECT + al_size_sect;
+
+ bdev->md.md_size_sect = md_size_sect;
+ /* bitmap offset is adjusted by 'super' block size */
+ bdev->md.bm_offset = -md_size_sect + MD_4kB_SECT;
+ break;
+ }
+}
+
+/* input size is expected to be in KB */
+char *ppsize(char *buf, unsigned long long size)
+{
+ /* Needs 9 bytes at max including trailing NUL:
+ * -1ULL ==> "16384 EB" */
+ static char units[] = { 'K', 'M', 'G', 'T', 'P', 'E' };
+ int base = 0;
+ while (size >= 10000 && base < sizeof(units)-1) {
+ /* shift + round */
+ size = (size >> 10) + !!(size & (1<<9));
+ base++;
+ }
+ sprintf(buf, "%u %cB", (unsigned)size, units[base]);
+
+ return buf;
+}
+
+/* there is still a theoretical deadlock when called from receiver
+ * on an D_INCONSISTENT R_PRIMARY:
+ * remote READ does inc_ap_bio, receiver would need to receive answer
+ * packet from remote to dec_ap_bio again.
+ * receiver receive_sizes(), comes here,
+ * waits for ap_bio_cnt == 0. -> deadlock.
+ * but this cannot happen, actually, because:
+ * R_PRIMARY D_INCONSISTENT, and peer's disk is unreachable
+ * (not connected, or bad/no disk on peer):
+ * see drbd_fail_request_early, ap_bio_cnt is zero.
+ * R_PRIMARY D_INCONSISTENT, and C_SYNC_TARGET:
+ * peer may not initiate a resize.
+ */
+/* Note these are not to be confused with
+ * drbd_adm_suspend_io/drbd_adm_resume_io,
+ * which are (sub) state changes triggered by admin (drbdsetup),
+ * and can be long lived.
+ * This changes an device->flag, is triggered by drbd internals,
+ * and should be short-lived. */
+void drbd_suspend_io(struct drbd_device *device)
+{
+ set_bit(SUSPEND_IO, &device->flags);
+ if (drbd_suspended(device))
+ return;
+ wait_event(device->misc_wait, !atomic_read(&device->ap_bio_cnt));
+}
+
+void drbd_resume_io(struct drbd_device *device)
+{
+ clear_bit(SUSPEND_IO, &device->flags);
+ wake_up(&device->misc_wait);
+}
+
+/**
+ * drbd_determine_dev_size() - Sets the right device size obeying all constraints
+ * @device: DRBD device.
+ *
+ * Returns 0 on success, negative return values indicate errors.
+ * You should call drbd_md_sync() after calling this function.
+ */
+enum determine_dev_size
+drbd_determine_dev_size(struct drbd_device *device, enum dds_flags flags, struct resize_parms *rs) __must_hold(local)
+{
+ sector_t prev_first_sect, prev_size; /* previous meta location */
+ sector_t la_size_sect, u_size;
+ struct drbd_md *md = &device->ldev->md;
+ u32 prev_al_stripe_size_4k;
+ u32 prev_al_stripes;
+ sector_t size;
+ char ppb[10];
+ void *buffer;
+
+ int md_moved, la_size_changed;
+ enum determine_dev_size rv = DS_UNCHANGED;
+
+ /* race:
+ * application request passes inc_ap_bio,
+ * but then cannot get an AL-reference.
+ * this function later may wait on ap_bio_cnt == 0. -> deadlock.
+ *
+ * to avoid that:
+ * Suspend IO right here.
+ * still lock the act_log to not trigger ASSERTs there.
+ */
+ drbd_suspend_io(device);
+ buffer = drbd_md_get_buffer(device, __func__); /* Lock meta-data IO */
+ if (!buffer) {
+ drbd_resume_io(device);
+ return DS_ERROR;
+ }
+
+ /* no wait necessary anymore, actually we could assert that */
+ wait_event(device->al_wait, lc_try_lock(device->act_log));
+
+ prev_first_sect = drbd_md_first_sector(device->ldev);
+ prev_size = device->ldev->md.md_size_sect;
+ la_size_sect = device->ldev->md.la_size_sect;
+
+ if (rs) {
+ /* rs is non NULL if we should change the AL layout only */
+
+ prev_al_stripes = md->al_stripes;
+ prev_al_stripe_size_4k = md->al_stripe_size_4k;
+
+ md->al_stripes = rs->al_stripes;
+ md->al_stripe_size_4k = rs->al_stripe_size / 4;
+ md->al_size_4k = (u64)rs->al_stripes * rs->al_stripe_size / 4;
+ }
+
+ drbd_md_set_sector_offsets(device, device->ldev);
+
+ rcu_read_lock();
+ u_size = rcu_dereference(device->ldev->disk_conf)->disk_size;
+ rcu_read_unlock();
+ size = drbd_new_dev_size(device, device->ldev, u_size, flags & DDSF_FORCED);
+
+ if (size < la_size_sect) {
+ if (rs && u_size == 0) {
+ /* Remove "rs &&" later. This check should always be active, but
+ right now the receiver expects the permissive behavior */
+ drbd_warn(device, "Implicit shrink not allowed. "
+ "Use --size=%llus for explicit shrink.\n",
+ (unsigned long long)size);
+ rv = DS_ERROR_SHRINK;
+ }
+ if (u_size > size)
+ rv = DS_ERROR_SPACE_MD;
+ if (rv != DS_UNCHANGED)
+ goto err_out;
+ }
+
+ if (drbd_get_capacity(device->this_bdev) != size ||
+ drbd_bm_capacity(device) != size) {
+ int err;
+ err = drbd_bm_resize(device, size, !(flags & DDSF_NO_RESYNC));
+ if (unlikely(err)) {
+ /* currently there is only one error: ENOMEM! */
+ size = drbd_bm_capacity(device)>>1;
+ if (size == 0) {
+ drbd_err(device, "OUT OF MEMORY! "
+ "Could not allocate bitmap!\n");
+ } else {
+ drbd_err(device, "BM resizing failed. "
+ "Leaving size unchanged at size = %lu KB\n",
+ (unsigned long)size);
+ }
+ rv = DS_ERROR;
+ }
+ /* racy, see comments above. */
+ drbd_set_my_capacity(device, size);
+ device->ldev->md.la_size_sect = size;
+ drbd_info(device, "size = %s (%llu KB)\n", ppsize(ppb, size>>1),
+ (unsigned long long)size>>1);
+ }
+ if (rv <= DS_ERROR)
+ goto err_out;
+
+ la_size_changed = (la_size_sect != device->ldev->md.la_size_sect);
+
+ md_moved = prev_first_sect != drbd_md_first_sector(device->ldev)
+ || prev_size != device->ldev->md.md_size_sect;
+
+ if (la_size_changed || md_moved || rs) {
+ u32 prev_flags;
+
+ /* We do some synchronous IO below, which may take some time.
+ * Clear the timer, to avoid scary "timer expired!" messages,
+ * "Superblock" is written out at least twice below, anyways. */
+ del_timer(&device->md_sync_timer);
+ drbd_al_shrink(device); /* All extents inactive. */
+
+ prev_flags = md->flags;
+ md->flags &= ~MDF_PRIMARY_IND;
+ drbd_md_write(device, buffer);
+
+ drbd_info(device, "Writing the whole bitmap, %s\n",
+ la_size_changed && md_moved ? "size changed and md moved" :
+ la_size_changed ? "size changed" : "md moved");
+ /* next line implicitly does drbd_suspend_io()+drbd_resume_io() */
+ drbd_bitmap_io(device, md_moved ? &drbd_bm_write_all : &drbd_bm_write,
+ "size changed", BM_LOCKED_MASK);
+ drbd_initialize_al(device, buffer);
+
+ md->flags = prev_flags;
+ drbd_md_write(device, buffer);
+
+ if (rs)
+ drbd_info(device, "Changed AL layout to al-stripes = %d, al-stripe-size-kB = %d\n",
+ md->al_stripes, md->al_stripe_size_4k * 4);
+ }
+
+ if (size > la_size_sect)
+ rv = la_size_sect ? DS_GREW : DS_GREW_FROM_ZERO;
+ if (size < la_size_sect)
+ rv = DS_SHRUNK;
+
+ if (0) {
+ err_out:
+ if (rs) {
+ md->al_stripes = prev_al_stripes;
+ md->al_stripe_size_4k = prev_al_stripe_size_4k;
+ md->al_size_4k = (u64)prev_al_stripes * prev_al_stripe_size_4k;
+
+ drbd_md_set_sector_offsets(device, device->ldev);
+ }
+ }
+ lc_unlock(device->act_log);
+ wake_up(&device->al_wait);
+ drbd_md_put_buffer(device);
+ drbd_resume_io(device);
+
+ return rv;
+}
+
+sector_t
+drbd_new_dev_size(struct drbd_device *device, struct drbd_backing_dev *bdev,
+ sector_t u_size, int assume_peer_has_space)
+{
+ sector_t p_size = device->p_size; /* partner's disk size. */
+ sector_t la_size_sect = bdev->md.la_size_sect; /* last agreed size. */
+ sector_t m_size; /* my size */
+ sector_t size = 0;
+
+ m_size = drbd_get_max_capacity(bdev);
+
+ if (device->state.conn < C_CONNECTED && assume_peer_has_space) {
+ drbd_warn(device, "Resize while not connected was forced by the user!\n");
+ p_size = m_size;
+ }
+
+ if (p_size && m_size) {
+ size = min_t(sector_t, p_size, m_size);
+ } else {
+ if (la_size_sect) {
+ size = la_size_sect;
+ if (m_size && m_size < size)
+ size = m_size;
+ if (p_size && p_size < size)
+ size = p_size;
+ } else {
+ if (m_size)
+ size = m_size;
+ if (p_size)
+ size = p_size;
+ }
+ }
+
+ if (size == 0)
+ drbd_err(device, "Both nodes diskless!\n");
+
+ if (u_size) {
+ if (u_size > size)
+ drbd_err(device, "Requested disk size is too big (%lu > %lu)\n",
+ (unsigned long)u_size>>1, (unsigned long)size>>1);
+ else
+ size = u_size;
+ }
+
+ return size;
+}
+
+/**
+ * drbd_check_al_size() - Ensures that the AL is of the right size
+ * @device: DRBD device.
+ *
+ * Returns -EBUSY if current al lru is still used, -ENOMEM when allocation
+ * failed, and 0 on success. You should call drbd_md_sync() after you called
+ * this function.
+ */
+static int drbd_check_al_size(struct drbd_device *device, struct disk_conf *dc)
+{
+ struct lru_cache *n, *t;
+ struct lc_element *e;
+ unsigned int in_use;
+ int i;
+
+ if (device->act_log &&
+ device->act_log->nr_elements == dc->al_extents)
+ return 0;
+
+ in_use = 0;
+ t = device->act_log;
+ n = lc_create("act_log", drbd_al_ext_cache, AL_UPDATES_PER_TRANSACTION,
+ dc->al_extents, sizeof(struct lc_element), 0);
+
+ if (n == NULL) {
+ drbd_err(device, "Cannot allocate act_log lru!\n");
+ return -ENOMEM;
+ }
+ spin_lock_irq(&device->al_lock);
+ if (t) {
+ for (i = 0; i < t->nr_elements; i++) {
+ e = lc_element_by_index(t, i);
+ if (e->refcnt)
+ drbd_err(device, "refcnt(%d)==%d\n",
+ e->lc_number, e->refcnt);
+ in_use += e->refcnt;
+ }
+ }
+ if (!in_use)
+ device->act_log = n;
+ spin_unlock_irq(&device->al_lock);
+ if (in_use) {
+ drbd_err(device, "Activity log still in use!\n");
+ lc_destroy(n);
+ return -EBUSY;
+ } else {
+ if (t)
+ lc_destroy(t);
+ }
+ drbd_md_mark_dirty(device); /* we changed device->act_log->nr_elemens */
+ return 0;
+}
+
+static void drbd_setup_queue_param(struct drbd_device *device, struct drbd_backing_dev *bdev,
+ unsigned int max_bio_size)
+{
+ struct request_queue * const q = device->rq_queue;
+ unsigned int max_hw_sectors = max_bio_size >> 9;
+ unsigned int max_segments = 0;
+ struct request_queue *b = NULL;
+
+ if (bdev) {
+ b = bdev->backing_bdev->bd_disk->queue;
+
+ max_hw_sectors = min(queue_max_hw_sectors(b), max_bio_size >> 9);
+ rcu_read_lock();
+ max_segments = rcu_dereference(device->ldev->disk_conf)->max_bio_bvecs;
+ rcu_read_unlock();
+
+ blk_set_stacking_limits(&q->limits);
+ blk_queue_max_write_same_sectors(q, 0);
+ }
+
+ blk_queue_logical_block_size(q, 512);
+ blk_queue_max_hw_sectors(q, max_hw_sectors);
+ /* This is the workaround for "bio would need to, but cannot, be split" */
+ blk_queue_max_segments(q, max_segments ? max_segments : BLK_MAX_SEGMENTS);
+ blk_queue_segment_boundary(q, PAGE_CACHE_SIZE-1);
+
+ if (b) {
+ struct drbd_connection *connection = first_peer_device(device)->connection;
+
+ if (blk_queue_discard(b) &&
+ (connection->cstate < C_CONNECTED || connection->agreed_features & FF_TRIM)) {
+ /* For now, don't allow more than one activity log extent worth of data
+ * to be discarded in one go. We may need to rework drbd_al_begin_io()
+ * to allow for even larger discard ranges */
+ q->limits.max_discard_sectors = DRBD_MAX_DISCARD_SECTORS;
+
+ queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
+ /* REALLY? Is stacking secdiscard "legal"? */
+ if (blk_queue_secdiscard(b))
+ queue_flag_set_unlocked(QUEUE_FLAG_SECDISCARD, q);
+ } else {
+ q->limits.max_discard_sectors = 0;
+ queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, q);
+ queue_flag_clear_unlocked(QUEUE_FLAG_SECDISCARD, q);
+ }
+
+ blk_queue_stack_limits(q, b);
+
+ if (q->backing_dev_info.ra_pages != b->backing_dev_info.ra_pages) {
+ drbd_info(device, "Adjusting my ra_pages to backing device's (%lu -> %lu)\n",
+ q->backing_dev_info.ra_pages,
+ b->backing_dev_info.ra_pages);
+ q->backing_dev_info.ra_pages = b->backing_dev_info.ra_pages;
+ }
+ }
+}
+
+void drbd_reconsider_max_bio_size(struct drbd_device *device, struct drbd_backing_dev *bdev)
+{
+ unsigned int now, new, local, peer;
+
+ now = queue_max_hw_sectors(device->rq_queue) << 9;
+ local = device->local_max_bio_size; /* Eventually last known value, from volatile memory */
+ peer = device->peer_max_bio_size; /* Eventually last known value, from meta data */
+
+ if (bdev) {
+ local = queue_max_hw_sectors(bdev->backing_bdev->bd_disk->queue) << 9;
+ device->local_max_bio_size = local;
+ }
+ local = min(local, DRBD_MAX_BIO_SIZE);
+
+ /* We may ignore peer limits if the peer is modern enough.
+ Because new from 8.3.8 onwards the peer can use multiple
+ BIOs for a single peer_request */
+ if (device->state.conn >= C_WF_REPORT_PARAMS) {
+ if (first_peer_device(device)->connection->agreed_pro_version < 94)
+ peer = min(device->peer_max_bio_size, DRBD_MAX_SIZE_H80_PACKET);
+ /* Correct old drbd (up to 8.3.7) if it believes it can do more than 32KiB */
+ else if (first_peer_device(device)->connection->agreed_pro_version == 94)
+ peer = DRBD_MAX_SIZE_H80_PACKET;
+ else if (first_peer_device(device)->connection->agreed_pro_version < 100)
+ peer = DRBD_MAX_BIO_SIZE_P95; /* drbd 8.3.8 onwards, before 8.4.0 */
+ else
+ peer = DRBD_MAX_BIO_SIZE;
+
+ /* We may later detach and re-attach on a disconnected Primary.
+ * Avoid this setting to jump back in that case.
+ * We want to store what we know the peer DRBD can handle,
+ * not what the peer IO backend can handle. */
+ if (peer > device->peer_max_bio_size)
+ device->peer_max_bio_size = peer;
+ }
+ new = min(local, peer);
+
+ if (device->state.role == R_PRIMARY && new < now)
+ drbd_err(device, "ASSERT FAILED new < now; (%u < %u)\n", new, now);
+
+ if (new != now)
+ drbd_info(device, "max BIO size = %u\n", new);
+
+ drbd_setup_queue_param(device, bdev, new);
+}
+
+/* Starts the worker thread */
+static void conn_reconfig_start(struct drbd_connection *connection)
+{
+ drbd_thread_start(&connection->worker);
+ drbd_flush_workqueue(&connection->sender_work);
+}
+
+/* if still unconfigured, stops worker again. */
+static void conn_reconfig_done(struct drbd_connection *connection)
+{
+ bool stop_threads;
+ spin_lock_irq(&connection->resource->req_lock);
+ stop_threads = conn_all_vols_unconf(connection) &&
+ connection->cstate == C_STANDALONE;
+ spin_unlock_irq(&connection->resource->req_lock);
+ if (stop_threads) {
+ /* asender is implicitly stopped by receiver
+ * in conn_disconnect() */
+ drbd_thread_stop(&connection->receiver);
+ drbd_thread_stop(&connection->worker);
+ }
+}
+
+/* Make sure IO is suspended before calling this function(). */
+static void drbd_suspend_al(struct drbd_device *device)
+{
+ int s = 0;
+
+ if (!lc_try_lock(device->act_log)) {
+ drbd_warn(device, "Failed to lock al in drbd_suspend_al()\n");
+ return;
+ }
+
+ drbd_al_shrink(device);
+ spin_lock_irq(&device->resource->req_lock);
+ if (device->state.conn < C_CONNECTED)
+ s = !test_and_set_bit(AL_SUSPENDED, &device->flags);
+ spin_unlock_irq(&device->resource->req_lock);
+ lc_unlock(device->act_log);
+
+ if (s)
+ drbd_info(device, "Suspended AL updates\n");
+}
+
+
+static bool should_set_defaults(struct genl_info *info)
+{
+ unsigned flags = ((struct drbd_genlmsghdr*)info->userhdr)->flags;
+ return 0 != (flags & DRBD_GENL_F_SET_DEFAULTS);
+}
+
+static unsigned int drbd_al_extents_max(struct drbd_backing_dev *bdev)
+{
+ /* This is limited by 16 bit "slot" numbers,
+ * and by available on-disk context storage.
+ *
+ * Also (u16)~0 is special (denotes a "free" extent).
+ *
+ * One transaction occupies one 4kB on-disk block,
+ * we have n such blocks in the on disk ring buffer,
+ * the "current" transaction may fail (n-1),
+ * and there is 919 slot numbers context information per transaction.
+ *
+ * 72 transaction blocks amounts to more than 2**16 context slots,
+ * so cap there first.
+ */
+ const unsigned int max_al_nr = DRBD_AL_EXTENTS_MAX;
+ const unsigned int sufficient_on_disk =
+ (max_al_nr + AL_CONTEXT_PER_TRANSACTION -1)
+ /AL_CONTEXT_PER_TRANSACTION;
+
+ unsigned int al_size_4k = bdev->md.al_size_4k;
+
+ if (al_size_4k > sufficient_on_disk)
+ return max_al_nr;
+
+ return (al_size_4k - 1) * AL_CONTEXT_PER_TRANSACTION;
+}
+
+static bool write_ordering_changed(struct disk_conf *a, struct disk_conf *b)
+{
+ return a->disk_barrier != b->disk_barrier ||
+ a->disk_flushes != b->disk_flushes ||
+ a->disk_drain != b->disk_drain;
+}
+
+int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info)
+{
+ struct drbd_config_context adm_ctx;
+ enum drbd_ret_code retcode;
+ struct drbd_device *device;
+ struct disk_conf *new_disk_conf, *old_disk_conf;
+ struct fifo_buffer *old_plan = NULL, *new_plan = NULL;
+ int err, fifo_size;
+
+ retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto finish;
+
+ device = adm_ctx.device;
+ mutex_lock(&adm_ctx.resource->adm_mutex);
+
+ /* we also need a disk
+ * to change the options on */
+ if (!get_ldev(device)) {
+ retcode = ERR_NO_DISK;
+ goto out;
+ }
+
+ new_disk_conf = kmalloc(sizeof(struct disk_conf), GFP_KERNEL);
+ if (!new_disk_conf) {
+ retcode = ERR_NOMEM;
+ goto fail;
+ }
+
+ mutex_lock(&device->resource->conf_update);
+ old_disk_conf = device->ldev->disk_conf;
+ *new_disk_conf = *old_disk_conf;
+ if (should_set_defaults(info))
+ set_disk_conf_defaults(new_disk_conf);
+
+ err = disk_conf_from_attrs_for_change(new_disk_conf, info);
+ if (err && err != -ENOMSG) {
+ retcode = ERR_MANDATORY_TAG;
+ drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err));
+ goto fail_unlock;
+ }
+
+ if (!expect(new_disk_conf->resync_rate >= 1))
+ new_disk_conf->resync_rate = 1;
+
+ if (new_disk_conf->al_extents < DRBD_AL_EXTENTS_MIN)
+ new_disk_conf->al_extents = DRBD_AL_EXTENTS_MIN;
+ if (new_disk_conf->al_extents > drbd_al_extents_max(device->ldev))
+ new_disk_conf->al_extents = drbd_al_extents_max(device->ldev);
+
+ if (new_disk_conf->c_plan_ahead > DRBD_C_PLAN_AHEAD_MAX)
+ new_disk_conf->c_plan_ahead = DRBD_C_PLAN_AHEAD_MAX;
+
+ fifo_size = (new_disk_conf->c_plan_ahead * 10 * SLEEP_TIME) / HZ;
+ if (fifo_size != device->rs_plan_s->size) {
+ new_plan = fifo_alloc(fifo_size);
+ if (!new_plan) {
+ drbd_err(device, "kmalloc of fifo_buffer failed");
+ retcode = ERR_NOMEM;
+ goto fail_unlock;
+ }
+ }
+
+ drbd_suspend_io(device);
+ wait_event(device->al_wait, lc_try_lock(device->act_log));
+ drbd_al_shrink(device);
+ err = drbd_check_al_size(device, new_disk_conf);
+ lc_unlock(device->act_log);
+ wake_up(&device->al_wait);
+ drbd_resume_io(device);
+
+ if (err) {
+ retcode = ERR_NOMEM;
+ goto fail_unlock;
+ }
+
+ write_lock_irq(&global_state_lock);
+ retcode = drbd_resync_after_valid(device, new_disk_conf->resync_after);
+ if (retcode == NO_ERROR) {
+ rcu_assign_pointer(device->ldev->disk_conf, new_disk_conf);
+ drbd_resync_after_changed(device);
+ }
+ write_unlock_irq(&global_state_lock);
+
+ if (retcode != NO_ERROR)
+ goto fail_unlock;
+
+ if (new_plan) {
+ old_plan = device->rs_plan_s;
+ rcu_assign_pointer(device->rs_plan_s, new_plan);
+ }
+
+ mutex_unlock(&device->resource->conf_update);
+
+ if (new_disk_conf->al_updates)
+ device->ldev->md.flags &= ~MDF_AL_DISABLED;
+ else
+ device->ldev->md.flags |= MDF_AL_DISABLED;
+
+ if (new_disk_conf->md_flushes)
+ clear_bit(MD_NO_FUA, &device->flags);
+ else
+ set_bit(MD_NO_FUA, &device->flags);
+
+ if (write_ordering_changed(old_disk_conf, new_disk_conf))
+ drbd_bump_write_ordering(device->resource, NULL, WO_bdev_flush);
+
+ drbd_md_sync(device);
+
+ if (device->state.conn >= C_CONNECTED) {
+ struct drbd_peer_device *peer_device;
+
+ for_each_peer_device(peer_device, device)
+ drbd_send_sync_param(peer_device);
+ }
+
+ synchronize_rcu();
+ kfree(old_disk_conf);
+ kfree(old_plan);
+ mod_timer(&device->request_timer, jiffies + HZ);
+ goto success;
+
+fail_unlock:
+ mutex_unlock(&device->resource->conf_update);
+ fail:
+ kfree(new_disk_conf);
+ kfree(new_plan);
+success:
+ put_ldev(device);
+ out:
+ mutex_unlock(&adm_ctx.resource->adm_mutex);
+ finish:
+ drbd_adm_finish(&adm_ctx, info, retcode);
+ return 0;
+}
+
+int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
+{
+ struct drbd_config_context adm_ctx;
+ struct drbd_device *device;
+ struct drbd_peer_device *peer_device;
+ struct drbd_connection *connection;
+ int err;
+ enum drbd_ret_code retcode;
+ enum determine_dev_size dd;
+ sector_t max_possible_sectors;
+ sector_t min_md_device_sectors;
+ struct drbd_backing_dev *nbc = NULL; /* new_backing_conf */
+ struct disk_conf *new_disk_conf = NULL;
+ struct block_device *bdev;
+ struct lru_cache *resync_lru = NULL;
+ struct fifo_buffer *new_plan = NULL;
+ union drbd_state ns, os;
+ enum drbd_state_rv rv;
+ struct net_conf *nc;
+
+ retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto finish;
+
+ device = adm_ctx.device;
+ mutex_lock(&adm_ctx.resource->adm_mutex);
+ peer_device = first_peer_device(device);
+ connection = peer_device ? peer_device->connection : NULL;
+ conn_reconfig_start(connection);
+
+ /* if you want to reconfigure, please tear down first */
+ if (device->state.disk > D_DISKLESS) {
+ retcode = ERR_DISK_CONFIGURED;
+ goto fail;
+ }
+ /* It may just now have detached because of IO error. Make sure
+ * drbd_ldev_destroy is done already, we may end up here very fast,
+ * e.g. if someone calls attach from the on-io-error handler,
+ * to realize a "hot spare" feature (not that I'd recommend that) */
+ wait_event(device->misc_wait, !test_bit(GOING_DISKLESS, &device->flags));
+
+ /* make sure there is no leftover from previous force-detach attempts */
+ clear_bit(FORCE_DETACH, &device->flags);
+ clear_bit(WAS_IO_ERROR, &device->flags);
+ clear_bit(WAS_READ_ERROR, &device->flags);
+
+ /* and no leftover from previously aborted resync or verify, either */
+ device->rs_total = 0;
+ device->rs_failed = 0;
+ atomic_set(&device->rs_pending_cnt, 0);
+
+ /* allocation not in the IO path, drbdsetup context */
+ nbc = kzalloc(sizeof(struct drbd_backing_dev), GFP_KERNEL);
+ if (!nbc) {
+ retcode = ERR_NOMEM;
+ goto fail;
+ }
+ spin_lock_init(&nbc->md.uuid_lock);
+
+ new_disk_conf = kzalloc(sizeof(struct disk_conf), GFP_KERNEL);
+ if (!new_disk_conf) {
+ retcode = ERR_NOMEM;
+ goto fail;
+ }
+ nbc->disk_conf = new_disk_conf;
+
+ set_disk_conf_defaults(new_disk_conf);
+ err = disk_conf_from_attrs(new_disk_conf, info);
+ if (err) {
+ retcode = ERR_MANDATORY_TAG;
+ drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err));
+ goto fail;
+ }
+
+ if (new_disk_conf->c_plan_ahead > DRBD_C_PLAN_AHEAD_MAX)
+ new_disk_conf->c_plan_ahead = DRBD_C_PLAN_AHEAD_MAX;
+
+ new_plan = fifo_alloc((new_disk_conf->c_plan_ahead * 10 * SLEEP_TIME) / HZ);
+ if (!new_plan) {
+ retcode = ERR_NOMEM;
+ goto fail;
+ }
+
+ if (new_disk_conf->meta_dev_idx < DRBD_MD_INDEX_FLEX_INT) {
+ retcode = ERR_MD_IDX_INVALID;
+ goto fail;
+ }
+
+ write_lock_irq(&global_state_lock);
+ retcode = drbd_resync_after_valid(device, new_disk_conf->resync_after);
+ write_unlock_irq(&global_state_lock);
+ if (retcode != NO_ERROR)
+ goto fail;
+
+ rcu_read_lock();
+ nc = rcu_dereference(connection->net_conf);
+ if (nc) {
+ if (new_disk_conf->fencing == FP_STONITH && nc->wire_protocol == DRBD_PROT_A) {
+ rcu_read_unlock();
+ retcode = ERR_STONITH_AND_PROT_A;
+ goto fail;
+ }
+ }
+ rcu_read_unlock();
+
+ bdev = blkdev_get_by_path(new_disk_conf->backing_dev,
+ FMODE_READ | FMODE_WRITE | FMODE_EXCL, device);
+ if (IS_ERR(bdev)) {
+ drbd_err(device, "open(\"%s\") failed with %ld\n", new_disk_conf->backing_dev,
+ PTR_ERR(bdev));
+ retcode = ERR_OPEN_DISK;
+ goto fail;
+ }
+ nbc->backing_bdev = bdev;
+
+ /*
+ * meta_dev_idx >= 0: external fixed size, possibly multiple
+ * drbd sharing one meta device. TODO in that case, paranoia
+ * check that [md_bdev, meta_dev_idx] is not yet used by some
+ * other drbd minor! (if you use drbd.conf + drbdadm, that
+ * should check it for you already; but if you don't, or
+ * someone fooled it, we need to double check here)
+ */
+ bdev = blkdev_get_by_path(new_disk_conf->meta_dev,
+ FMODE_READ | FMODE_WRITE | FMODE_EXCL,
+ (new_disk_conf->meta_dev_idx < 0) ?
+ (void *)device : (void *)drbd_m_holder);
+ if (IS_ERR(bdev)) {
+ drbd_err(device, "open(\"%s\") failed with %ld\n", new_disk_conf->meta_dev,
+ PTR_ERR(bdev));
+ retcode = ERR_OPEN_MD_DISK;
+ goto fail;
+ }
+ nbc->md_bdev = bdev;
+
+ if ((nbc->backing_bdev == nbc->md_bdev) !=
+ (new_disk_conf->meta_dev_idx == DRBD_MD_INDEX_INTERNAL ||
+ new_disk_conf->meta_dev_idx == DRBD_MD_INDEX_FLEX_INT)) {
+ retcode = ERR_MD_IDX_INVALID;
+ goto fail;
+ }
+
+ resync_lru = lc_create("resync", drbd_bm_ext_cache,
+ 1, 61, sizeof(struct bm_extent),
+ offsetof(struct bm_extent, lce));
+ if (!resync_lru) {
+ retcode = ERR_NOMEM;
+ goto fail;
+ }
+
+ /* Read our meta data super block early.
+ * This also sets other on-disk offsets. */
+ retcode = drbd_md_read(device, nbc);
+ if (retcode != NO_ERROR)
+ goto fail;
+
+ if (new_disk_conf->al_extents < DRBD_AL_EXTENTS_MIN)
+ new_disk_conf->al_extents = DRBD_AL_EXTENTS_MIN;
+ if (new_disk_conf->al_extents > drbd_al_extents_max(nbc))
+ new_disk_conf->al_extents = drbd_al_extents_max(nbc);
+
+ if (drbd_get_max_capacity(nbc) < new_disk_conf->disk_size) {
+ drbd_err(device, "max capacity %llu smaller than disk size %llu\n",
+ (unsigned long long) drbd_get_max_capacity(nbc),
+ (unsigned long long) new_disk_conf->disk_size);
+ retcode = ERR_DISK_TOO_SMALL;
+ goto fail;
+ }
+
+ if (new_disk_conf->meta_dev_idx < 0) {
+ max_possible_sectors = DRBD_MAX_SECTORS_FLEX;
+ /* at least one MB, otherwise it does not make sense */
+ min_md_device_sectors = (2<<10);
+ } else {
+ max_possible_sectors = DRBD_MAX_SECTORS;
+ min_md_device_sectors = MD_128MB_SECT * (new_disk_conf->meta_dev_idx + 1);
+ }
+
+ if (drbd_get_capacity(nbc->md_bdev) < min_md_device_sectors) {
+ retcode = ERR_MD_DISK_TOO_SMALL;
+ drbd_warn(device, "refusing attach: md-device too small, "
+ "at least %llu sectors needed for this meta-disk type\n",
+ (unsigned long long) min_md_device_sectors);
+ goto fail;
+ }
+
+ /* Make sure the new disk is big enough
+ * (we may currently be R_PRIMARY with no local disk...) */
+ if (drbd_get_max_capacity(nbc) <
+ drbd_get_capacity(device->this_bdev)) {
+ retcode = ERR_DISK_TOO_SMALL;
+ goto fail;
+ }
+
+ nbc->known_size = drbd_get_capacity(nbc->backing_bdev);
+
+ if (nbc->known_size > max_possible_sectors) {
+ drbd_warn(device, "==> truncating very big lower level device "
+ "to currently maximum possible %llu sectors <==\n",
+ (unsigned long long) max_possible_sectors);
+ if (new_disk_conf->meta_dev_idx >= 0)
+ drbd_warn(device, "==>> using internal or flexible "
+ "meta data may help <<==\n");
+ }
+
+ drbd_suspend_io(device);
+ /* also wait for the last barrier ack. */
+ /* FIXME see also https://daiquiri.linbit/cgi-bin/bugzilla/show_bug.cgi?id=171
+ * We need a way to either ignore barrier acks for barriers sent before a device
+ * was attached, or a way to wait for all pending barrier acks to come in.
+ * As barriers are counted per resource,
+ * we'd need to suspend io on all devices of a resource.
+ */
+ wait_event(device->misc_wait, !atomic_read(&device->ap_pending_cnt) || drbd_suspended(device));
+ /* and for any other previously queued work */
+ drbd_flush_workqueue(&connection->sender_work);
+
+ rv = _drbd_request_state(device, NS(disk, D_ATTACHING), CS_VERBOSE);
+ retcode = rv; /* FIXME: Type mismatch. */
+ drbd_resume_io(device);
+ if (rv < SS_SUCCESS)
+ goto fail;
+
+ if (!get_ldev_if_state(device, D_ATTACHING))
+ goto force_diskless;
+
+ if (!device->bitmap) {
+ if (drbd_bm_init(device)) {
+ retcode = ERR_NOMEM;
+ goto force_diskless_dec;
+ }
+ }
+
+ if (device->state.conn < C_CONNECTED &&
+ device->state.role == R_PRIMARY && device->ed_uuid &&
+ (device->ed_uuid & ~((u64)1)) != (nbc->md.uuid[UI_CURRENT] & ~((u64)1))) {
+ drbd_err(device, "Can only attach to data with current UUID=%016llX\n",
+ (unsigned long long)device->ed_uuid);
+ retcode = ERR_DATA_NOT_CURRENT;
+ goto force_diskless_dec;
+ }
+
+ /* Since we are diskless, fix the activity log first... */
+ if (drbd_check_al_size(device, new_disk_conf)) {
+ retcode = ERR_NOMEM;
+ goto force_diskless_dec;
+ }
+
+ /* Prevent shrinking of consistent devices ! */
+ if (drbd_md_test_flag(nbc, MDF_CONSISTENT) &&
+ drbd_new_dev_size(device, nbc, nbc->disk_conf->disk_size, 0) < nbc->md.la_size_sect) {
+ drbd_warn(device, "refusing to truncate a consistent device\n");
+ retcode = ERR_DISK_TOO_SMALL;
+ goto force_diskless_dec;
+ }
+
+ /* Reset the "barriers don't work" bits here, then force meta data to
+ * be written, to ensure we determine if barriers are supported. */
+ if (new_disk_conf->md_flushes)
+ clear_bit(MD_NO_FUA, &device->flags);
+ else
+ set_bit(MD_NO_FUA, &device->flags);
+
+ /* Point of no return reached.
+ * Devices and memory are no longer released by error cleanup below.
+ * now device takes over responsibility, and the state engine should
+ * clean it up somewhere. */
+ D_ASSERT(device, device->ldev == NULL);
+ device->ldev = nbc;
+ device->resync = resync_lru;
+ device->rs_plan_s = new_plan;
+ nbc = NULL;
+ resync_lru = NULL;
+ new_disk_conf = NULL;
+ new_plan = NULL;
+
+ drbd_bump_write_ordering(device->resource, device->ldev, WO_bdev_flush);
+
+ if (drbd_md_test_flag(device->ldev, MDF_CRASHED_PRIMARY))
+ set_bit(CRASHED_PRIMARY, &device->flags);
+ else
+ clear_bit(CRASHED_PRIMARY, &device->flags);
+
+ if (drbd_md_test_flag(device->ldev, MDF_PRIMARY_IND) &&
+ !(device->state.role == R_PRIMARY && device->resource->susp_nod))
+ set_bit(CRASHED_PRIMARY, &device->flags);
+
+ device->send_cnt = 0;
+ device->recv_cnt = 0;
+ device->read_cnt = 0;
+ device->writ_cnt = 0;
+
+ drbd_reconsider_max_bio_size(device, device->ldev);
+
+ /* If I am currently not R_PRIMARY,
+ * but meta data primary indicator is set,
+ * I just now recover from a hard crash,
+ * and have been R_PRIMARY before that crash.
+ *
+ * Now, if I had no connection before that crash
+ * (have been degraded R_PRIMARY), chances are that
+ * I won't find my peer now either.
+ *
+ * In that case, and _only_ in that case,
+ * we use the degr-wfc-timeout instead of the default,
+ * so we can automatically recover from a crash of a
+ * degraded but active "cluster" after a certain timeout.
+ */
+ clear_bit(USE_DEGR_WFC_T, &device->flags);
+ if (device->state.role != R_PRIMARY &&
+ drbd_md_test_flag(device->ldev, MDF_PRIMARY_IND) &&
+ !drbd_md_test_flag(device->ldev, MDF_CONNECTED_IND))
+ set_bit(USE_DEGR_WFC_T, &device->flags);
+
+ dd = drbd_determine_dev_size(device, 0, NULL);
+ if (dd <= DS_ERROR) {
+ retcode = ERR_NOMEM_BITMAP;
+ goto force_diskless_dec;
+ } else if (dd == DS_GREW)
+ set_bit(RESYNC_AFTER_NEG, &device->flags);
+
+ if (drbd_md_test_flag(device->ldev, MDF_FULL_SYNC) ||
+ (test_bit(CRASHED_PRIMARY, &device->flags) &&
+ drbd_md_test_flag(device->ldev, MDF_AL_DISABLED))) {
+ drbd_info(device, "Assuming that all blocks are out of sync "
+ "(aka FullSync)\n");
+ if (drbd_bitmap_io(device, &drbd_bmio_set_n_write,
+ "set_n_write from attaching", BM_LOCKED_MASK)) {
+ retcode = ERR_IO_MD_DISK;
+ goto force_diskless_dec;
+ }
+ } else {
+ if (drbd_bitmap_io(device, &drbd_bm_read,
+ "read from attaching", BM_LOCKED_MASK)) {
+ retcode = ERR_IO_MD_DISK;
+ goto force_diskless_dec;
+ }
+ }
+
+ if (_drbd_bm_total_weight(device) == drbd_bm_bits(device))
+ drbd_suspend_al(device); /* IO is still suspended here... */
+
+ spin_lock_irq(&device->resource->req_lock);
+ os = drbd_read_state(device);
+ ns = os;
+ /* If MDF_CONSISTENT is not set go into inconsistent state,
+ otherwise investigate MDF_WasUpToDate...
+ If MDF_WAS_UP_TO_DATE is not set go into D_OUTDATED disk state,
+ otherwise into D_CONSISTENT state.
+ */
+ if (drbd_md_test_flag(device->ldev, MDF_CONSISTENT)) {
+ if (drbd_md_test_flag(device->ldev, MDF_WAS_UP_TO_DATE))
+ ns.disk = D_CONSISTENT;
+ else
+ ns.disk = D_OUTDATED;
+ } else {
+ ns.disk = D_INCONSISTENT;
+ }
+
+ if (drbd_md_test_flag(device->ldev, MDF_PEER_OUT_DATED))
+ ns.pdsk = D_OUTDATED;
+
+ rcu_read_lock();
+ if (ns.disk == D_CONSISTENT &&
+ (ns.pdsk == D_OUTDATED || rcu_dereference(device->ldev->disk_conf)->fencing == FP_DONT_CARE))
+ ns.disk = D_UP_TO_DATE;
+
+ /* All tests on MDF_PRIMARY_IND, MDF_CONNECTED_IND,
+ MDF_CONSISTENT and MDF_WAS_UP_TO_DATE must happen before
+ this point, because drbd_request_state() modifies these
+ flags. */
+
+ if (rcu_dereference(device->ldev->disk_conf)->al_updates)
+ device->ldev->md.flags &= ~MDF_AL_DISABLED;
+ else
+ device->ldev->md.flags |= MDF_AL_DISABLED;
+
+ rcu_read_unlock();
+
+ /* In case we are C_CONNECTED postpone any decision on the new disk
+ state after the negotiation phase. */
+ if (device->state.conn == C_CONNECTED) {
+ device->new_state_tmp.i = ns.i;
+ ns.i = os.i;
+ ns.disk = D_NEGOTIATING;
+
+ /* We expect to receive up-to-date UUIDs soon.
+ To avoid a race in receive_state, free p_uuid while
+ holding req_lock. I.e. atomic with the state change */
+ kfree(device->p_uuid);
+ device->p_uuid = NULL;
+ }
+
+ rv = _drbd_set_state(device, ns, CS_VERBOSE, NULL);
+ spin_unlock_irq(&device->resource->req_lock);
+
+ if (rv < SS_SUCCESS)
+ goto force_diskless_dec;
+
+ mod_timer(&device->request_timer, jiffies + HZ);
+
+ if (device->state.role == R_PRIMARY)
+ device->ldev->md.uuid[UI_CURRENT] |= (u64)1;
+ else
+ device->ldev->md.uuid[UI_CURRENT] &= ~(u64)1;
+
+ drbd_md_mark_dirty(device);
+ drbd_md_sync(device);
+
+ kobject_uevent(&disk_to_dev(device->vdisk)->kobj, KOBJ_CHANGE);
+ put_ldev(device);
+ conn_reconfig_done(connection);
+ mutex_unlock(&adm_ctx.resource->adm_mutex);
+ drbd_adm_finish(&adm_ctx, info, retcode);
+ return 0;
+
+ force_diskless_dec:
+ put_ldev(device);
+ force_diskless:
+ drbd_force_state(device, NS(disk, D_DISKLESS));
+ drbd_md_sync(device);
+ fail:
+ conn_reconfig_done(connection);
+ if (nbc) {
+ if (nbc->backing_bdev)
+ blkdev_put(nbc->backing_bdev,
+ FMODE_READ | FMODE_WRITE | FMODE_EXCL);
+ if (nbc->md_bdev)
+ blkdev_put(nbc->md_bdev,
+ FMODE_READ | FMODE_WRITE | FMODE_EXCL);
+ kfree(nbc);
+ }
+ kfree(new_disk_conf);
+ lc_destroy(resync_lru);
+ kfree(new_plan);
+ mutex_unlock(&adm_ctx.resource->adm_mutex);
+ finish:
+ drbd_adm_finish(&adm_ctx, info, retcode);
+ return 0;
+}
+
+static int adm_detach(struct drbd_device *device, int force)
+{
+ enum drbd_state_rv retcode;
+ int ret;
+
+ if (force) {
+ set_bit(FORCE_DETACH, &device->flags);
+ drbd_force_state(device, NS(disk, D_FAILED));
+ retcode = SS_SUCCESS;
+ goto out;
+ }
+
+ drbd_suspend_io(device); /* so no-one is stuck in drbd_al_begin_io */
+ drbd_md_get_buffer(device, __func__); /* make sure there is no in-flight meta-data IO */
+ retcode = drbd_request_state(device, NS(disk, D_FAILED));
+ drbd_md_put_buffer(device);
+ /* D_FAILED will transition to DISKLESS. */
+ ret = wait_event_interruptible(device->misc_wait,
+ device->state.disk != D_FAILED);
+ drbd_resume_io(device);
+ if ((int)retcode == (int)SS_IS_DISKLESS)
+ retcode = SS_NOTHING_TO_DO;
+ if (ret)
+ retcode = ERR_INTR;
+out:
+ return retcode;
+}
+
+/* Detaching the disk is a process in multiple stages. First we need to lock
+ * out application IO, in-flight IO, IO stuck in drbd_al_begin_io.
+ * Then we transition to D_DISKLESS, and wait for put_ldev() to return all
+ * internal references as well.
+ * Only then we have finally detached. */
+int drbd_adm_detach(struct sk_buff *skb, struct genl_info *info)
+{
+ struct drbd_config_context adm_ctx;
+ enum drbd_ret_code retcode;
+ struct detach_parms parms = { };
+ int err;
+
+ retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto out;
+
+ if (info->attrs[DRBD_NLA_DETACH_PARMS]) {
+ err = detach_parms_from_attrs(&parms, info);
+ if (err) {
+ retcode = ERR_MANDATORY_TAG;
+ drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err));
+ goto out;
+ }
+ }
+
+ mutex_lock(&adm_ctx.resource->adm_mutex);
+ retcode = adm_detach(adm_ctx.device, parms.force_detach);
+ mutex_unlock(&adm_ctx.resource->adm_mutex);
+out:
+ drbd_adm_finish(&adm_ctx, info, retcode);
+ return 0;
+}
+
+static bool conn_resync_running(struct drbd_connection *connection)
+{
+ struct drbd_peer_device *peer_device;
+ bool rv = false;
+ int vnr;
+
+ rcu_read_lock();
+ idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+ struct drbd_device *device = peer_device->device;
+ if (device->state.conn == C_SYNC_SOURCE ||
+ device->state.conn == C_SYNC_TARGET ||
+ device->state.conn == C_PAUSED_SYNC_S ||
+ device->state.conn == C_PAUSED_SYNC_T) {
+ rv = true;
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ return rv;
+}
+
+static bool conn_ov_running(struct drbd_connection *connection)
+{
+ struct drbd_peer_device *peer_device;
+ bool rv = false;
+ int vnr;
+
+ rcu_read_lock();
+ idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+ struct drbd_device *device = peer_device->device;
+ if (device->state.conn == C_VERIFY_S ||
+ device->state.conn == C_VERIFY_T) {
+ rv = true;
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ return rv;
+}
+
+static enum drbd_ret_code
+_check_net_options(struct drbd_connection *connection, struct net_conf *old_net_conf, struct net_conf *new_net_conf)
+{
+ struct drbd_peer_device *peer_device;
+ int i;
+
+ if (old_net_conf && connection->cstate == C_WF_REPORT_PARAMS && connection->agreed_pro_version < 100) {
+ if (new_net_conf->wire_protocol != old_net_conf->wire_protocol)
+ return ERR_NEED_APV_100;
+
+ if (new_net_conf->two_primaries != old_net_conf->two_primaries)
+ return ERR_NEED_APV_100;
+
+ if (strcmp(new_net_conf->integrity_alg, old_net_conf->integrity_alg))
+ return ERR_NEED_APV_100;
+ }
+
+ if (!new_net_conf->two_primaries &&
+ conn_highest_role(connection) == R_PRIMARY &&
+ conn_highest_peer(connection) == R_PRIMARY)
+ return ERR_NEED_ALLOW_TWO_PRI;
+
+ if (new_net_conf->two_primaries &&
+ (new_net_conf->wire_protocol != DRBD_PROT_C))
+ return ERR_NOT_PROTO_C;
+
+ idr_for_each_entry(&connection->peer_devices, peer_device, i) {
+ struct drbd_device *device = peer_device->device;
+ if (get_ldev(device)) {
+ enum drbd_fencing_p fp = rcu_dereference(device->ldev->disk_conf)->fencing;
+ put_ldev(device);
+ if (new_net_conf->wire_protocol == DRBD_PROT_A && fp == FP_STONITH)
+ return ERR_STONITH_AND_PROT_A;
+ }
+ if (device->state.role == R_PRIMARY && new_net_conf->discard_my_data)
+ return ERR_DISCARD_IMPOSSIBLE;
+ }
+
+ if (new_net_conf->on_congestion != OC_BLOCK && new_net_conf->wire_protocol != DRBD_PROT_A)
+ return ERR_CONG_NOT_PROTO_A;
+
+ return NO_ERROR;
+}
+
+static enum drbd_ret_code
+check_net_options(struct drbd_connection *connection, struct net_conf *new_net_conf)
+{
+ static enum drbd_ret_code rv;
+ struct drbd_peer_device *peer_device;
+ int i;
+
+ rcu_read_lock();
+ rv = _check_net_options(connection, rcu_dereference(connection->net_conf), new_net_conf);
+ rcu_read_unlock();
+
+ /* connection->peer_devices protected by genl_lock() here */
+ idr_for_each_entry(&connection->peer_devices, peer_device, i) {
+ struct drbd_device *device = peer_device->device;
+ if (!device->bitmap) {
+ if (drbd_bm_init(device))
+ return ERR_NOMEM;
+ }
+ }
+
+ return rv;
+}
+
+struct crypto {
+ struct crypto_hash *verify_tfm;
+ struct crypto_hash *csums_tfm;
+ struct crypto_hash *cram_hmac_tfm;
+ struct crypto_hash *integrity_tfm;
+};
+
+static int
+alloc_hash(struct crypto_hash **tfm, char *tfm_name, int err_alg)
+{
+ if (!tfm_name[0])
+ return NO_ERROR;
+
+ *tfm = crypto_alloc_hash(tfm_name, 0, CRYPTO_ALG_ASYNC);
+ if (IS_ERR(*tfm)) {
+ *tfm = NULL;
+ return err_alg;
+ }
+
+ return NO_ERROR;
+}
+
+static enum drbd_ret_code
+alloc_crypto(struct crypto *crypto, struct net_conf *new_net_conf)
+{
+ char hmac_name[CRYPTO_MAX_ALG_NAME];
+ enum drbd_ret_code rv;
+
+ rv = alloc_hash(&crypto->csums_tfm, new_net_conf->csums_alg,
+ ERR_CSUMS_ALG);
+ if (rv != NO_ERROR)
+ return rv;
+ rv = alloc_hash(&crypto->verify_tfm, new_net_conf->verify_alg,
+ ERR_VERIFY_ALG);
+ if (rv != NO_ERROR)
+ return rv;
+ rv = alloc_hash(&crypto->integrity_tfm, new_net_conf->integrity_alg,
+ ERR_INTEGRITY_ALG);
+ if (rv != NO_ERROR)
+ return rv;
+ if (new_net_conf->cram_hmac_alg[0] != 0) {
+ snprintf(hmac_name, CRYPTO_MAX_ALG_NAME, "hmac(%s)",
+ new_net_conf->cram_hmac_alg);
+
+ rv = alloc_hash(&crypto->cram_hmac_tfm, hmac_name,
+ ERR_AUTH_ALG);
+ }
+
+ return rv;
+}
+
+static void free_crypto(struct crypto *crypto)
+{
+ crypto_free_hash(crypto->cram_hmac_tfm);
+ crypto_free_hash(crypto->integrity_tfm);
+ crypto_free_hash(crypto->csums_tfm);
+ crypto_free_hash(crypto->verify_tfm);
+}
+
+int drbd_adm_net_opts(struct sk_buff *skb, struct genl_info *info)
+{
+ struct drbd_config_context adm_ctx;
+ enum drbd_ret_code retcode;
+ struct drbd_connection *connection;
+ struct net_conf *old_net_conf, *new_net_conf = NULL;
+ int err;
+ int ovr; /* online verify running */
+ int rsr; /* re-sync running */
+ struct crypto crypto = { };
+
+ retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_CONNECTION);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto finish;
+
+ connection = adm_ctx.connection;
+ mutex_lock(&adm_ctx.resource->adm_mutex);
+
+ new_net_conf = kzalloc(sizeof(struct net_conf), GFP_KERNEL);
+ if (!new_net_conf) {
+ retcode = ERR_NOMEM;
+ goto out;
+ }
+
+ conn_reconfig_start(connection);
+
+ mutex_lock(&connection->data.mutex);
+ mutex_lock(&connection->resource->conf_update);
+ old_net_conf = connection->net_conf;
+
+ if (!old_net_conf) {
+ drbd_msg_put_info(adm_ctx.reply_skb, "net conf missing, try connect");
+ retcode = ERR_INVALID_REQUEST;
+ goto fail;
+ }
+
+ *new_net_conf = *old_net_conf;
+ if (should_set_defaults(info))
+ set_net_conf_defaults(new_net_conf);
+
+ err = net_conf_from_attrs_for_change(new_net_conf, info);
+ if (err && err != -ENOMSG) {
+ retcode = ERR_MANDATORY_TAG;
+ drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err));
+ goto fail;
+ }
+
+ retcode = check_net_options(connection, new_net_conf);
+ if (retcode != NO_ERROR)
+ goto fail;
+
+ /* re-sync running */
+ rsr = conn_resync_running(connection);
+ if (rsr && strcmp(new_net_conf->csums_alg, old_net_conf->csums_alg)) {
+ retcode = ERR_CSUMS_RESYNC_RUNNING;
+ goto fail;
+ }
+
+ /* online verify running */
+ ovr = conn_ov_running(connection);
+ if (ovr && strcmp(new_net_conf->verify_alg, old_net_conf->verify_alg)) {
+ retcode = ERR_VERIFY_RUNNING;
+ goto fail;
+ }
+
+ retcode = alloc_crypto(&crypto, new_net_conf);
+ if (retcode != NO_ERROR)
+ goto fail;
+
+ rcu_assign_pointer(connection->net_conf, new_net_conf);
+
+ if (!rsr) {
+ crypto_free_hash(connection->csums_tfm);
+ connection->csums_tfm = crypto.csums_tfm;
+ crypto.csums_tfm = NULL;
+ }
+ if (!ovr) {
+ crypto_free_hash(connection->verify_tfm);
+ connection->verify_tfm = crypto.verify_tfm;
+ crypto.verify_tfm = NULL;
+ }
+
+ crypto_free_hash(connection->integrity_tfm);
+ connection->integrity_tfm = crypto.integrity_tfm;
+ if (connection->cstate >= C_WF_REPORT_PARAMS && connection->agreed_pro_version >= 100)
+ /* Do this without trying to take connection->data.mutex again. */
+ __drbd_send_protocol(connection, P_PROTOCOL_UPDATE);
+
+ crypto_free_hash(connection->cram_hmac_tfm);
+ connection->cram_hmac_tfm = crypto.cram_hmac_tfm;
+
+ mutex_unlock(&connection->resource->conf_update);
+ mutex_unlock(&connection->data.mutex);
+ synchronize_rcu();
+ kfree(old_net_conf);
+
+ if (connection->cstate >= C_WF_REPORT_PARAMS) {
+ struct drbd_peer_device *peer_device;
+ int vnr;
+
+ idr_for_each_entry(&connection->peer_devices, peer_device, vnr)
+ drbd_send_sync_param(peer_device);
+ }
+
+ goto done;
+
+ fail:
+ mutex_unlock(&connection->resource->conf_update);
+ mutex_unlock(&connection->data.mutex);
+ free_crypto(&crypto);
+ kfree(new_net_conf);
+ done:
+ conn_reconfig_done(connection);
+ out:
+ mutex_unlock(&adm_ctx.resource->adm_mutex);
+ finish:
+ drbd_adm_finish(&adm_ctx, info, retcode);
+ return 0;
+}
+
+int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info)
+{
+ struct drbd_config_context adm_ctx;
+ struct drbd_peer_device *peer_device;
+ struct net_conf *old_net_conf, *new_net_conf = NULL;
+ struct crypto crypto = { };
+ struct drbd_resource *resource;
+ struct drbd_connection *connection;
+ enum drbd_ret_code retcode;
+ int i;
+ int err;
+
+ retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_RESOURCE);
+
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto out;
+ if (!(adm_ctx.my_addr && adm_ctx.peer_addr)) {
+ drbd_msg_put_info(adm_ctx.reply_skb, "connection endpoint(s) missing");
+ retcode = ERR_INVALID_REQUEST;
+ goto out;
+ }
+
+ /* No need for _rcu here. All reconfiguration is
+ * strictly serialized on genl_lock(). We are protected against
+ * concurrent reconfiguration/addition/deletion */
+ for_each_resource(resource, &drbd_resources) {
+ for_each_connection(connection, resource) {
+ if (nla_len(adm_ctx.my_addr) == connection->my_addr_len &&
+ !memcmp(nla_data(adm_ctx.my_addr), &connection->my_addr,
+ connection->my_addr_len)) {
+ retcode = ERR_LOCAL_ADDR;
+ goto out;
+ }
+
+ if (nla_len(adm_ctx.peer_addr) == connection->peer_addr_len &&
+ !memcmp(nla_data(adm_ctx.peer_addr), &connection->peer_addr,
+ connection->peer_addr_len)) {
+ retcode = ERR_PEER_ADDR;
+ goto out;
+ }
+ }
+ }
+
+ mutex_lock(&adm_ctx.resource->adm_mutex);
+ connection = first_connection(adm_ctx.resource);
+ conn_reconfig_start(connection);
+
+ if (connection->cstate > C_STANDALONE) {
+ retcode = ERR_NET_CONFIGURED;
+ goto fail;
+ }
+
+ /* allocation not in the IO path, drbdsetup / netlink process context */
+ new_net_conf = kzalloc(sizeof(*new_net_conf), GFP_KERNEL);
+ if (!new_net_conf) {
+ retcode = ERR_NOMEM;
+ goto fail;
+ }
+
+ set_net_conf_defaults(new_net_conf);
+
+ err = net_conf_from_attrs(new_net_conf, info);
+ if (err && err != -ENOMSG) {
+ retcode = ERR_MANDATORY_TAG;
+ drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err));
+ goto fail;
+ }
+
+ retcode = check_net_options(connection, new_net_conf);
+ if (retcode != NO_ERROR)
+ goto fail;
+
+ retcode = alloc_crypto(&crypto, new_net_conf);
+ if (retcode != NO_ERROR)
+ goto fail;
+
+ ((char *)new_net_conf->shared_secret)[SHARED_SECRET_MAX-1] = 0;
+
+ drbd_flush_workqueue(&connection->sender_work);
+
+ mutex_lock(&adm_ctx.resource->conf_update);
+ old_net_conf = connection->net_conf;
+ if (old_net_conf) {
+ retcode = ERR_NET_CONFIGURED;
+ mutex_unlock(&adm_ctx.resource->conf_update);
+ goto fail;
+ }
+ rcu_assign_pointer(connection->net_conf, new_net_conf);
+
+ conn_free_crypto(connection);
+ connection->cram_hmac_tfm = crypto.cram_hmac_tfm;
+ connection->integrity_tfm = crypto.integrity_tfm;
+ connection->csums_tfm = crypto.csums_tfm;
+ connection->verify_tfm = crypto.verify_tfm;
+
+ connection->my_addr_len = nla_len(adm_ctx.my_addr);
+ memcpy(&connection->my_addr, nla_data(adm_ctx.my_addr), connection->my_addr_len);
+ connection->peer_addr_len = nla_len(adm_ctx.peer_addr);
+ memcpy(&connection->peer_addr, nla_data(adm_ctx.peer_addr), connection->peer_addr_len);
+
+ mutex_unlock(&adm_ctx.resource->conf_update);
+
+ rcu_read_lock();
+ idr_for_each_entry(&connection->peer_devices, peer_device, i) {
+ struct drbd_device *device = peer_device->device;
+ device->send_cnt = 0;
+ device->recv_cnt = 0;
+ }
+ rcu_read_unlock();
+
+ retcode = conn_request_state(connection, NS(conn, C_UNCONNECTED), CS_VERBOSE);
+
+ conn_reconfig_done(connection);
+ mutex_unlock(&adm_ctx.resource->adm_mutex);
+ drbd_adm_finish(&adm_ctx, info, retcode);
+ return 0;
+
+fail:
+ free_crypto(&crypto);
+ kfree(new_net_conf);
+
+ conn_reconfig_done(connection);
+ mutex_unlock(&adm_ctx.resource->adm_mutex);
+out:
+ drbd_adm_finish(&adm_ctx, info, retcode);
+ return 0;
+}
+
+static enum drbd_state_rv conn_try_disconnect(struct drbd_connection *connection, bool force)
+{
+ enum drbd_state_rv rv;
+
+ rv = conn_request_state(connection, NS(conn, C_DISCONNECTING),
+ force ? CS_HARD : 0);
+
+ switch (rv) {
+ case SS_NOTHING_TO_DO:
+ break;
+ case SS_ALREADY_STANDALONE:
+ return SS_SUCCESS;
+ case SS_PRIMARY_NOP:
+ /* Our state checking code wants to see the peer outdated. */
+ rv = conn_request_state(connection, NS2(conn, C_DISCONNECTING, pdsk, D_OUTDATED), 0);
+
+ if (rv == SS_OUTDATE_WO_CONN) /* lost connection before graceful disconnect succeeded */
+ rv = conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_VERBOSE);
+
+ break;
+ case SS_CW_FAILED_BY_PEER:
+ /* The peer probably wants to see us outdated. */
+ rv = conn_request_state(connection, NS2(conn, C_DISCONNECTING,
+ disk, D_OUTDATED), 0);
+ if (rv == SS_IS_DISKLESS || rv == SS_LOWER_THAN_OUTDATED) {
+ rv = conn_request_state(connection, NS(conn, C_DISCONNECTING),
+ CS_HARD);
+ }
+ break;
+ default:;
+ /* no special handling necessary */
+ }
+
+ if (rv >= SS_SUCCESS) {
+ enum drbd_state_rv rv2;
+ /* No one else can reconfigure the network while I am here.
+ * The state handling only uses drbd_thread_stop_nowait(),
+ * we want to really wait here until the receiver is no more.
+ */
+ drbd_thread_stop(&connection->receiver);
+
+ /* Race breaker. This additional state change request may be
+ * necessary, if this was a forced disconnect during a receiver
+ * restart. We may have "killed" the receiver thread just
+ * after drbd_receiver() returned. Typically, we should be
+ * C_STANDALONE already, now, and this becomes a no-op.
+ */
+ rv2 = conn_request_state(connection, NS(conn, C_STANDALONE),
+ CS_VERBOSE | CS_HARD);
+ if (rv2 < SS_SUCCESS)
+ drbd_err(connection,
+ "unexpected rv2=%d in conn_try_disconnect()\n",
+ rv2);
+ }
+ return rv;
+}
+
+int drbd_adm_disconnect(struct sk_buff *skb, struct genl_info *info)
+{
+ struct drbd_config_context adm_ctx;
+ struct disconnect_parms parms;
+ struct drbd_connection *connection;
+ enum drbd_state_rv rv;
+ enum drbd_ret_code retcode;
+ int err;
+
+ retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_CONNECTION);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto fail;
+
+ connection = adm_ctx.connection;
+ memset(&parms, 0, sizeof(parms));
+ if (info->attrs[DRBD_NLA_DISCONNECT_PARMS]) {
+ err = disconnect_parms_from_attrs(&parms, info);
+ if (err) {
+ retcode = ERR_MANDATORY_TAG;
+ drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err));
+ goto fail;
+ }
+ }
+
+ mutex_lock(&adm_ctx.resource->adm_mutex);
+ rv = conn_try_disconnect(connection, parms.force_disconnect);
+ if (rv < SS_SUCCESS)
+ retcode = rv; /* FIXME: Type mismatch. */
+ else
+ retcode = NO_ERROR;
+ mutex_unlock(&adm_ctx.resource->adm_mutex);
+ fail:
+ drbd_adm_finish(&adm_ctx, info, retcode);
+ return 0;
+}
+
+void resync_after_online_grow(struct drbd_device *device)
+{
+ int iass; /* I am sync source */
+
+ drbd_info(device, "Resync of new storage after online grow\n");
+ if (device->state.role != device->state.peer)
+ iass = (device->state.role == R_PRIMARY);
+ else
+ iass = test_bit(RESOLVE_CONFLICTS, &first_peer_device(device)->connection->flags);
+
+ if (iass)
+ drbd_start_resync(device, C_SYNC_SOURCE);
+ else
+ _drbd_request_state(device, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE + CS_SERIALIZE);
+}
+
+int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info)
+{
+ struct drbd_config_context adm_ctx;
+ struct disk_conf *old_disk_conf, *new_disk_conf = NULL;
+ struct resize_parms rs;
+ struct drbd_device *device;
+ enum drbd_ret_code retcode;
+ enum determine_dev_size dd;
+ bool change_al_layout = false;
+ enum dds_flags ddsf;
+ sector_t u_size;
+ int err;
+
+ retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto finish;
+
+ mutex_lock(&adm_ctx.resource->adm_mutex);
+ device = adm_ctx.device;
+ if (!get_ldev(device)) {
+ retcode = ERR_NO_DISK;
+ goto fail;
+ }
+
+ memset(&rs, 0, sizeof(struct resize_parms));
+ rs.al_stripes = device->ldev->md.al_stripes;
+ rs.al_stripe_size = device->ldev->md.al_stripe_size_4k * 4;
+ if (info->attrs[DRBD_NLA_RESIZE_PARMS]) {
+ err = resize_parms_from_attrs(&rs, info);
+ if (err) {
+ retcode = ERR_MANDATORY_TAG;
+ drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err));
+ goto fail_ldev;
+ }
+ }
+
+ if (device->state.conn > C_CONNECTED) {
+ retcode = ERR_RESIZE_RESYNC;
+ goto fail_ldev;
+ }
+
+ if (device->state.role == R_SECONDARY &&
+ device->state.peer == R_SECONDARY) {
+ retcode = ERR_NO_PRIMARY;
+ goto fail_ldev;
+ }
+
+ if (rs.no_resync && first_peer_device(device)->connection->agreed_pro_version < 93) {
+ retcode = ERR_NEED_APV_93;
+ goto fail_ldev;
+ }
+
+ rcu_read_lock();
+ u_size = rcu_dereference(device->ldev->disk_conf)->disk_size;
+ rcu_read_unlock();
+ if (u_size != (sector_t)rs.resize_size) {
+ new_disk_conf = kmalloc(sizeof(struct disk_conf), GFP_KERNEL);
+ if (!new_disk_conf) {
+ retcode = ERR_NOMEM;
+ goto fail_ldev;
+ }
+ }
+
+ if (device->ldev->md.al_stripes != rs.al_stripes ||
+ device->ldev->md.al_stripe_size_4k != rs.al_stripe_size / 4) {
+ u32 al_size_k = rs.al_stripes * rs.al_stripe_size;
+
+ if (al_size_k > (16 * 1024 * 1024)) {
+ retcode = ERR_MD_LAYOUT_TOO_BIG;
+ goto fail_ldev;
+ }
+
+ if (al_size_k < MD_32kB_SECT/2) {
+ retcode = ERR_MD_LAYOUT_TOO_SMALL;
+ goto fail_ldev;
+ }
+
+ if (device->state.conn != C_CONNECTED && !rs.resize_force) {
+ retcode = ERR_MD_LAYOUT_CONNECTED;
+ goto fail_ldev;
+ }
+
+ change_al_layout = true;
+ }
+
+ if (device->ldev->known_size != drbd_get_capacity(device->ldev->backing_bdev))
+ device->ldev->known_size = drbd_get_capacity(device->ldev->backing_bdev);
+
+ if (new_disk_conf) {
+ mutex_lock(&device->resource->conf_update);
+ old_disk_conf = device->ldev->disk_conf;
+ *new_disk_conf = *old_disk_conf;
+ new_disk_conf->disk_size = (sector_t)rs.resize_size;
+ rcu_assign_pointer(device->ldev->disk_conf, new_disk_conf);
+ mutex_unlock(&device->resource->conf_update);
+ synchronize_rcu();
+ kfree(old_disk_conf);
+ }
+
+ ddsf = (rs.resize_force ? DDSF_FORCED : 0) | (rs.no_resync ? DDSF_NO_RESYNC : 0);
+ dd = drbd_determine_dev_size(device, ddsf, change_al_layout ? &rs : NULL);
+ drbd_md_sync(device);
+ put_ldev(device);
+ if (dd == DS_ERROR) {
+ retcode = ERR_NOMEM_BITMAP;
+ goto fail;
+ } else if (dd == DS_ERROR_SPACE_MD) {
+ retcode = ERR_MD_LAYOUT_NO_FIT;
+ goto fail;
+ } else if (dd == DS_ERROR_SHRINK) {
+ retcode = ERR_IMPLICIT_SHRINK;
+ goto fail;
+ }
+
+ if (device->state.conn == C_CONNECTED) {
+ if (dd == DS_GREW)
+ set_bit(RESIZE_PENDING, &device->flags);
+
+ drbd_send_uuids(first_peer_device(device));
+ drbd_send_sizes(first_peer_device(device), 1, ddsf);
+ }
+
+ fail:
+ mutex_unlock(&adm_ctx.resource->adm_mutex);
+ finish:
+ drbd_adm_finish(&adm_ctx, info, retcode);
+ return 0;
+
+ fail_ldev:
+ put_ldev(device);
+ goto fail;
+}
+
+int drbd_adm_resource_opts(struct sk_buff *skb, struct genl_info *info)
+{
+ struct drbd_config_context adm_ctx;
+ enum drbd_ret_code retcode;
+ struct res_opts res_opts;
+ int err;
+
+ retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_RESOURCE);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto fail;
+
+ res_opts = adm_ctx.resource->res_opts;
+ if (should_set_defaults(info))
+ set_res_opts_defaults(&res_opts);
+
+ err = res_opts_from_attrs(&res_opts, info);
+ if (err && err != -ENOMSG) {
+ retcode = ERR_MANDATORY_TAG;
+ drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err));
+ goto fail;
+ }
+
+ mutex_lock(&adm_ctx.resource->adm_mutex);
+ err = set_resource_options(adm_ctx.resource, &res_opts);
+ if (err) {
+ retcode = ERR_INVALID_REQUEST;
+ if (err == -ENOMEM)
+ retcode = ERR_NOMEM;
+ }
+ mutex_unlock(&adm_ctx.resource->adm_mutex);
+
+fail:
+ drbd_adm_finish(&adm_ctx, info, retcode);
+ return 0;
+}
+
+int drbd_adm_invalidate(struct sk_buff *skb, struct genl_info *info)
+{
+ struct drbd_config_context adm_ctx;
+ struct drbd_device *device;
+ int retcode; /* enum drbd_ret_code rsp. enum drbd_state_rv */
+
+ retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto out;
+
+ device = adm_ctx.device;
+ if (!get_ldev(device)) {
+ retcode = ERR_NO_DISK;
+ goto out;
+ }
+
+ mutex_lock(&adm_ctx.resource->adm_mutex);
+
+ /* If there is still bitmap IO pending, probably because of a previous
+ * resync just being finished, wait for it before requesting a new resync.
+ * Also wait for it's after_state_ch(). */
+ drbd_suspend_io(device);
+ wait_event(device->misc_wait, !test_bit(BITMAP_IO, &device->flags));
+ drbd_flush_workqueue(&first_peer_device(device)->connection->sender_work);
+
+ /* If we happen to be C_STANDALONE R_SECONDARY, just change to
+ * D_INCONSISTENT, and set all bits in the bitmap. Otherwise,
+ * try to start a resync handshake as sync target for full sync.
+ */
+ if (device->state.conn == C_STANDALONE && device->state.role == R_SECONDARY) {
+ retcode = drbd_request_state(device, NS(disk, D_INCONSISTENT));
+ if (retcode >= SS_SUCCESS) {
+ if (drbd_bitmap_io(device, &drbd_bmio_set_n_write,
+ "set_n_write from invalidate", BM_LOCKED_MASK))
+ retcode = ERR_IO_MD_DISK;
+ }
+ } else
+ retcode = drbd_request_state(device, NS(conn, C_STARTING_SYNC_T));
+ drbd_resume_io(device);
+ mutex_unlock(&adm_ctx.resource->adm_mutex);
+ put_ldev(device);
+out:
+ drbd_adm_finish(&adm_ctx, info, retcode);
+ return 0;
+}
+
+static int drbd_adm_simple_request_state(struct sk_buff *skb, struct genl_info *info,
+ union drbd_state mask, union drbd_state val)
+{
+ struct drbd_config_context adm_ctx;
+ enum drbd_ret_code retcode;
+
+ retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto out;
+
+ mutex_lock(&adm_ctx.resource->adm_mutex);
+ retcode = drbd_request_state(adm_ctx.device, mask, val);
+ mutex_unlock(&adm_ctx.resource->adm_mutex);
+out:
+ drbd_adm_finish(&adm_ctx, info, retcode);
+ return 0;
+}
+
+static int drbd_bmio_set_susp_al(struct drbd_device *device) __must_hold(local)
+{
+ int rv;
+
+ rv = drbd_bmio_set_n_write(device);
+ drbd_suspend_al(device);
+ return rv;
+}
+
+int drbd_adm_invalidate_peer(struct sk_buff *skb, struct genl_info *info)
+{
+ struct drbd_config_context adm_ctx;
+ int retcode; /* drbd_ret_code, drbd_state_rv */
+ struct drbd_device *device;
+
+ retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto out;
+
+ device = adm_ctx.device;
+ if (!get_ldev(device)) {
+ retcode = ERR_NO_DISK;
+ goto out;
+ }
+
+ mutex_lock(&adm_ctx.resource->adm_mutex);
+
+ /* If there is still bitmap IO pending, probably because of a previous
+ * resync just being finished, wait for it before requesting a new resync.
+ * Also wait for it's after_state_ch(). */
+ drbd_suspend_io(device);
+ wait_event(device->misc_wait, !test_bit(BITMAP_IO, &device->flags));
+ drbd_flush_workqueue(&first_peer_device(device)->connection->sender_work);
+
+ /* If we happen to be C_STANDALONE R_PRIMARY, just set all bits
+ * in the bitmap. Otherwise, try to start a resync handshake
+ * as sync source for full sync.
+ */
+ if (device->state.conn == C_STANDALONE && device->state.role == R_PRIMARY) {
+ /* The peer will get a resync upon connect anyways. Just make that
+ into a full resync. */
+ retcode = drbd_request_state(device, NS(pdsk, D_INCONSISTENT));
+ if (retcode >= SS_SUCCESS) {
+ if (drbd_bitmap_io(device, &drbd_bmio_set_susp_al,
+ "set_n_write from invalidate_peer",
+ BM_LOCKED_SET_ALLOWED))
+ retcode = ERR_IO_MD_DISK;
+ }
+ } else
+ retcode = drbd_request_state(device, NS(conn, C_STARTING_SYNC_S));
+ drbd_resume_io(device);
+ mutex_unlock(&adm_ctx.resource->adm_mutex);
+ put_ldev(device);
+out:
+ drbd_adm_finish(&adm_ctx, info, retcode);
+ return 0;
+}
+
+int drbd_adm_pause_sync(struct sk_buff *skb, struct genl_info *info)
+{
+ struct drbd_config_context adm_ctx;
+ enum drbd_ret_code retcode;
+
+ retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto out;
+
+ mutex_lock(&adm_ctx.resource->adm_mutex);
+ if (drbd_request_state(adm_ctx.device, NS(user_isp, 1)) == SS_NOTHING_TO_DO)
+ retcode = ERR_PAUSE_IS_SET;
+ mutex_unlock(&adm_ctx.resource->adm_mutex);
+out:
+ drbd_adm_finish(&adm_ctx, info, retcode);
+ return 0;
+}
+
+int drbd_adm_resume_sync(struct sk_buff *skb, struct genl_info *info)
+{
+ struct drbd_config_context adm_ctx;
+ union drbd_dev_state s;
+ enum drbd_ret_code retcode;
+
+ retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto out;
+
+ mutex_lock(&adm_ctx.resource->adm_mutex);
+ if (drbd_request_state(adm_ctx.device, NS(user_isp, 0)) == SS_NOTHING_TO_DO) {
+ s = adm_ctx.device->state;
+ if (s.conn == C_PAUSED_SYNC_S || s.conn == C_PAUSED_SYNC_T) {
+ retcode = s.aftr_isp ? ERR_PIC_AFTER_DEP :
+ s.peer_isp ? ERR_PIC_PEER_DEP : ERR_PAUSE_IS_CLEAR;
+ } else {
+ retcode = ERR_PAUSE_IS_CLEAR;
+ }
+ }
+ mutex_unlock(&adm_ctx.resource->adm_mutex);
+out:
+ drbd_adm_finish(&adm_ctx, info, retcode);
+ return 0;
+}
+
+int drbd_adm_suspend_io(struct sk_buff *skb, struct genl_info *info)
+{
+ return drbd_adm_simple_request_state(skb, info, NS(susp, 1));
+}
+
+int drbd_adm_resume_io(struct sk_buff *skb, struct genl_info *info)
+{
+ struct drbd_config_context adm_ctx;
+ struct drbd_device *device;
+ int retcode; /* enum drbd_ret_code rsp. enum drbd_state_rv */
+
+ retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto out;
+
+ mutex_lock(&adm_ctx.resource->adm_mutex);
+ device = adm_ctx.device;
+ if (test_bit(NEW_CUR_UUID, &device->flags)) {
+ drbd_uuid_new_current(device);
+ clear_bit(NEW_CUR_UUID, &device->flags);
+ }
+ drbd_suspend_io(device);
+ retcode = drbd_request_state(device, NS3(susp, 0, susp_nod, 0, susp_fen, 0));
+ if (retcode == SS_SUCCESS) {
+ if (device->state.conn < C_CONNECTED)
+ tl_clear(first_peer_device(device)->connection);
+ if (device->state.disk == D_DISKLESS || device->state.disk == D_FAILED)
+ tl_restart(first_peer_device(device)->connection, FAIL_FROZEN_DISK_IO);
+ }
+ drbd_resume_io(device);
+ mutex_unlock(&adm_ctx.resource->adm_mutex);
+out:
+ drbd_adm_finish(&adm_ctx, info, retcode);
+ return 0;
+}
+
+int drbd_adm_outdate(struct sk_buff *skb, struct genl_info *info)
+{
+ return drbd_adm_simple_request_state(skb, info, NS(disk, D_OUTDATED));
+}
+
+static int nla_put_drbd_cfg_context(struct sk_buff *skb,
+ struct drbd_resource *resource,
+ struct drbd_connection *connection,
+ struct drbd_device *device)
+{
+ struct nlattr *nla;
+ nla = nla_nest_start(skb, DRBD_NLA_CFG_CONTEXT);
+ if (!nla)
+ goto nla_put_failure;
+ if (device &&
+ nla_put_u32(skb, T_ctx_volume, device->vnr))
+ goto nla_put_failure;
+ if (nla_put_string(skb, T_ctx_resource_name, resource->name))
+ goto nla_put_failure;
+ if (connection) {
+ if (connection->my_addr_len &&
+ nla_put(skb, T_ctx_my_addr, connection->my_addr_len, &connection->my_addr))
+ goto nla_put_failure;
+ if (connection->peer_addr_len &&
+ nla_put(skb, T_ctx_peer_addr, connection->peer_addr_len, &connection->peer_addr))
+ goto nla_put_failure;
+ }
+ nla_nest_end(skb, nla);
+ return 0;
+
+nla_put_failure:
+ if (nla)
+ nla_nest_cancel(skb, nla);
+ return -EMSGSIZE;
+}
+
+/*
+ * Return the connection of @resource if @resource has exactly one connection.
+ */
+static struct drbd_connection *the_only_connection(struct drbd_resource *resource)
+{
+ struct list_head *connections = &resource->connections;
+
+ if (list_empty(connections) || connections->next->next != connections)
+ return NULL;
+ return list_first_entry(&resource->connections, struct drbd_connection, connections);
+}
+
+static int nla_put_status_info(struct sk_buff *skb, struct drbd_device *device,
+ const struct sib_info *sib)
+{
+ struct drbd_resource *resource = device->resource;
+ struct state_info *si = NULL; /* for sizeof(si->member); */
+ struct nlattr *nla;
+ int got_ldev;
+ int err = 0;
+ int exclude_sensitive;
+
+ /* If sib != NULL, this is drbd_bcast_event, which anyone can listen
+ * to. So we better exclude_sensitive information.
+ *
+ * If sib == NULL, this is drbd_adm_get_status, executed synchronously
+ * in the context of the requesting user process. Exclude sensitive
+ * information, unless current has superuser.
+ *
+ * NOTE: for drbd_adm_get_status_all(), this is a netlink dump, and
+ * relies on the current implementation of netlink_dump(), which
+ * executes the dump callback successively from netlink_recvmsg(),
+ * always in the context of the receiving process */
+ exclude_sensitive = sib || !capable(CAP_SYS_ADMIN);
+
+ got_ldev = get_ldev(device);
+
+ /* We need to add connection name and volume number information still.
+ * Minor number is in drbd_genlmsghdr. */
+ if (nla_put_drbd_cfg_context(skb, resource, the_only_connection(resource), device))
+ goto nla_put_failure;
+
+ if (res_opts_to_skb(skb, &device->resource->res_opts, exclude_sensitive))
+ goto nla_put_failure;
+
+ rcu_read_lock();
+ if (got_ldev) {
+ struct disk_conf *disk_conf;
+
+ disk_conf = rcu_dereference(device->ldev->disk_conf);
+ err = disk_conf_to_skb(skb, disk_conf, exclude_sensitive);
+ }
+ if (!err) {
+ struct net_conf *nc;
+
+ nc = rcu_dereference(first_peer_device(device)->connection->net_conf);
+ if (nc)
+ err = net_conf_to_skb(skb, nc, exclude_sensitive);
+ }
+ rcu_read_unlock();
+ if (err)
+ goto nla_put_failure;
+
+ nla = nla_nest_start(skb, DRBD_NLA_STATE_INFO);
+ if (!nla)
+ goto nla_put_failure;
+ if (nla_put_u32(skb, T_sib_reason, sib ? sib->sib_reason : SIB_GET_STATUS_REPLY) ||
+ nla_put_u32(skb, T_current_state, device->state.i) ||
+ nla_put_u64(skb, T_ed_uuid, device->ed_uuid) ||
+ nla_put_u64(skb, T_capacity, drbd_get_capacity(device->this_bdev)) ||
+ nla_put_u64(skb, T_send_cnt, device->send_cnt) ||
+ nla_put_u64(skb, T_recv_cnt, device->recv_cnt) ||
+ nla_put_u64(skb, T_read_cnt, device->read_cnt) ||
+ nla_put_u64(skb, T_writ_cnt, device->writ_cnt) ||
+ nla_put_u64(skb, T_al_writ_cnt, device->al_writ_cnt) ||
+ nla_put_u64(skb, T_bm_writ_cnt, device->bm_writ_cnt) ||
+ nla_put_u32(skb, T_ap_bio_cnt, atomic_read(&device->ap_bio_cnt)) ||
+ nla_put_u32(skb, T_ap_pending_cnt, atomic_read(&device->ap_pending_cnt)) ||
+ nla_put_u32(skb, T_rs_pending_cnt, atomic_read(&device->rs_pending_cnt)))
+ goto nla_put_failure;
+
+ if (got_ldev) {
+ int err;
+
+ spin_lock_irq(&device->ldev->md.uuid_lock);
+ err = nla_put(skb, T_uuids, sizeof(si->uuids), device->ldev->md.uuid);
+ spin_unlock_irq(&device->ldev->md.uuid_lock);
+
+ if (err)
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb, T_disk_flags, device->ldev->md.flags) ||
+ nla_put_u64(skb, T_bits_total, drbd_bm_bits(device)) ||
+ nla_put_u64(skb, T_bits_oos, drbd_bm_total_weight(device)))
+ goto nla_put_failure;
+ if (C_SYNC_SOURCE <= device->state.conn &&
+ C_PAUSED_SYNC_T >= device->state.conn) {
+ if (nla_put_u64(skb, T_bits_rs_total, device->rs_total) ||
+ nla_put_u64(skb, T_bits_rs_failed, device->rs_failed))
+ goto nla_put_failure;
+ }
+ }
+
+ if (sib) {
+ switch(sib->sib_reason) {
+ case SIB_SYNC_PROGRESS:
+ case SIB_GET_STATUS_REPLY:
+ break;
+ case SIB_STATE_CHANGE:
+ if (nla_put_u32(skb, T_prev_state, sib->os.i) ||
+ nla_put_u32(skb, T_new_state, sib->ns.i))
+ goto nla_put_failure;
+ break;
+ case SIB_HELPER_POST:
+ if (nla_put_u32(skb, T_helper_exit_code,
+ sib->helper_exit_code))
+ goto nla_put_failure;
+ /* fall through */
+ case SIB_HELPER_PRE:
+ if (nla_put_string(skb, T_helper, sib->helper_name))
+ goto nla_put_failure;
+ break;
+ }
+ }
+ nla_nest_end(skb, nla);
+
+ if (0)
+nla_put_failure:
+ err = -EMSGSIZE;
+ if (got_ldev)
+ put_ldev(device);
+ return err;
+}
+
+int drbd_adm_get_status(struct sk_buff *skb, struct genl_info *info)
+{
+ struct drbd_config_context adm_ctx;
+ enum drbd_ret_code retcode;
+ int err;
+
+ retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto out;
+
+ err = nla_put_status_info(adm_ctx.reply_skb, adm_ctx.device, NULL);
+ if (err) {
+ nlmsg_free(adm_ctx.reply_skb);
+ return err;
+ }
+out:
+ drbd_adm_finish(&adm_ctx, info, retcode);
+ return 0;
+}
+
+static int get_one_status(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct drbd_device *device;
+ struct drbd_genlmsghdr *dh;
+ struct drbd_resource *pos = (struct drbd_resource *)cb->args[0];
+ struct drbd_resource *resource = NULL;
+ struct drbd_resource *tmp;
+ unsigned volume = cb->args[1];
+
+ /* Open coded, deferred, iteration:
+ * for_each_resource_safe(resource, tmp, &drbd_resources) {
+ * connection = "first connection of resource or undefined";
+ * idr_for_each_entry(&resource->devices, device, i) {
+ * ...
+ * }
+ * }
+ * where resource is cb->args[0];
+ * and i is cb->args[1];
+ *
+ * cb->args[2] indicates if we shall loop over all resources,
+ * or just dump all volumes of a single resource.
+ *
+ * This may miss entries inserted after this dump started,
+ * or entries deleted before they are reached.
+ *
+ * We need to make sure the device won't disappear while
+ * we are looking at it, and revalidate our iterators
+ * on each iteration.
+ */
+
+ /* synchronize with conn_create()/drbd_destroy_connection() */
+ rcu_read_lock();
+ /* revalidate iterator position */
+ for_each_resource_rcu(tmp, &drbd_resources) {
+ if (pos == NULL) {
+ /* first iteration */
+ pos = tmp;
+ resource = pos;
+ break;
+ }
+ if (tmp == pos) {
+ resource = pos;
+ break;
+ }
+ }
+ if (resource) {
+next_resource:
+ device = idr_get_next(&resource->devices, &volume);
+ if (!device) {
+ /* No more volumes to dump on this resource.
+ * Advance resource iterator. */
+ pos = list_entry_rcu(resource->resources.next,
+ struct drbd_resource, resources);
+ /* Did we dump any volume of this resource yet? */
+ if (volume != 0) {
+ /* If we reached the end of the list,
+ * or only a single resource dump was requested,
+ * we are done. */
+ if (&pos->resources == &drbd_resources || cb->args[2])
+ goto out;
+ volume = 0;
+ resource = pos;
+ goto next_resource;
+ }
+ }
+
+ dh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, &drbd_genl_family,
+ NLM_F_MULTI, DRBD_ADM_GET_STATUS);
+ if (!dh)
+ goto out;
+
+ if (!device) {
+ /* This is a connection without a single volume.
+ * Suprisingly enough, it may have a network
+ * configuration. */
+ struct drbd_connection *connection;
+
+ dh->minor = -1U;
+ dh->ret_code = NO_ERROR;
+ connection = the_only_connection(resource);
+ if (nla_put_drbd_cfg_context(skb, resource, connection, NULL))
+ goto cancel;
+ if (connection) {
+ struct net_conf *nc;
+
+ nc = rcu_dereference(connection->net_conf);
+ if (nc && net_conf_to_skb(skb, nc, 1) != 0)
+ goto cancel;
+ }
+ goto done;
+ }
+
+ D_ASSERT(device, device->vnr == volume);
+ D_ASSERT(device, device->resource == resource);
+
+ dh->minor = device_to_minor(device);
+ dh->ret_code = NO_ERROR;
+
+ if (nla_put_status_info(skb, device, NULL)) {
+cancel:
+ genlmsg_cancel(skb, dh);
+ goto out;
+ }
+done:
+ genlmsg_end(skb, dh);
+ }
+
+out:
+ rcu_read_unlock();
+ /* where to start the next iteration */
+ cb->args[0] = (long)pos;
+ cb->args[1] = (pos == resource) ? volume + 1 : 0;
+
+ /* No more resources/volumes/minors found results in an empty skb.
+ * Which will terminate the dump. */
+ return skb->len;
+}
+
+/*
+ * Request status of all resources, or of all volumes within a single resource.
+ *
+ * This is a dump, as the answer may not fit in a single reply skb otherwise.
+ * Which means we cannot use the family->attrbuf or other such members, because
+ * dump is NOT protected by the genl_lock(). During dump, we only have access
+ * to the incoming skb, and need to opencode "parsing" of the nlattr payload.
+ *
+ * Once things are setup properly, we call into get_one_status().
+ */
+int drbd_adm_get_status_all(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ const unsigned hdrlen = GENL_HDRLEN + GENL_MAGIC_FAMILY_HDRSZ;
+ struct nlattr *nla;
+ const char *resource_name;
+ struct drbd_resource *resource;
+ int maxtype;
+
+ /* Is this a followup call? */
+ if (cb->args[0]) {
+ /* ... of a single resource dump,
+ * and the resource iterator has been advanced already? */
+ if (cb->args[2] && cb->args[2] != cb->args[0])
+ return 0; /* DONE. */
+ goto dump;
+ }
+
+ /* First call (from netlink_dump_start). We need to figure out
+ * which resource(s) the user wants us to dump. */
+ nla = nla_find(nlmsg_attrdata(cb->nlh, hdrlen),
+ nlmsg_attrlen(cb->nlh, hdrlen),
+ DRBD_NLA_CFG_CONTEXT);
+
+ /* No explicit context given. Dump all. */
+ if (!nla)
+ goto dump;
+ maxtype = ARRAY_SIZE(drbd_cfg_context_nl_policy) - 1;
+ nla = drbd_nla_find_nested(maxtype, nla, __nla_type(T_ctx_resource_name));
+ if (IS_ERR(nla))
+ return PTR_ERR(nla);
+ /* context given, but no name present? */
+ if (!nla)
+ return -EINVAL;
+ resource_name = nla_data(nla);
+ if (!*resource_name)
+ return -ENODEV;
+ resource = drbd_find_resource(resource_name);
+ if (!resource)
+ return -ENODEV;
+
+ kref_put(&resource->kref, drbd_destroy_resource); /* get_one_status() revalidates the resource */
+
+ /* prime iterators, and set "filter" mode mark:
+ * only dump this connection. */
+ cb->args[0] = (long)resource;
+ /* cb->args[1] = 0; passed in this way. */
+ cb->args[2] = (long)resource;
+
+dump:
+ return get_one_status(skb, cb);
+}
+
+int drbd_adm_get_timeout_type(struct sk_buff *skb, struct genl_info *info)
+{
+ struct drbd_config_context adm_ctx;
+ enum drbd_ret_code retcode;
+ struct timeout_parms tp;
+ int err;
+
+ retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto out;
+
+ tp.timeout_type =
+ adm_ctx.device->state.pdsk == D_OUTDATED ? UT_PEER_OUTDATED :
+ test_bit(USE_DEGR_WFC_T, &adm_ctx.device->flags) ? UT_DEGRADED :
+ UT_DEFAULT;
+
+ err = timeout_parms_to_priv_skb(adm_ctx.reply_skb, &tp);
+ if (err) {
+ nlmsg_free(adm_ctx.reply_skb);
+ return err;
+ }
+out:
+ drbd_adm_finish(&adm_ctx, info, retcode);
+ return 0;
+}
+
+int drbd_adm_start_ov(struct sk_buff *skb, struct genl_info *info)
+{
+ struct drbd_config_context adm_ctx;
+ struct drbd_device *device;
+ enum drbd_ret_code retcode;
+ struct start_ov_parms parms;
+
+ retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto out;
+
+ device = adm_ctx.device;
+
+ /* resume from last known position, if possible */
+ parms.ov_start_sector = device->ov_start_sector;
+ parms.ov_stop_sector = ULLONG_MAX;
+ if (info->attrs[DRBD_NLA_START_OV_PARMS]) {
+ int err = start_ov_parms_from_attrs(&parms, info);
+ if (err) {
+ retcode = ERR_MANDATORY_TAG;
+ drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err));
+ goto out;
+ }
+ }
+ mutex_lock(&adm_ctx.resource->adm_mutex);
+
+ /* w_make_ov_request expects position to be aligned */
+ device->ov_start_sector = parms.ov_start_sector & ~(BM_SECT_PER_BIT-1);
+ device->ov_stop_sector = parms.ov_stop_sector;
+
+ /* If there is still bitmap IO pending, e.g. previous resync or verify
+ * just being finished, wait for it before requesting a new resync. */
+ drbd_suspend_io(device);
+ wait_event(device->misc_wait, !test_bit(BITMAP_IO, &device->flags));
+ retcode = drbd_request_state(device, NS(conn, C_VERIFY_S));
+ drbd_resume_io(device);
+
+ mutex_unlock(&adm_ctx.resource->adm_mutex);
+out:
+ drbd_adm_finish(&adm_ctx, info, retcode);
+ return 0;
+}
+
+
+int drbd_adm_new_c_uuid(struct sk_buff *skb, struct genl_info *info)
+{
+ struct drbd_config_context adm_ctx;
+ struct drbd_device *device;
+ enum drbd_ret_code retcode;
+ int skip_initial_sync = 0;
+ int err;
+ struct new_c_uuid_parms args;
+
+ retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto out_nolock;
+
+ device = adm_ctx.device;
+ memset(&args, 0, sizeof(args));
+ if (info->attrs[DRBD_NLA_NEW_C_UUID_PARMS]) {
+ err = new_c_uuid_parms_from_attrs(&args, info);
+ if (err) {
+ retcode = ERR_MANDATORY_TAG;
+ drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err));
+ goto out_nolock;
+ }
+ }
+
+ mutex_lock(&adm_ctx.resource->adm_mutex);
+ mutex_lock(device->state_mutex); /* Protects us against serialized state changes. */
+
+ if (!get_ldev(device)) {
+ retcode = ERR_NO_DISK;
+ goto out;
+ }
+
+ /* this is "skip initial sync", assume to be clean */
+ if (device->state.conn == C_CONNECTED &&
+ first_peer_device(device)->connection->agreed_pro_version >= 90 &&
+ device->ldev->md.uuid[UI_CURRENT] == UUID_JUST_CREATED && args.clear_bm) {
+ drbd_info(device, "Preparing to skip initial sync\n");
+ skip_initial_sync = 1;
+ } else if (device->state.conn != C_STANDALONE) {
+ retcode = ERR_CONNECTED;
+ goto out_dec;
+ }
+
+ drbd_uuid_set(device, UI_BITMAP, 0); /* Rotate UI_BITMAP to History 1, etc... */
+ drbd_uuid_new_current(device); /* New current, previous to UI_BITMAP */
+
+ if (args.clear_bm) {
+ err = drbd_bitmap_io(device, &drbd_bmio_clear_n_write,
+ "clear_n_write from new_c_uuid", BM_LOCKED_MASK);
+ if (err) {
+ drbd_err(device, "Writing bitmap failed with %d\n", err);
+ retcode = ERR_IO_MD_DISK;
+ }
+ if (skip_initial_sync) {
+ drbd_send_uuids_skip_initial_sync(first_peer_device(device));
+ _drbd_uuid_set(device, UI_BITMAP, 0);
+ drbd_print_uuids(device, "cleared bitmap UUID");
+ spin_lock_irq(&device->resource->req_lock);
+ _drbd_set_state(_NS2(device, disk, D_UP_TO_DATE, pdsk, D_UP_TO_DATE),
+ CS_VERBOSE, NULL);
+ spin_unlock_irq(&device->resource->req_lock);
+ }
+ }
+
+ drbd_md_sync(device);
+out_dec:
+ put_ldev(device);
+out:
+ mutex_unlock(device->state_mutex);
+ mutex_unlock(&adm_ctx.resource->adm_mutex);
+out_nolock:
+ drbd_adm_finish(&adm_ctx, info, retcode);
+ return 0;
+}
+
+static enum drbd_ret_code
+drbd_check_resource_name(struct drbd_config_context *adm_ctx)
+{
+ const char *name = adm_ctx->resource_name;
+ if (!name || !name[0]) {
+ drbd_msg_put_info(adm_ctx->reply_skb, "resource name missing");
+ return ERR_MANDATORY_TAG;
+ }
+ /* if we want to use these in sysfs/configfs/debugfs some day,
+ * we must not allow slashes */
+ if (strchr(name, '/')) {
+ drbd_msg_put_info(adm_ctx->reply_skb, "invalid resource name");
+ return ERR_INVALID_REQUEST;
+ }
+ return NO_ERROR;
+}
+
+int drbd_adm_new_resource(struct sk_buff *skb, struct genl_info *info)
+{
+ struct drbd_config_context adm_ctx;
+ enum drbd_ret_code retcode;
+ struct res_opts res_opts;
+ int err;
+
+ retcode = drbd_adm_prepare(&adm_ctx, skb, info, 0);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto out;
+
+ set_res_opts_defaults(&res_opts);
+ err = res_opts_from_attrs(&res_opts, info);
+ if (err && err != -ENOMSG) {
+ retcode = ERR_MANDATORY_TAG;
+ drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err));
+ goto out;
+ }
+
+ retcode = drbd_check_resource_name(&adm_ctx);
+ if (retcode != NO_ERROR)
+ goto out;
+
+ if (adm_ctx.resource) {
+ if (info->nlhdr->nlmsg_flags & NLM_F_EXCL) {
+ retcode = ERR_INVALID_REQUEST;
+ drbd_msg_put_info(adm_ctx.reply_skb, "resource exists");
+ }
+ /* else: still NO_ERROR */
+ goto out;
+ }
+
+ /* not yet safe for genl_family.parallel_ops */
+ if (!conn_create(adm_ctx.resource_name, &res_opts))
+ retcode = ERR_NOMEM;
+out:
+ drbd_adm_finish(&adm_ctx, info, retcode);
+ return 0;
+}
+
+int drbd_adm_new_minor(struct sk_buff *skb, struct genl_info *info)
+{
+ struct drbd_config_context adm_ctx;
+ struct drbd_genlmsghdr *dh = info->userhdr;
+ enum drbd_ret_code retcode;
+
+ retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_RESOURCE);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto out;
+
+ if (dh->minor > MINORMASK) {
+ drbd_msg_put_info(adm_ctx.reply_skb, "requested minor out of range");
+ retcode = ERR_INVALID_REQUEST;
+ goto out;
+ }
+ if (adm_ctx.volume > DRBD_VOLUME_MAX) {
+ drbd_msg_put_info(adm_ctx.reply_skb, "requested volume id out of range");
+ retcode = ERR_INVALID_REQUEST;
+ goto out;
+ }
+
+ /* drbd_adm_prepare made sure already
+ * that first_peer_device(device)->connection and device->vnr match the request. */
+ if (adm_ctx.device) {
+ if (info->nlhdr->nlmsg_flags & NLM_F_EXCL)
+ retcode = ERR_MINOR_OR_VOLUME_EXISTS;
+ /* else: still NO_ERROR */
+ goto out;
+ }
+
+ mutex_lock(&adm_ctx.resource->adm_mutex);
+ retcode = drbd_create_device(&adm_ctx, dh->minor);
+ mutex_unlock(&adm_ctx.resource->adm_mutex);
+out:
+ drbd_adm_finish(&adm_ctx, info, retcode);
+ return 0;
+}
+
+static enum drbd_ret_code adm_del_minor(struct drbd_device *device)
+{
+ if (device->state.disk == D_DISKLESS &&
+ /* no need to be device->state.conn == C_STANDALONE &&
+ * we may want to delete a minor from a live replication group.
+ */
+ device->state.role == R_SECONDARY) {
+ _drbd_request_state(device, NS(conn, C_WF_REPORT_PARAMS),
+ CS_VERBOSE + CS_WAIT_COMPLETE);
+ drbd_delete_device(device);
+ return NO_ERROR;
+ } else
+ return ERR_MINOR_CONFIGURED;
+}
+
+int drbd_adm_del_minor(struct sk_buff *skb, struct genl_info *info)
+{
+ struct drbd_config_context adm_ctx;
+ enum drbd_ret_code retcode;
+
+ retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto out;
+
+ mutex_lock(&adm_ctx.resource->adm_mutex);
+ retcode = adm_del_minor(adm_ctx.device);
+ mutex_unlock(&adm_ctx.resource->adm_mutex);
+out:
+ drbd_adm_finish(&adm_ctx, info, retcode);
+ return 0;
+}
+
+static int adm_del_resource(struct drbd_resource *resource)
+{
+ struct drbd_connection *connection;
+
+ for_each_connection(connection, resource) {
+ if (connection->cstate > C_STANDALONE)
+ return ERR_NET_CONFIGURED;
+ }
+ if (!idr_is_empty(&resource->devices))
+ return ERR_RES_IN_USE;
+
+ list_del_rcu(&resource->resources);
+ /* Make sure all threads have actually stopped: state handling only
+ * does drbd_thread_stop_nowait(). */
+ list_for_each_entry(connection, &resource->connections, connections)
+ drbd_thread_stop(&connection->worker);
+ synchronize_rcu();
+ drbd_free_resource(resource);
+ return NO_ERROR;
+}
+
+int drbd_adm_down(struct sk_buff *skb, struct genl_info *info)
+{
+ struct drbd_config_context adm_ctx;
+ struct drbd_resource *resource;
+ struct drbd_connection *connection;
+ struct drbd_device *device;
+ int retcode; /* enum drbd_ret_code rsp. enum drbd_state_rv */
+ unsigned i;
+
+ retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_RESOURCE);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto finish;
+
+ resource = adm_ctx.resource;
+ mutex_lock(&resource->adm_mutex);
+ /* demote */
+ for_each_connection(connection, resource) {
+ struct drbd_peer_device *peer_device;
+
+ idr_for_each_entry(&connection->peer_devices, peer_device, i) {
+ retcode = drbd_set_role(peer_device->device, R_SECONDARY, 0);
+ if (retcode < SS_SUCCESS) {
+ drbd_msg_put_info(adm_ctx.reply_skb, "failed to demote");
+ goto out;
+ }
+ }
+
+ retcode = conn_try_disconnect(connection, 0);
+ if (retcode < SS_SUCCESS) {
+ drbd_msg_put_info(adm_ctx.reply_skb, "failed to disconnect");
+ goto out;
+ }
+ }
+
+ /* detach */
+ idr_for_each_entry(&resource->devices, device, i) {
+ retcode = adm_detach(device, 0);
+ if (retcode < SS_SUCCESS || retcode > NO_ERROR) {
+ drbd_msg_put_info(adm_ctx.reply_skb, "failed to detach");
+ goto out;
+ }
+ }
+
+ /* delete volumes */
+ idr_for_each_entry(&resource->devices, device, i) {
+ retcode = adm_del_minor(device);
+ if (retcode != NO_ERROR) {
+ /* "can not happen" */
+ drbd_msg_put_info(adm_ctx.reply_skb, "failed to delete volume");
+ goto out;
+ }
+ }
+
+ retcode = adm_del_resource(resource);
+out:
+ mutex_unlock(&resource->adm_mutex);
+finish:
+ drbd_adm_finish(&adm_ctx, info, retcode);
+ return 0;
+}
+
+int drbd_adm_del_resource(struct sk_buff *skb, struct genl_info *info)
+{
+ struct drbd_config_context adm_ctx;
+ struct drbd_resource *resource;
+ enum drbd_ret_code retcode;
+
+ retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_RESOURCE);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto finish;
+ resource = adm_ctx.resource;
+
+ mutex_lock(&resource->adm_mutex);
+ retcode = adm_del_resource(resource);
+ mutex_unlock(&resource->adm_mutex);
+finish:
+ drbd_adm_finish(&adm_ctx, info, retcode);
+ return 0;
+}
+
+void drbd_bcast_event(struct drbd_device *device, const struct sib_info *sib)
+{
+ static atomic_t drbd_genl_seq = ATOMIC_INIT(2); /* two. */
+ struct sk_buff *msg;
+ struct drbd_genlmsghdr *d_out;
+ unsigned seq;
+ int err = -ENOMEM;
+
+ seq = atomic_inc_return(&drbd_genl_seq);
+ msg = genlmsg_new(NLMSG_GOODSIZE, GFP_NOIO);
+ if (!msg)
+ goto failed;
+
+ err = -EMSGSIZE;
+ d_out = genlmsg_put(msg, 0, seq, &drbd_genl_family, 0, DRBD_EVENT);
+ if (!d_out) /* cannot happen, but anyways. */
+ goto nla_put_failure;
+ d_out->minor = device_to_minor(device);
+ d_out->ret_code = NO_ERROR;
+
+ if (nla_put_status_info(msg, device, sib))
+ goto nla_put_failure;
+ genlmsg_end(msg, d_out);
+ err = drbd_genl_multicast_events(msg, 0);
+ /* msg has been consumed or freed in netlink_broadcast() */
+ if (err && err != -ESRCH)
+ goto failed;
+
+ return;
+
+nla_put_failure:
+ nlmsg_free(msg);
+failed:
+ drbd_err(device, "Error %d while broadcasting event. "
+ "Event seq:%u sib_reason:%u\n",
+ err, seq, sib->sib_reason);
+}
diff --git a/drivers/block/drbd/drbd_nla.c b/drivers/block/drbd/drbd_nla.c
new file mode 100644
index 000000000..b2d479149
--- /dev/null
+++ b/drivers/block/drbd/drbd_nla.c
@@ -0,0 +1,54 @@
+#include <linux/kernel.h>
+#include <net/netlink.h>
+#include <linux/drbd_genl_api.h>
+#include "drbd_nla.h"
+
+static int drbd_nla_check_mandatory(int maxtype, struct nlattr *nla)
+{
+ struct nlattr *head = nla_data(nla);
+ int len = nla_len(nla);
+ int rem;
+
+ /*
+ * validate_nla (called from nla_parse_nested) ignores attributes
+ * beyond maxtype, and does not understand the DRBD_GENLA_F_MANDATORY flag.
+ * In order to have it validate attributes with the DRBD_GENLA_F_MANDATORY
+ * flag set also, check and remove that flag before calling
+ * nla_parse_nested.
+ */
+
+ nla_for_each_attr(nla, head, len, rem) {
+ if (nla->nla_type & DRBD_GENLA_F_MANDATORY) {
+ nla->nla_type &= ~DRBD_GENLA_F_MANDATORY;
+ if (nla_type(nla) > maxtype)
+ return -EOPNOTSUPP;
+ }
+ }
+ return 0;
+}
+
+int drbd_nla_parse_nested(struct nlattr *tb[], int maxtype, struct nlattr *nla,
+ const struct nla_policy *policy)
+{
+ int err;
+
+ err = drbd_nla_check_mandatory(maxtype, nla);
+ if (!err)
+ err = nla_parse_nested(tb, maxtype, nla, policy);
+
+ return err;
+}
+
+struct nlattr *drbd_nla_find_nested(int maxtype, struct nlattr *nla, int attrtype)
+{
+ int err;
+ /*
+ * If any nested attribute has the DRBD_GENLA_F_MANDATORY flag set and
+ * we don't know about that attribute, reject all the nested
+ * attributes.
+ */
+ err = drbd_nla_check_mandatory(maxtype, nla);
+ if (err)
+ return ERR_PTR(err);
+ return nla_find_nested(nla, attrtype);
+}
diff --git a/drivers/block/drbd/drbd_nla.h b/drivers/block/drbd/drbd_nla.h
new file mode 100644
index 000000000..679c2d5b4
--- /dev/null
+++ b/drivers/block/drbd/drbd_nla.h
@@ -0,0 +1,8 @@
+#ifndef __DRBD_NLA_H
+#define __DRBD_NLA_H
+
+extern int drbd_nla_parse_nested(struct nlattr *tb[], int maxtype, struct nlattr *nla,
+ const struct nla_policy *policy);
+extern struct nlattr *drbd_nla_find_nested(int maxtype, struct nlattr *nla, int attrtype);
+
+#endif /* __DRBD_NLA_H */
diff --git a/drivers/block/drbd/drbd_proc.c b/drivers/block/drbd/drbd_proc.c
new file mode 100644
index 000000000..3b10fa6cb
--- /dev/null
+++ b/drivers/block/drbd/drbd_proc.c
@@ -0,0 +1,368 @@
+/*
+ drbd_proc.c
+
+ This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+ Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
+ Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+ Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+ drbd is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2, or (at your option)
+ any later version.
+
+ drbd is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with drbd; see the file COPYING. If not, write to
+ the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+
+ */
+
+#include <linux/module.h>
+
+#include <asm/uaccess.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/drbd.h>
+#include "drbd_int.h"
+
+static int drbd_proc_open(struct inode *inode, struct file *file);
+static int drbd_proc_release(struct inode *inode, struct file *file);
+
+
+struct proc_dir_entry *drbd_proc;
+const struct file_operations drbd_proc_fops = {
+ .owner = THIS_MODULE,
+ .open = drbd_proc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = drbd_proc_release,
+};
+
+static void seq_printf_with_thousands_grouping(struct seq_file *seq, long v)
+{
+ /* v is in kB/sec. We don't expect TiByte/sec yet. */
+ if (unlikely(v >= 1000000)) {
+ /* cool: > GiByte/s */
+ seq_printf(seq, "%ld,", v / 1000000);
+ v %= 1000000;
+ seq_printf(seq, "%03ld,%03ld", v/1000, v % 1000);
+ } else if (likely(v >= 1000))
+ seq_printf(seq, "%ld,%03ld", v/1000, v % 1000);
+ else
+ seq_printf(seq, "%ld", v);
+}
+
+static void drbd_get_syncer_progress(struct drbd_device *device,
+ union drbd_dev_state state, unsigned long *rs_total,
+ unsigned long *bits_left, unsigned int *per_mil_done)
+{
+ /* this is to break it at compile time when we change that, in case we
+ * want to support more than (1<<32) bits on a 32bit arch. */
+ typecheck(unsigned long, device->rs_total);
+ *rs_total = device->rs_total;
+
+ /* note: both rs_total and rs_left are in bits, i.e. in
+ * units of BM_BLOCK_SIZE.
+ * for the percentage, we don't care. */
+
+ if (state.conn == C_VERIFY_S || state.conn == C_VERIFY_T)
+ *bits_left = device->ov_left;
+ else
+ *bits_left = drbd_bm_total_weight(device) - device->rs_failed;
+ /* >> 10 to prevent overflow,
+ * +1 to prevent division by zero */
+ if (*bits_left > *rs_total) {
+ /* D'oh. Maybe a logic bug somewhere. More likely just a race
+ * between state change and reset of rs_total.
+ */
+ *bits_left = *rs_total;
+ *per_mil_done = *rs_total ? 0 : 1000;
+ } else {
+ /* Make sure the division happens in long context.
+ * We allow up to one petabyte storage right now,
+ * at a granularity of 4k per bit that is 2**38 bits.
+ * After shift right and multiplication by 1000,
+ * this should still fit easily into a 32bit long,
+ * so we don't need a 64bit division on 32bit arch.
+ * Note: currently we don't support such large bitmaps on 32bit
+ * arch anyways, but no harm done to be prepared for it here.
+ */
+ unsigned int shift = *rs_total > UINT_MAX ? 16 : 10;
+ unsigned long left = *bits_left >> shift;
+ unsigned long total = 1UL + (*rs_total >> shift);
+ unsigned long tmp = 1000UL - left * 1000UL/total;
+ *per_mil_done = tmp;
+ }
+}
+
+
+/*lge
+ * progress bars shamelessly adapted from driver/md/md.c
+ * output looks like
+ * [=====>..............] 33.5% (23456/123456)
+ * finish: 2:20:20 speed: 6,345 (6,456) K/sec
+ */
+static void drbd_syncer_progress(struct drbd_device *device, struct seq_file *seq,
+ union drbd_dev_state state)
+{
+ unsigned long db, dt, dbdt, rt, rs_total, rs_left;
+ unsigned int res;
+ int i, x, y;
+ int stalled = 0;
+
+ drbd_get_syncer_progress(device, state, &rs_total, &rs_left, &res);
+
+ x = res/50;
+ y = 20-x;
+ seq_printf(seq, "\t[");
+ for (i = 1; i < x; i++)
+ seq_printf(seq, "=");
+ seq_printf(seq, ">");
+ for (i = 0; i < y; i++)
+ seq_printf(seq, ".");
+ seq_printf(seq, "] ");
+
+ if (state.conn == C_VERIFY_S || state.conn == C_VERIFY_T)
+ seq_printf(seq, "verified:");
+ else
+ seq_printf(seq, "sync'ed:");
+ seq_printf(seq, "%3u.%u%% ", res / 10, res % 10);
+
+ /* if more than a few GB, display in MB */
+ if (rs_total > (4UL << (30 - BM_BLOCK_SHIFT)))
+ seq_printf(seq, "(%lu/%lu)M",
+ (unsigned long) Bit2KB(rs_left >> 10),
+ (unsigned long) Bit2KB(rs_total >> 10));
+ else
+ seq_printf(seq, "(%lu/%lu)K",
+ (unsigned long) Bit2KB(rs_left),
+ (unsigned long) Bit2KB(rs_total));
+
+ seq_printf(seq, "\n\t");
+
+ /* see drivers/md/md.c
+ * We do not want to overflow, so the order of operands and
+ * the * 100 / 100 trick are important. We do a +1 to be
+ * safe against division by zero. We only estimate anyway.
+ *
+ * dt: time from mark until now
+ * db: blocks written from mark until now
+ * rt: remaining time
+ */
+ /* Rolling marks. last_mark+1 may just now be modified. last_mark+2 is
+ * at least (DRBD_SYNC_MARKS-2)*DRBD_SYNC_MARK_STEP old, and has at
+ * least DRBD_SYNC_MARK_STEP time before it will be modified. */
+ /* ------------------------ ~18s average ------------------------ */
+ i = (device->rs_last_mark + 2) % DRBD_SYNC_MARKS;
+ dt = (jiffies - device->rs_mark_time[i]) / HZ;
+ if (dt > 180)
+ stalled = 1;
+
+ if (!dt)
+ dt++;
+ db = device->rs_mark_left[i] - rs_left;
+ rt = (dt * (rs_left / (db/100+1)))/100; /* seconds */
+
+ seq_printf(seq, "finish: %lu:%02lu:%02lu",
+ rt / 3600, (rt % 3600) / 60, rt % 60);
+
+ dbdt = Bit2KB(db/dt);
+ seq_printf(seq, " speed: ");
+ seq_printf_with_thousands_grouping(seq, dbdt);
+ seq_printf(seq, " (");
+ /* ------------------------- ~3s average ------------------------ */
+ if (proc_details >= 1) {
+ /* this is what drbd_rs_should_slow_down() uses */
+ i = (device->rs_last_mark + DRBD_SYNC_MARKS-1) % DRBD_SYNC_MARKS;
+ dt = (jiffies - device->rs_mark_time[i]) / HZ;
+ if (!dt)
+ dt++;
+ db = device->rs_mark_left[i] - rs_left;
+ dbdt = Bit2KB(db/dt);
+ seq_printf_with_thousands_grouping(seq, dbdt);
+ seq_printf(seq, " -- ");
+ }
+
+ /* --------------------- long term average ---------------------- */
+ /* mean speed since syncer started
+ * we do account for PausedSync periods */
+ dt = (jiffies - device->rs_start - device->rs_paused) / HZ;
+ if (dt == 0)
+ dt = 1;
+ db = rs_total - rs_left;
+ dbdt = Bit2KB(db/dt);
+ seq_printf_with_thousands_grouping(seq, dbdt);
+ seq_printf(seq, ")");
+
+ if (state.conn == C_SYNC_TARGET ||
+ state.conn == C_VERIFY_S) {
+ seq_printf(seq, " want: ");
+ seq_printf_with_thousands_grouping(seq, device->c_sync_rate);
+ }
+ seq_printf(seq, " K/sec%s\n", stalled ? " (stalled)" : "");
+
+ if (proc_details >= 1) {
+ /* 64 bit:
+ * we convert to sectors in the display below. */
+ unsigned long bm_bits = drbd_bm_bits(device);
+ unsigned long bit_pos;
+ unsigned long long stop_sector = 0;
+ if (state.conn == C_VERIFY_S ||
+ state.conn == C_VERIFY_T) {
+ bit_pos = bm_bits - device->ov_left;
+ if (verify_can_do_stop_sector(device))
+ stop_sector = device->ov_stop_sector;
+ } else
+ bit_pos = device->bm_resync_fo;
+ /* Total sectors may be slightly off for oddly
+ * sized devices. So what. */
+ seq_printf(seq,
+ "\t%3d%% sector pos: %llu/%llu",
+ (int)(bit_pos / (bm_bits/100+1)),
+ (unsigned long long)bit_pos * BM_SECT_PER_BIT,
+ (unsigned long long)bm_bits * BM_SECT_PER_BIT);
+ if (stop_sector != 0 && stop_sector != ULLONG_MAX)
+ seq_printf(seq, " stop sector: %llu", stop_sector);
+ seq_printf(seq, "\n");
+ }
+}
+
+static int drbd_seq_show(struct seq_file *seq, void *v)
+{
+ int i, prev_i = -1;
+ const char *sn;
+ struct drbd_device *device;
+ struct net_conf *nc;
+ union drbd_dev_state state;
+ char wp;
+
+ static char write_ordering_chars[] = {
+ [WO_none] = 'n',
+ [WO_drain_io] = 'd',
+ [WO_bdev_flush] = 'f',
+ };
+
+ seq_printf(seq, "version: " REL_VERSION " (api:%d/proto:%d-%d)\n%s\n",
+ API_VERSION, PRO_VERSION_MIN, PRO_VERSION_MAX, drbd_buildtag());
+
+ /*
+ cs .. connection state
+ ro .. node role (local/remote)
+ ds .. disk state (local/remote)
+ protocol
+ various flags
+ ns .. network send
+ nr .. network receive
+ dw .. disk write
+ dr .. disk read
+ al .. activity log write count
+ bm .. bitmap update write count
+ pe .. pending (waiting for ack or data reply)
+ ua .. unack'd (still need to send ack or data reply)
+ ap .. application requests accepted, but not yet completed
+ ep .. number of epochs currently "on the fly", P_BARRIER_ACK pending
+ wo .. write ordering mode currently in use
+ oos .. known out-of-sync kB
+ */
+
+ rcu_read_lock();
+ idr_for_each_entry(&drbd_devices, device, i) {
+ if (prev_i != i - 1)
+ seq_printf(seq, "\n");
+ prev_i = i;
+
+ state = device->state;
+ sn = drbd_conn_str(state.conn);
+
+ if (state.conn == C_STANDALONE &&
+ state.disk == D_DISKLESS &&
+ state.role == R_SECONDARY) {
+ seq_printf(seq, "%2d: cs:Unconfigured\n", i);
+ } else {
+ /* reset device->congestion_reason */
+ bdi_rw_congested(&device->rq_queue->backing_dev_info);
+
+ nc = rcu_dereference(first_peer_device(device)->connection->net_conf);
+ wp = nc ? nc->wire_protocol - DRBD_PROT_A + 'A' : ' ';
+ seq_printf(seq,
+ "%2d: cs:%s ro:%s/%s ds:%s/%s %c %c%c%c%c%c%c\n"
+ " ns:%u nr:%u dw:%u dr:%u al:%u bm:%u "
+ "lo:%d pe:%d ua:%d ap:%d ep:%d wo:%c",
+ i, sn,
+ drbd_role_str(state.role),
+ drbd_role_str(state.peer),
+ drbd_disk_str(state.disk),
+ drbd_disk_str(state.pdsk),
+ wp,
+ drbd_suspended(device) ? 's' : 'r',
+ state.aftr_isp ? 'a' : '-',
+ state.peer_isp ? 'p' : '-',
+ state.user_isp ? 'u' : '-',
+ device->congestion_reason ?: '-',
+ test_bit(AL_SUSPENDED, &device->flags) ? 's' : '-',
+ device->send_cnt/2,
+ device->recv_cnt/2,
+ device->writ_cnt/2,
+ device->read_cnt/2,
+ device->al_writ_cnt,
+ device->bm_writ_cnt,
+ atomic_read(&device->local_cnt),
+ atomic_read(&device->ap_pending_cnt) +
+ atomic_read(&device->rs_pending_cnt),
+ atomic_read(&device->unacked_cnt),
+ atomic_read(&device->ap_bio_cnt),
+ first_peer_device(device)->connection->epochs,
+ write_ordering_chars[device->resource->write_ordering]
+ );
+ seq_printf(seq, " oos:%llu\n",
+ Bit2KB((unsigned long long)
+ drbd_bm_total_weight(device)));
+ }
+ if (state.conn == C_SYNC_SOURCE ||
+ state.conn == C_SYNC_TARGET ||
+ state.conn == C_VERIFY_S ||
+ state.conn == C_VERIFY_T)
+ drbd_syncer_progress(device, seq, state);
+
+ if (proc_details >= 1 && get_ldev_if_state(device, D_FAILED)) {
+ lc_seq_printf_stats(seq, device->resync);
+ lc_seq_printf_stats(seq, device->act_log);
+ put_ldev(device);
+ }
+
+ if (proc_details >= 2)
+ seq_printf(seq, "\tblocked on activity log: %d\n", atomic_read(&device->ap_actlog_cnt));
+ }
+ rcu_read_unlock();
+
+ return 0;
+}
+
+static int drbd_proc_open(struct inode *inode, struct file *file)
+{
+ int err;
+
+ if (try_module_get(THIS_MODULE)) {
+ err = single_open(file, drbd_seq_show, NULL);
+ if (err)
+ module_put(THIS_MODULE);
+ return err;
+ }
+ return -ENODEV;
+}
+
+static int drbd_proc_release(struct inode *inode, struct file *file)
+{
+ module_put(THIS_MODULE);
+ return single_release(inode, file);
+}
+
+/* PROC FS stuff end */
diff --git a/drivers/block/drbd/drbd_protocol.h b/drivers/block/drbd/drbd_protocol.h
new file mode 100644
index 000000000..2da9104a3
--- /dev/null
+++ b/drivers/block/drbd/drbd_protocol.h
@@ -0,0 +1,307 @@
+#ifndef __DRBD_PROTOCOL_H
+#define __DRBD_PROTOCOL_H
+
+enum drbd_packet {
+ /* receiver (data socket) */
+ P_DATA = 0x00,
+ P_DATA_REPLY = 0x01, /* Response to P_DATA_REQUEST */
+ P_RS_DATA_REPLY = 0x02, /* Response to P_RS_DATA_REQUEST */
+ P_BARRIER = 0x03,
+ P_BITMAP = 0x04,
+ P_BECOME_SYNC_TARGET = 0x05,
+ P_BECOME_SYNC_SOURCE = 0x06,
+ P_UNPLUG_REMOTE = 0x07, /* Used at various times to hint the peer */
+ P_DATA_REQUEST = 0x08, /* Used to ask for a data block */
+ P_RS_DATA_REQUEST = 0x09, /* Used to ask for a data block for resync */
+ P_SYNC_PARAM = 0x0a,
+ P_PROTOCOL = 0x0b,
+ P_UUIDS = 0x0c,
+ P_SIZES = 0x0d,
+ P_STATE = 0x0e,
+ P_SYNC_UUID = 0x0f,
+ P_AUTH_CHALLENGE = 0x10,
+ P_AUTH_RESPONSE = 0x11,
+ P_STATE_CHG_REQ = 0x12,
+
+ /* asender (meta socket */
+ P_PING = 0x13,
+ P_PING_ACK = 0x14,
+ P_RECV_ACK = 0x15, /* Used in protocol B */
+ P_WRITE_ACK = 0x16, /* Used in protocol C */
+ P_RS_WRITE_ACK = 0x17, /* Is a P_WRITE_ACK, additionally call set_in_sync(). */
+ P_SUPERSEDED = 0x18, /* Used in proto C, two-primaries conflict detection */
+ P_NEG_ACK = 0x19, /* Sent if local disk is unusable */
+ P_NEG_DREPLY = 0x1a, /* Local disk is broken... */
+ P_NEG_RS_DREPLY = 0x1b, /* Local disk is broken... */
+ P_BARRIER_ACK = 0x1c,
+ P_STATE_CHG_REPLY = 0x1d,
+
+ /* "new" commands, no longer fitting into the ordering scheme above */
+
+ P_OV_REQUEST = 0x1e, /* data socket */
+ P_OV_REPLY = 0x1f,
+ P_OV_RESULT = 0x20, /* meta socket */
+ P_CSUM_RS_REQUEST = 0x21, /* data socket */
+ P_RS_IS_IN_SYNC = 0x22, /* meta socket */
+ P_SYNC_PARAM89 = 0x23, /* data socket, protocol version 89 replacement for P_SYNC_PARAM */
+ P_COMPRESSED_BITMAP = 0x24, /* compressed or otherwise encoded bitmap transfer */
+ /* P_CKPT_FENCE_REQ = 0x25, * currently reserved for protocol D */
+ /* P_CKPT_DISABLE_REQ = 0x26, * currently reserved for protocol D */
+ P_DELAY_PROBE = 0x27, /* is used on BOTH sockets */
+ P_OUT_OF_SYNC = 0x28, /* Mark as out of sync (Outrunning), data socket */
+ P_RS_CANCEL = 0x29, /* meta: Used to cancel RS_DATA_REQUEST packet by SyncSource */
+ P_CONN_ST_CHG_REQ = 0x2a, /* data sock: Connection wide state request */
+ P_CONN_ST_CHG_REPLY = 0x2b, /* meta sock: Connection side state req reply */
+ P_RETRY_WRITE = 0x2c, /* Protocol C: retry conflicting write request */
+ P_PROTOCOL_UPDATE = 0x2d, /* data sock: is used in established connections */
+ /* 0x2e to 0x30 reserved, used in drbd 9 */
+
+ /* REQ_DISCARD. We used "discard" in different contexts before,
+ * which is why I chose TRIM here, to disambiguate. */
+ P_TRIM = 0x31,
+
+ P_MAY_IGNORE = 0x100, /* Flag to test if (cmd > P_MAY_IGNORE) ... */
+ P_MAX_OPT_CMD = 0x101,
+
+ /* special command ids for handshake */
+
+ P_INITIAL_META = 0xfff1, /* First Packet on the MetaSock */
+ P_INITIAL_DATA = 0xfff2, /* First Packet on the Socket */
+
+ P_CONNECTION_FEATURES = 0xfffe /* FIXED for the next century! */
+};
+
+#ifndef __packed
+#define __packed __attribute__((packed))
+#endif
+
+/* This is the layout for a packet on the wire.
+ * The byteorder is the network byte order.
+ * (except block_id and barrier fields.
+ * these are pointers to local structs
+ * and have no relevance for the partner,
+ * which just echoes them as received.)
+ *
+ * NOTE that the payload starts at a long aligned offset,
+ * regardless of 32 or 64 bit arch!
+ */
+struct p_header80 {
+ u32 magic;
+ u16 command;
+ u16 length; /* bytes of data after this header */
+} __packed;
+
+/* Header for big packets, Used for data packets exceeding 64kB */
+struct p_header95 {
+ u16 magic; /* use DRBD_MAGIC_BIG here */
+ u16 command;
+ u32 length;
+} __packed;
+
+struct p_header100 {
+ u32 magic;
+ u16 volume;
+ u16 command;
+ u32 length;
+ u32 pad;
+} __packed;
+
+/* these defines must not be changed without changing the protocol version */
+#define DP_HARDBARRIER 1 /* depricated */
+#define DP_RW_SYNC 2 /* equals REQ_SYNC */
+#define DP_MAY_SET_IN_SYNC 4
+#define DP_UNPLUG 8 /* not used anymore */
+#define DP_FUA 16 /* equals REQ_FUA */
+#define DP_FLUSH 32 /* equals REQ_FLUSH */
+#define DP_DISCARD 64 /* equals REQ_DISCARD */
+#define DP_SEND_RECEIVE_ACK 128 /* This is a proto B write request */
+#define DP_SEND_WRITE_ACK 256 /* This is a proto C write request */
+
+struct p_data {
+ u64 sector; /* 64 bits sector number */
+ u64 block_id; /* to identify the request in protocol B&C */
+ u32 seq_num;
+ u32 dp_flags;
+} __packed;
+
+struct p_trim {
+ struct p_data p_data;
+ u32 size; /* == bio->bi_size */
+} __packed;
+
+/*
+ * commands which share a struct:
+ * p_block_ack:
+ * P_RECV_ACK (proto B), P_WRITE_ACK (proto C),
+ * P_SUPERSEDED (proto C, two-primaries conflict detection)
+ * p_block_req:
+ * P_DATA_REQUEST, P_RS_DATA_REQUEST
+ */
+struct p_block_ack {
+ u64 sector;
+ u64 block_id;
+ u32 blksize;
+ u32 seq_num;
+} __packed;
+
+struct p_block_req {
+ u64 sector;
+ u64 block_id;
+ u32 blksize;
+ u32 pad; /* to multiple of 8 Byte */
+} __packed;
+
+/*
+ * commands with their own struct for additional fields:
+ * P_CONNECTION_FEATURES
+ * P_BARRIER
+ * P_BARRIER_ACK
+ * P_SYNC_PARAM
+ * ReportParams
+ */
+
+#define FF_TRIM 1
+
+struct p_connection_features {
+ u32 protocol_min;
+ u32 feature_flags;
+ u32 protocol_max;
+
+ /* should be more than enough for future enhancements
+ * for now, feature_flags and the reserved array shall be zero.
+ */
+
+ u32 _pad;
+ u64 reserved[7];
+} __packed;
+
+struct p_barrier {
+ u32 barrier; /* barrier number _handle_ only */
+ u32 pad; /* to multiple of 8 Byte */
+} __packed;
+
+struct p_barrier_ack {
+ u32 barrier;
+ u32 set_size;
+} __packed;
+
+struct p_rs_param {
+ u32 resync_rate;
+
+ /* Since protocol version 88 and higher. */
+ char verify_alg[0];
+} __packed;
+
+struct p_rs_param_89 {
+ u32 resync_rate;
+ /* protocol version 89: */
+ char verify_alg[SHARED_SECRET_MAX];
+ char csums_alg[SHARED_SECRET_MAX];
+} __packed;
+
+struct p_rs_param_95 {
+ u32 resync_rate;
+ char verify_alg[SHARED_SECRET_MAX];
+ char csums_alg[SHARED_SECRET_MAX];
+ u32 c_plan_ahead;
+ u32 c_delay_target;
+ u32 c_fill_target;
+ u32 c_max_rate;
+} __packed;
+
+enum drbd_conn_flags {
+ CF_DISCARD_MY_DATA = 1,
+ CF_DRY_RUN = 2,
+};
+
+struct p_protocol {
+ u32 protocol;
+ u32 after_sb_0p;
+ u32 after_sb_1p;
+ u32 after_sb_2p;
+ u32 conn_flags;
+ u32 two_primaries;
+
+ /* Since protocol version 87 and higher. */
+ char integrity_alg[0];
+
+} __packed;
+
+struct p_uuids {
+ u64 uuid[UI_EXTENDED_SIZE];
+} __packed;
+
+struct p_rs_uuid {
+ u64 uuid;
+} __packed;
+
+struct p_sizes {
+ u64 d_size; /* size of disk */
+ u64 u_size; /* user requested size */
+ u64 c_size; /* current exported size */
+ u32 max_bio_size; /* Maximal size of a BIO */
+ u16 queue_order_type; /* not yet implemented in DRBD*/
+ u16 dds_flags; /* use enum dds_flags here. */
+} __packed;
+
+struct p_state {
+ u32 state;
+} __packed;
+
+struct p_req_state {
+ u32 mask;
+ u32 val;
+} __packed;
+
+struct p_req_state_reply {
+ u32 retcode;
+} __packed;
+
+struct p_drbd06_param {
+ u64 size;
+ u32 state;
+ u32 blksize;
+ u32 protocol;
+ u32 version;
+ u32 gen_cnt[5];
+ u32 bit_map_gen[5];
+} __packed;
+
+struct p_block_desc {
+ u64 sector;
+ u32 blksize;
+ u32 pad; /* to multiple of 8 Byte */
+} __packed;
+
+/* Valid values for the encoding field.
+ * Bump proto version when changing this. */
+enum drbd_bitmap_code {
+ /* RLE_VLI_Bytes = 0,
+ * and other bit variants had been defined during
+ * algorithm evaluation. */
+ RLE_VLI_Bits = 2,
+};
+
+struct p_compressed_bm {
+ /* (encoding & 0x0f): actual encoding, see enum drbd_bitmap_code
+ * (encoding & 0x80): polarity (set/unset) of first runlength
+ * ((encoding >> 4) & 0x07): pad_bits, number of trailing zero bits
+ * used to pad up to head.length bytes
+ */
+ u8 encoding;
+
+ u8 code[0];
+} __packed;
+
+struct p_delay_probe93 {
+ u32 seq_num; /* sequence number to match the two probe packets */
+ u32 offset; /* usecs the probe got sent after the reference time point */
+} __packed;
+
+/*
+ * Bitmap packets need to fit within a single page on the sender and receiver,
+ * so we are limited to 4 KiB (and not to PAGE_SIZE, which can be bigger).
+ */
+#define DRBD_SOCKET_BUFFER_SIZE 4096
+
+#endif /* __DRBD_PROTOCOL_H */
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
new file mode 100644
index 000000000..cee20354a
--- /dev/null
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -0,0 +1,5661 @@
+/*
+ drbd_receiver.c
+
+ This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+ Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
+ Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+ Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+ drbd is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2, or (at your option)
+ any later version.
+
+ drbd is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with drbd; see the file COPYING. If not, write to
+ the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+
+#include <linux/module.h>
+
+#include <asm/uaccess.h>
+#include <net/sock.h>
+
+#include <linux/drbd.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/in.h>
+#include <linux/mm.h>
+#include <linux/memcontrol.h>
+#include <linux/mm_inline.h>
+#include <linux/slab.h>
+#include <linux/pkt_sched.h>
+#define __KERNEL_SYSCALLS__
+#include <linux/unistd.h>
+#include <linux/vmalloc.h>
+#include <linux/random.h>
+#include <linux/string.h>
+#include <linux/scatterlist.h>
+#include "drbd_int.h"
+#include "drbd_protocol.h"
+#include "drbd_req.h"
+#include "drbd_vli.h"
+
+#define PRO_FEATURES (FF_TRIM)
+
+struct packet_info {
+ enum drbd_packet cmd;
+ unsigned int size;
+ unsigned int vnr;
+ void *data;
+};
+
+enum finish_epoch {
+ FE_STILL_LIVE,
+ FE_DESTROYED,
+ FE_RECYCLED,
+};
+
+static int drbd_do_features(struct drbd_connection *connection);
+static int drbd_do_auth(struct drbd_connection *connection);
+static int drbd_disconnected(struct drbd_peer_device *);
+static void conn_wait_active_ee_empty(struct drbd_connection *connection);
+static enum finish_epoch drbd_may_finish_epoch(struct drbd_connection *, struct drbd_epoch *, enum epoch_event);
+static int e_end_block(struct drbd_work *, int);
+
+
+#define GFP_TRY (__GFP_HIGHMEM | __GFP_NOWARN)
+
+/*
+ * some helper functions to deal with single linked page lists,
+ * page->private being our "next" pointer.
+ */
+
+/* If at least n pages are linked at head, get n pages off.
+ * Otherwise, don't modify head, and return NULL.
+ * Locking is the responsibility of the caller.
+ */
+static struct page *page_chain_del(struct page **head, int n)
+{
+ struct page *page;
+ struct page *tmp;
+
+ BUG_ON(!n);
+ BUG_ON(!head);
+
+ page = *head;
+
+ if (!page)
+ return NULL;
+
+ while (page) {
+ tmp = page_chain_next(page);
+ if (--n == 0)
+ break; /* found sufficient pages */
+ if (tmp == NULL)
+ /* insufficient pages, don't use any of them. */
+ return NULL;
+ page = tmp;
+ }
+
+ /* add end of list marker for the returned list */
+ set_page_private(page, 0);
+ /* actual return value, and adjustment of head */
+ page = *head;
+ *head = tmp;
+ return page;
+}
+
+/* may be used outside of locks to find the tail of a (usually short)
+ * "private" page chain, before adding it back to a global chain head
+ * with page_chain_add() under a spinlock. */
+static struct page *page_chain_tail(struct page *page, int *len)
+{
+ struct page *tmp;
+ int i = 1;
+ while ((tmp = page_chain_next(page)))
+ ++i, page = tmp;
+ if (len)
+ *len = i;
+ return page;
+}
+
+static int page_chain_free(struct page *page)
+{
+ struct page *tmp;
+ int i = 0;
+ page_chain_for_each_safe(page, tmp) {
+ put_page(page);
+ ++i;
+ }
+ return i;
+}
+
+static void page_chain_add(struct page **head,
+ struct page *chain_first, struct page *chain_last)
+{
+#if 1
+ struct page *tmp;
+ tmp = page_chain_tail(chain_first, NULL);
+ BUG_ON(tmp != chain_last);
+#endif
+
+ /* add chain to head */
+ set_page_private(chain_last, (unsigned long)*head);
+ *head = chain_first;
+}
+
+static struct page *__drbd_alloc_pages(struct drbd_device *device,
+ unsigned int number)
+{
+ struct page *page = NULL;
+ struct page *tmp = NULL;
+ unsigned int i = 0;
+
+ /* Yes, testing drbd_pp_vacant outside the lock is racy.
+ * So what. It saves a spin_lock. */
+ if (drbd_pp_vacant >= number) {
+ spin_lock(&drbd_pp_lock);
+ page = page_chain_del(&drbd_pp_pool, number);
+ if (page)
+ drbd_pp_vacant -= number;
+ spin_unlock(&drbd_pp_lock);
+ if (page)
+ return page;
+ }
+
+ /* GFP_TRY, because we must not cause arbitrary write-out: in a DRBD
+ * "criss-cross" setup, that might cause write-out on some other DRBD,
+ * which in turn might block on the other node at this very place. */
+ for (i = 0; i < number; i++) {
+ tmp = alloc_page(GFP_TRY);
+ if (!tmp)
+ break;
+ set_page_private(tmp, (unsigned long)page);
+ page = tmp;
+ }
+
+ if (i == number)
+ return page;
+
+ /* Not enough pages immediately available this time.
+ * No need to jump around here, drbd_alloc_pages will retry this
+ * function "soon". */
+ if (page) {
+ tmp = page_chain_tail(page, NULL);
+ spin_lock(&drbd_pp_lock);
+ page_chain_add(&drbd_pp_pool, page, tmp);
+ drbd_pp_vacant += i;
+ spin_unlock(&drbd_pp_lock);
+ }
+ return NULL;
+}
+
+static void reclaim_finished_net_peer_reqs(struct drbd_device *device,
+ struct list_head *to_be_freed)
+{
+ struct drbd_peer_request *peer_req, *tmp;
+
+ /* The EEs are always appended to the end of the list. Since
+ they are sent in order over the wire, they have to finish
+ in order. As soon as we see the first not finished we can
+ stop to examine the list... */
+
+ list_for_each_entry_safe(peer_req, tmp, &device->net_ee, w.list) {
+ if (drbd_peer_req_has_active_page(peer_req))
+ break;
+ list_move(&peer_req->w.list, to_be_freed);
+ }
+}
+
+static void drbd_kick_lo_and_reclaim_net(struct drbd_device *device)
+{
+ LIST_HEAD(reclaimed);
+ struct drbd_peer_request *peer_req, *t;
+
+ spin_lock_irq(&device->resource->req_lock);
+ reclaim_finished_net_peer_reqs(device, &reclaimed);
+ spin_unlock_irq(&device->resource->req_lock);
+
+ list_for_each_entry_safe(peer_req, t, &reclaimed, w.list)
+ drbd_free_net_peer_req(device, peer_req);
+}
+
+/**
+ * drbd_alloc_pages() - Returns @number pages, retries forever (or until signalled)
+ * @device: DRBD device.
+ * @number: number of pages requested
+ * @retry: whether to retry, if not enough pages are available right now
+ *
+ * Tries to allocate number pages, first from our own page pool, then from
+ * the kernel.
+ * Possibly retry until DRBD frees sufficient pages somewhere else.
+ *
+ * If this allocation would exceed the max_buffers setting, we throttle
+ * allocation (schedule_timeout) to give the system some room to breathe.
+ *
+ * We do not use max-buffers as hard limit, because it could lead to
+ * congestion and further to a distributed deadlock during online-verify or
+ * (checksum based) resync, if the max-buffers, socket buffer sizes and
+ * resync-rate settings are mis-configured.
+ *
+ * Returns a page chain linked via page->private.
+ */
+struct page *drbd_alloc_pages(struct drbd_peer_device *peer_device, unsigned int number,
+ bool retry)
+{
+ struct drbd_device *device = peer_device->device;
+ struct page *page = NULL;
+ struct net_conf *nc;
+ DEFINE_WAIT(wait);
+ unsigned int mxb;
+
+ rcu_read_lock();
+ nc = rcu_dereference(peer_device->connection->net_conf);
+ mxb = nc ? nc->max_buffers : 1000000;
+ rcu_read_unlock();
+
+ if (atomic_read(&device->pp_in_use) < mxb)
+ page = __drbd_alloc_pages(device, number);
+
+ while (page == NULL) {
+ prepare_to_wait(&drbd_pp_wait, &wait, TASK_INTERRUPTIBLE);
+
+ drbd_kick_lo_and_reclaim_net(device);
+
+ if (atomic_read(&device->pp_in_use) < mxb) {
+ page = __drbd_alloc_pages(device, number);
+ if (page)
+ break;
+ }
+
+ if (!retry)
+ break;
+
+ if (signal_pending(current)) {
+ drbd_warn(device, "drbd_alloc_pages interrupted!\n");
+ break;
+ }
+
+ if (schedule_timeout(HZ/10) == 0)
+ mxb = UINT_MAX;
+ }
+ finish_wait(&drbd_pp_wait, &wait);
+
+ if (page)
+ atomic_add(number, &device->pp_in_use);
+ return page;
+}
+
+/* Must not be used from irq, as that may deadlock: see drbd_alloc_pages.
+ * Is also used from inside an other spin_lock_irq(&resource->req_lock);
+ * Either links the page chain back to the global pool,
+ * or returns all pages to the system. */
+static void drbd_free_pages(struct drbd_device *device, struct page *page, int is_net)
+{
+ atomic_t *a = is_net ? &device->pp_in_use_by_net : &device->pp_in_use;
+ int i;
+
+ if (page == NULL)
+ return;
+
+ if (drbd_pp_vacant > (DRBD_MAX_BIO_SIZE/PAGE_SIZE) * minor_count)
+ i = page_chain_free(page);
+ else {
+ struct page *tmp;
+ tmp = page_chain_tail(page, &i);
+ spin_lock(&drbd_pp_lock);
+ page_chain_add(&drbd_pp_pool, page, tmp);
+ drbd_pp_vacant += i;
+ spin_unlock(&drbd_pp_lock);
+ }
+ i = atomic_sub_return(i, a);
+ if (i < 0)
+ drbd_warn(device, "ASSERTION FAILED: %s: %d < 0\n",
+ is_net ? "pp_in_use_by_net" : "pp_in_use", i);
+ wake_up(&drbd_pp_wait);
+}
+
+/*
+You need to hold the req_lock:
+ _drbd_wait_ee_list_empty()
+
+You must not have the req_lock:
+ drbd_free_peer_req()
+ drbd_alloc_peer_req()
+ drbd_free_peer_reqs()
+ drbd_ee_fix_bhs()
+ drbd_finish_peer_reqs()
+ drbd_clear_done_ee()
+ drbd_wait_ee_list_empty()
+*/
+
+struct drbd_peer_request *
+drbd_alloc_peer_req(struct drbd_peer_device *peer_device, u64 id, sector_t sector,
+ unsigned int data_size, bool has_payload, gfp_t gfp_mask) __must_hold(local)
+{
+ struct drbd_device *device = peer_device->device;
+ struct drbd_peer_request *peer_req;
+ struct page *page = NULL;
+ unsigned nr_pages = (data_size + PAGE_SIZE -1) >> PAGE_SHIFT;
+
+ if (drbd_insert_fault(device, DRBD_FAULT_AL_EE))
+ return NULL;
+
+ peer_req = mempool_alloc(drbd_ee_mempool, gfp_mask & ~__GFP_HIGHMEM);
+ if (!peer_req) {
+ if (!(gfp_mask & __GFP_NOWARN))
+ drbd_err(device, "%s: allocation failed\n", __func__);
+ return NULL;
+ }
+
+ if (has_payload && data_size) {
+ page = drbd_alloc_pages(peer_device, nr_pages, (gfp_mask & __GFP_WAIT));
+ if (!page)
+ goto fail;
+ }
+
+ memset(peer_req, 0, sizeof(*peer_req));
+ INIT_LIST_HEAD(&peer_req->w.list);
+ drbd_clear_interval(&peer_req->i);
+ peer_req->i.size = data_size;
+ peer_req->i.sector = sector;
+ peer_req->submit_jif = jiffies;
+ peer_req->peer_device = peer_device;
+ peer_req->pages = page;
+ /*
+ * The block_id is opaque to the receiver. It is not endianness
+ * converted, and sent back to the sender unchanged.
+ */
+ peer_req->block_id = id;
+
+ return peer_req;
+
+ fail:
+ mempool_free(peer_req, drbd_ee_mempool);
+ return NULL;
+}
+
+void __drbd_free_peer_req(struct drbd_device *device, struct drbd_peer_request *peer_req,
+ int is_net)
+{
+ might_sleep();
+ if (peer_req->flags & EE_HAS_DIGEST)
+ kfree(peer_req->digest);
+ drbd_free_pages(device, peer_req->pages, is_net);
+ D_ASSERT(device, atomic_read(&peer_req->pending_bios) == 0);
+ D_ASSERT(device, drbd_interval_empty(&peer_req->i));
+ if (!expect(!(peer_req->flags & EE_CALL_AL_COMPLETE_IO))) {
+ peer_req->flags &= ~EE_CALL_AL_COMPLETE_IO;
+ drbd_al_complete_io(device, &peer_req->i);
+ }
+ mempool_free(peer_req, drbd_ee_mempool);
+}
+
+int drbd_free_peer_reqs(struct drbd_device *device, struct list_head *list)
+{
+ LIST_HEAD(work_list);
+ struct drbd_peer_request *peer_req, *t;
+ int count = 0;
+ int is_net = list == &device->net_ee;
+
+ spin_lock_irq(&device->resource->req_lock);
+ list_splice_init(list, &work_list);
+ spin_unlock_irq(&device->resource->req_lock);
+
+ list_for_each_entry_safe(peer_req, t, &work_list, w.list) {
+ __drbd_free_peer_req(device, peer_req, is_net);
+ count++;
+ }
+ return count;
+}
+
+/*
+ * See also comments in _req_mod(,BARRIER_ACKED) and receive_Barrier.
+ */
+static int drbd_finish_peer_reqs(struct drbd_device *device)
+{
+ LIST_HEAD(work_list);
+ LIST_HEAD(reclaimed);
+ struct drbd_peer_request *peer_req, *t;
+ int err = 0;
+
+ spin_lock_irq(&device->resource->req_lock);
+ reclaim_finished_net_peer_reqs(device, &reclaimed);
+ list_splice_init(&device->done_ee, &work_list);
+ spin_unlock_irq(&device->resource->req_lock);
+
+ list_for_each_entry_safe(peer_req, t, &reclaimed, w.list)
+ drbd_free_net_peer_req(device, peer_req);
+
+ /* possible callbacks here:
+ * e_end_block, and e_end_resync_block, e_send_superseded.
+ * all ignore the last argument.
+ */
+ list_for_each_entry_safe(peer_req, t, &work_list, w.list) {
+ int err2;
+
+ /* list_del not necessary, next/prev members not touched */
+ err2 = peer_req->w.cb(&peer_req->w, !!err);
+ if (!err)
+ err = err2;
+ drbd_free_peer_req(device, peer_req);
+ }
+ wake_up(&device->ee_wait);
+
+ return err;
+}
+
+static void _drbd_wait_ee_list_empty(struct drbd_device *device,
+ struct list_head *head)
+{
+ DEFINE_WAIT(wait);
+
+ /* avoids spin_lock/unlock
+ * and calling prepare_to_wait in the fast path */
+ while (!list_empty(head)) {
+ prepare_to_wait(&device->ee_wait, &wait, TASK_UNINTERRUPTIBLE);
+ spin_unlock_irq(&device->resource->req_lock);
+ io_schedule();
+ finish_wait(&device->ee_wait, &wait);
+ spin_lock_irq(&device->resource->req_lock);
+ }
+}
+
+static void drbd_wait_ee_list_empty(struct drbd_device *device,
+ struct list_head *head)
+{
+ spin_lock_irq(&device->resource->req_lock);
+ _drbd_wait_ee_list_empty(device, head);
+ spin_unlock_irq(&device->resource->req_lock);
+}
+
+static int drbd_recv_short(struct socket *sock, void *buf, size_t size, int flags)
+{
+ struct kvec iov = {
+ .iov_base = buf,
+ .iov_len = size,
+ };
+ struct msghdr msg = {
+ .msg_flags = (flags ? flags : MSG_WAITALL | MSG_NOSIGNAL)
+ };
+ return kernel_recvmsg(sock, &msg, &iov, 1, size, msg.msg_flags);
+}
+
+static int drbd_recv(struct drbd_connection *connection, void *buf, size_t size)
+{
+ int rv;
+
+ rv = drbd_recv_short(connection->data.socket, buf, size, 0);
+
+ if (rv < 0) {
+ if (rv == -ECONNRESET)
+ drbd_info(connection, "sock was reset by peer\n");
+ else if (rv != -ERESTARTSYS)
+ drbd_err(connection, "sock_recvmsg returned %d\n", rv);
+ } else if (rv == 0) {
+ if (test_bit(DISCONNECT_SENT, &connection->flags)) {
+ long t;
+ rcu_read_lock();
+ t = rcu_dereference(connection->net_conf)->ping_timeo * HZ/10;
+ rcu_read_unlock();
+
+ t = wait_event_timeout(connection->ping_wait, connection->cstate < C_WF_REPORT_PARAMS, t);
+
+ if (t)
+ goto out;
+ }
+ drbd_info(connection, "sock was shut down by peer\n");
+ }
+
+ if (rv != size)
+ conn_request_state(connection, NS(conn, C_BROKEN_PIPE), CS_HARD);
+
+out:
+ return rv;
+}
+
+static int drbd_recv_all(struct drbd_connection *connection, void *buf, size_t size)
+{
+ int err;
+
+ err = drbd_recv(connection, buf, size);
+ if (err != size) {
+ if (err >= 0)
+ err = -EIO;
+ } else
+ err = 0;
+ return err;
+}
+
+static int drbd_recv_all_warn(struct drbd_connection *connection, void *buf, size_t size)
+{
+ int err;
+
+ err = drbd_recv_all(connection, buf, size);
+ if (err && !signal_pending(current))
+ drbd_warn(connection, "short read (expected size %d)\n", (int)size);
+ return err;
+}
+
+/* quoting tcp(7):
+ * On individual connections, the socket buffer size must be set prior to the
+ * listen(2) or connect(2) calls in order to have it take effect.
+ * This is our wrapper to do so.
+ */
+static void drbd_setbufsize(struct socket *sock, unsigned int snd,
+ unsigned int rcv)
+{
+ /* open coded SO_SNDBUF, SO_RCVBUF */
+ if (snd) {
+ sock->sk->sk_sndbuf = snd;
+ sock->sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
+ }
+ if (rcv) {
+ sock->sk->sk_rcvbuf = rcv;
+ sock->sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
+ }
+}
+
+static struct socket *drbd_try_connect(struct drbd_connection *connection)
+{
+ const char *what;
+ struct socket *sock;
+ struct sockaddr_in6 src_in6;
+ struct sockaddr_in6 peer_in6;
+ struct net_conf *nc;
+ int err, peer_addr_len, my_addr_len;
+ int sndbuf_size, rcvbuf_size, connect_int;
+ int disconnect_on_error = 1;
+
+ rcu_read_lock();
+ nc = rcu_dereference(connection->net_conf);
+ if (!nc) {
+ rcu_read_unlock();
+ return NULL;
+ }
+ sndbuf_size = nc->sndbuf_size;
+ rcvbuf_size = nc->rcvbuf_size;
+ connect_int = nc->connect_int;
+ rcu_read_unlock();
+
+ my_addr_len = min_t(int, connection->my_addr_len, sizeof(src_in6));
+ memcpy(&src_in6, &connection->my_addr, my_addr_len);
+
+ if (((struct sockaddr *)&connection->my_addr)->sa_family == AF_INET6)
+ src_in6.sin6_port = 0;
+ else
+ ((struct sockaddr_in *)&src_in6)->sin_port = 0; /* AF_INET & AF_SCI */
+
+ peer_addr_len = min_t(int, connection->peer_addr_len, sizeof(src_in6));
+ memcpy(&peer_in6, &connection->peer_addr, peer_addr_len);
+
+ what = "sock_create_kern";
+ err = sock_create_kern(((struct sockaddr *)&src_in6)->sa_family,
+ SOCK_STREAM, IPPROTO_TCP, &sock);
+ if (err < 0) {
+ sock = NULL;
+ goto out;
+ }
+
+ sock->sk->sk_rcvtimeo =
+ sock->sk->sk_sndtimeo = connect_int * HZ;
+ drbd_setbufsize(sock, sndbuf_size, rcvbuf_size);
+
+ /* explicitly bind to the configured IP as source IP
+ * for the outgoing connections.
+ * This is needed for multihomed hosts and to be
+ * able to use lo: interfaces for drbd.
+ * Make sure to use 0 as port number, so linux selects
+ * a free one dynamically.
+ */
+ what = "bind before connect";
+ err = sock->ops->bind(sock, (struct sockaddr *) &src_in6, my_addr_len);
+ if (err < 0)
+ goto out;
+
+ /* connect may fail, peer not yet available.
+ * stay C_WF_CONNECTION, don't go Disconnecting! */
+ disconnect_on_error = 0;
+ what = "connect";
+ err = sock->ops->connect(sock, (struct sockaddr *) &peer_in6, peer_addr_len, 0);
+
+out:
+ if (err < 0) {
+ if (sock) {
+ sock_release(sock);
+ sock = NULL;
+ }
+ switch (-err) {
+ /* timeout, busy, signal pending */
+ case ETIMEDOUT: case EAGAIN: case EINPROGRESS:
+ case EINTR: case ERESTARTSYS:
+ /* peer not (yet) available, network problem */
+ case ECONNREFUSED: case ENETUNREACH:
+ case EHOSTDOWN: case EHOSTUNREACH:
+ disconnect_on_error = 0;
+ break;
+ default:
+ drbd_err(connection, "%s failed, err = %d\n", what, err);
+ }
+ if (disconnect_on_error)
+ conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD);
+ }
+
+ return sock;
+}
+
+struct accept_wait_data {
+ struct drbd_connection *connection;
+ struct socket *s_listen;
+ struct completion door_bell;
+ void (*original_sk_state_change)(struct sock *sk);
+
+};
+
+static void drbd_incoming_connection(struct sock *sk)
+{
+ struct accept_wait_data *ad = sk->sk_user_data;
+ void (*state_change)(struct sock *sk);
+
+ state_change = ad->original_sk_state_change;
+ if (sk->sk_state == TCP_ESTABLISHED)
+ complete(&ad->door_bell);
+ state_change(sk);
+}
+
+static int prepare_listen_socket(struct drbd_connection *connection, struct accept_wait_data *ad)
+{
+ int err, sndbuf_size, rcvbuf_size, my_addr_len;
+ struct sockaddr_in6 my_addr;
+ struct socket *s_listen;
+ struct net_conf *nc;
+ const char *what;
+
+ rcu_read_lock();
+ nc = rcu_dereference(connection->net_conf);
+ if (!nc) {
+ rcu_read_unlock();
+ return -EIO;
+ }
+ sndbuf_size = nc->sndbuf_size;
+ rcvbuf_size = nc->rcvbuf_size;
+ rcu_read_unlock();
+
+ my_addr_len = min_t(int, connection->my_addr_len, sizeof(struct sockaddr_in6));
+ memcpy(&my_addr, &connection->my_addr, my_addr_len);
+
+ what = "sock_create_kern";
+ err = sock_create_kern(((struct sockaddr *)&my_addr)->sa_family,
+ SOCK_STREAM, IPPROTO_TCP, &s_listen);
+ if (err) {
+ s_listen = NULL;
+ goto out;
+ }
+
+ s_listen->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */
+ drbd_setbufsize(s_listen, sndbuf_size, rcvbuf_size);
+
+ what = "bind before listen";
+ err = s_listen->ops->bind(s_listen, (struct sockaddr *)&my_addr, my_addr_len);
+ if (err < 0)
+ goto out;
+
+ ad->s_listen = s_listen;
+ write_lock_bh(&s_listen->sk->sk_callback_lock);
+ ad->original_sk_state_change = s_listen->sk->sk_state_change;
+ s_listen->sk->sk_state_change = drbd_incoming_connection;
+ s_listen->sk->sk_user_data = ad;
+ write_unlock_bh(&s_listen->sk->sk_callback_lock);
+
+ what = "listen";
+ err = s_listen->ops->listen(s_listen, 5);
+ if (err < 0)
+ goto out;
+
+ return 0;
+out:
+ if (s_listen)
+ sock_release(s_listen);
+ if (err < 0) {
+ if (err != -EAGAIN && err != -EINTR && err != -ERESTARTSYS) {
+ drbd_err(connection, "%s failed, err = %d\n", what, err);
+ conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD);
+ }
+ }
+
+ return -EIO;
+}
+
+static void unregister_state_change(struct sock *sk, struct accept_wait_data *ad)
+{
+ write_lock_bh(&sk->sk_callback_lock);
+ sk->sk_state_change = ad->original_sk_state_change;
+ sk->sk_user_data = NULL;
+ write_unlock_bh(&sk->sk_callback_lock);
+}
+
+static struct socket *drbd_wait_for_connect(struct drbd_connection *connection, struct accept_wait_data *ad)
+{
+ int timeo, connect_int, err = 0;
+ struct socket *s_estab = NULL;
+ struct net_conf *nc;
+
+ rcu_read_lock();
+ nc = rcu_dereference(connection->net_conf);
+ if (!nc) {
+ rcu_read_unlock();
+ return NULL;
+ }
+ connect_int = nc->connect_int;
+ rcu_read_unlock();
+
+ timeo = connect_int * HZ;
+ /* 28.5% random jitter */
+ timeo += (prandom_u32() & 1) ? timeo / 7 : -timeo / 7;
+
+ err = wait_for_completion_interruptible_timeout(&ad->door_bell, timeo);
+ if (err <= 0)
+ return NULL;
+
+ err = kernel_accept(ad->s_listen, &s_estab, 0);
+ if (err < 0) {
+ if (err != -EAGAIN && err != -EINTR && err != -ERESTARTSYS) {
+ drbd_err(connection, "accept failed, err = %d\n", err);
+ conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD);
+ }
+ }
+
+ if (s_estab)
+ unregister_state_change(s_estab->sk, ad);
+
+ return s_estab;
+}
+
+static int decode_header(struct drbd_connection *, void *, struct packet_info *);
+
+static int send_first_packet(struct drbd_connection *connection, struct drbd_socket *sock,
+ enum drbd_packet cmd)
+{
+ if (!conn_prepare_command(connection, sock))
+ return -EIO;
+ return conn_send_command(connection, sock, cmd, 0, NULL, 0);
+}
+
+static int receive_first_packet(struct drbd_connection *connection, struct socket *sock)
+{
+ unsigned int header_size = drbd_header_size(connection);
+ struct packet_info pi;
+ struct net_conf *nc;
+ int err;
+
+ rcu_read_lock();
+ nc = rcu_dereference(connection->net_conf);
+ if (!nc) {
+ rcu_read_unlock();
+ return -EIO;
+ }
+ sock->sk->sk_rcvtimeo = nc->ping_timeo * 4 * HZ / 10;
+ rcu_read_unlock();
+
+ err = drbd_recv_short(sock, connection->data.rbuf, header_size, 0);
+ if (err != header_size) {
+ if (err >= 0)
+ err = -EIO;
+ return err;
+ }
+ err = decode_header(connection, connection->data.rbuf, &pi);
+ if (err)
+ return err;
+ return pi.cmd;
+}
+
+/**
+ * drbd_socket_okay() - Free the socket if its connection is not okay
+ * @sock: pointer to the pointer to the socket.
+ */
+static bool drbd_socket_okay(struct socket **sock)
+{
+ int rr;
+ char tb[4];
+
+ if (!*sock)
+ return false;
+
+ rr = drbd_recv_short(*sock, tb, 4, MSG_DONTWAIT | MSG_PEEK);
+
+ if (rr > 0 || rr == -EAGAIN) {
+ return true;
+ } else {
+ sock_release(*sock);
+ *sock = NULL;
+ return false;
+ }
+}
+
+static bool connection_established(struct drbd_connection *connection,
+ struct socket **sock1,
+ struct socket **sock2)
+{
+ struct net_conf *nc;
+ int timeout;
+ bool ok;
+
+ if (!*sock1 || !*sock2)
+ return false;
+
+ rcu_read_lock();
+ nc = rcu_dereference(connection->net_conf);
+ timeout = (nc->sock_check_timeo ?: nc->ping_timeo) * HZ / 10;
+ rcu_read_unlock();
+ schedule_timeout_interruptible(timeout);
+
+ ok = drbd_socket_okay(sock1);
+ ok = drbd_socket_okay(sock2) && ok;
+
+ return ok;
+}
+
+/* Gets called if a connection is established, or if a new minor gets created
+ in a connection */
+int drbd_connected(struct drbd_peer_device *peer_device)
+{
+ struct drbd_device *device = peer_device->device;
+ int err;
+
+ atomic_set(&device->packet_seq, 0);
+ device->peer_seq = 0;
+
+ device->state_mutex = peer_device->connection->agreed_pro_version < 100 ?
+ &peer_device->connection->cstate_mutex :
+ &device->own_state_mutex;
+
+ err = drbd_send_sync_param(peer_device);
+ if (!err)
+ err = drbd_send_sizes(peer_device, 0, 0);
+ if (!err)
+ err = drbd_send_uuids(peer_device);
+ if (!err)
+ err = drbd_send_current_state(peer_device);
+ clear_bit(USE_DEGR_WFC_T, &device->flags);
+ clear_bit(RESIZE_PENDING, &device->flags);
+ atomic_set(&device->ap_in_flight, 0);
+ mod_timer(&device->request_timer, jiffies + HZ); /* just start it here. */
+ return err;
+}
+
+/*
+ * return values:
+ * 1 yes, we have a valid connection
+ * 0 oops, did not work out, please try again
+ * -1 peer talks different language,
+ * no point in trying again, please go standalone.
+ * -2 We do not have a network config...
+ */
+static int conn_connect(struct drbd_connection *connection)
+{
+ struct drbd_socket sock, msock;
+ struct drbd_peer_device *peer_device;
+ struct net_conf *nc;
+ int vnr, timeout, h;
+ bool discard_my_data, ok;
+ enum drbd_state_rv rv;
+ struct accept_wait_data ad = {
+ .connection = connection,
+ .door_bell = COMPLETION_INITIALIZER_ONSTACK(ad.door_bell),
+ };
+
+ clear_bit(DISCONNECT_SENT, &connection->flags);
+ if (conn_request_state(connection, NS(conn, C_WF_CONNECTION), CS_VERBOSE) < SS_SUCCESS)
+ return -2;
+
+ mutex_init(&sock.mutex);
+ sock.sbuf = connection->data.sbuf;
+ sock.rbuf = connection->data.rbuf;
+ sock.socket = NULL;
+ mutex_init(&msock.mutex);
+ msock.sbuf = connection->meta.sbuf;
+ msock.rbuf = connection->meta.rbuf;
+ msock.socket = NULL;
+
+ /* Assume that the peer only understands protocol 80 until we know better. */
+ connection->agreed_pro_version = 80;
+
+ if (prepare_listen_socket(connection, &ad))
+ return 0;
+
+ do {
+ struct socket *s;
+
+ s = drbd_try_connect(connection);
+ if (s) {
+ if (!sock.socket) {
+ sock.socket = s;
+ send_first_packet(connection, &sock, P_INITIAL_DATA);
+ } else if (!msock.socket) {
+ clear_bit(RESOLVE_CONFLICTS, &connection->flags);
+ msock.socket = s;
+ send_first_packet(connection, &msock, P_INITIAL_META);
+ } else {
+ drbd_err(connection, "Logic error in conn_connect()\n");
+ goto out_release_sockets;
+ }
+ }
+
+ if (connection_established(connection, &sock.socket, &msock.socket))
+ break;
+
+retry:
+ s = drbd_wait_for_connect(connection, &ad);
+ if (s) {
+ int fp = receive_first_packet(connection, s);
+ drbd_socket_okay(&sock.socket);
+ drbd_socket_okay(&msock.socket);
+ switch (fp) {
+ case P_INITIAL_DATA:
+ if (sock.socket) {
+ drbd_warn(connection, "initial packet S crossed\n");
+ sock_release(sock.socket);
+ sock.socket = s;
+ goto randomize;
+ }
+ sock.socket = s;
+ break;
+ case P_INITIAL_META:
+ set_bit(RESOLVE_CONFLICTS, &connection->flags);
+ if (msock.socket) {
+ drbd_warn(connection, "initial packet M crossed\n");
+ sock_release(msock.socket);
+ msock.socket = s;
+ goto randomize;
+ }
+ msock.socket = s;
+ break;
+ default:
+ drbd_warn(connection, "Error receiving initial packet\n");
+ sock_release(s);
+randomize:
+ if (prandom_u32() & 1)
+ goto retry;
+ }
+ }
+
+ if (connection->cstate <= C_DISCONNECTING)
+ goto out_release_sockets;
+ if (signal_pending(current)) {
+ flush_signals(current);
+ smp_rmb();
+ if (get_t_state(&connection->receiver) == EXITING)
+ goto out_release_sockets;
+ }
+
+ ok = connection_established(connection, &sock.socket, &msock.socket);
+ } while (!ok);
+
+ if (ad.s_listen)
+ sock_release(ad.s_listen);
+
+ sock.socket->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */
+ msock.socket->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */
+
+ sock.socket->sk->sk_allocation = GFP_NOIO;
+ msock.socket->sk->sk_allocation = GFP_NOIO;
+
+ sock.socket->sk->sk_priority = TC_PRIO_INTERACTIVE_BULK;
+ msock.socket->sk->sk_priority = TC_PRIO_INTERACTIVE;
+
+ /* NOT YET ...
+ * sock.socket->sk->sk_sndtimeo = connection->net_conf->timeout*HZ/10;
+ * sock.socket->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
+ * first set it to the P_CONNECTION_FEATURES timeout,
+ * which we set to 4x the configured ping_timeout. */
+ rcu_read_lock();
+ nc = rcu_dereference(connection->net_conf);
+
+ sock.socket->sk->sk_sndtimeo =
+ sock.socket->sk->sk_rcvtimeo = nc->ping_timeo*4*HZ/10;
+
+ msock.socket->sk->sk_rcvtimeo = nc->ping_int*HZ;
+ timeout = nc->timeout * HZ / 10;
+ discard_my_data = nc->discard_my_data;
+ rcu_read_unlock();
+
+ msock.socket->sk->sk_sndtimeo = timeout;
+
+ /* we don't want delays.
+ * we use TCP_CORK where appropriate, though */
+ drbd_tcp_nodelay(sock.socket);
+ drbd_tcp_nodelay(msock.socket);
+
+ connection->data.socket = sock.socket;
+ connection->meta.socket = msock.socket;
+ connection->last_received = jiffies;
+
+ h = drbd_do_features(connection);
+ if (h <= 0)
+ return h;
+
+ if (connection->cram_hmac_tfm) {
+ /* drbd_request_state(device, NS(conn, WFAuth)); */
+ switch (drbd_do_auth(connection)) {
+ case -1:
+ drbd_err(connection, "Authentication of peer failed\n");
+ return -1;
+ case 0:
+ drbd_err(connection, "Authentication of peer failed, trying again.\n");
+ return 0;
+ }
+ }
+
+ connection->data.socket->sk->sk_sndtimeo = timeout;
+ connection->data.socket->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
+
+ if (drbd_send_protocol(connection) == -EOPNOTSUPP)
+ return -1;
+
+ /* Prevent a race between resync-handshake and
+ * being promoted to Primary.
+ *
+ * Grab and release the state mutex, so we know that any current
+ * drbd_set_role() is finished, and any incoming drbd_set_role
+ * will see the STATE_SENT flag, and wait for it to be cleared.
+ */
+ idr_for_each_entry(&connection->peer_devices, peer_device, vnr)
+ mutex_lock(peer_device->device->state_mutex);
+
+ set_bit(STATE_SENT, &connection->flags);
+
+ idr_for_each_entry(&connection->peer_devices, peer_device, vnr)
+ mutex_unlock(peer_device->device->state_mutex);
+
+ rcu_read_lock();
+ idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+ struct drbd_device *device = peer_device->device;
+ kref_get(&device->kref);
+ rcu_read_unlock();
+
+ if (discard_my_data)
+ set_bit(DISCARD_MY_DATA, &device->flags);
+ else
+ clear_bit(DISCARD_MY_DATA, &device->flags);
+
+ drbd_connected(peer_device);
+ kref_put(&device->kref, drbd_destroy_device);
+ rcu_read_lock();
+ }
+ rcu_read_unlock();
+
+ rv = conn_request_state(connection, NS(conn, C_WF_REPORT_PARAMS), CS_VERBOSE);
+ if (rv < SS_SUCCESS || connection->cstate != C_WF_REPORT_PARAMS) {
+ clear_bit(STATE_SENT, &connection->flags);
+ return 0;
+ }
+
+ drbd_thread_start(&connection->asender);
+
+ mutex_lock(&connection->resource->conf_update);
+ /* The discard_my_data flag is a single-shot modifier to the next
+ * connection attempt, the handshake of which is now well underway.
+ * No need for rcu style copying of the whole struct
+ * just to clear a single value. */
+ connection->net_conf->discard_my_data = 0;
+ mutex_unlock(&connection->resource->conf_update);
+
+ return h;
+
+out_release_sockets:
+ if (ad.s_listen)
+ sock_release(ad.s_listen);
+ if (sock.socket)
+ sock_release(sock.socket);
+ if (msock.socket)
+ sock_release(msock.socket);
+ return -1;
+}
+
+static int decode_header(struct drbd_connection *connection, void *header, struct packet_info *pi)
+{
+ unsigned int header_size = drbd_header_size(connection);
+
+ if (header_size == sizeof(struct p_header100) &&
+ *(__be32 *)header == cpu_to_be32(DRBD_MAGIC_100)) {
+ struct p_header100 *h = header;
+ if (h->pad != 0) {
+ drbd_err(connection, "Header padding is not zero\n");
+ return -EINVAL;
+ }
+ pi->vnr = be16_to_cpu(h->volume);
+ pi->cmd = be16_to_cpu(h->command);
+ pi->size = be32_to_cpu(h->length);
+ } else if (header_size == sizeof(struct p_header95) &&
+ *(__be16 *)header == cpu_to_be16(DRBD_MAGIC_BIG)) {
+ struct p_header95 *h = header;
+ pi->cmd = be16_to_cpu(h->command);
+ pi->size = be32_to_cpu(h->length);
+ pi->vnr = 0;
+ } else if (header_size == sizeof(struct p_header80) &&
+ *(__be32 *)header == cpu_to_be32(DRBD_MAGIC)) {
+ struct p_header80 *h = header;
+ pi->cmd = be16_to_cpu(h->command);
+ pi->size = be16_to_cpu(h->length);
+ pi->vnr = 0;
+ } else {
+ drbd_err(connection, "Wrong magic value 0x%08x in protocol version %d\n",
+ be32_to_cpu(*(__be32 *)header),
+ connection->agreed_pro_version);
+ return -EINVAL;
+ }
+ pi->data = header + header_size;
+ return 0;
+}
+
+static int drbd_recv_header(struct drbd_connection *connection, struct packet_info *pi)
+{
+ void *buffer = connection->data.rbuf;
+ int err;
+
+ err = drbd_recv_all_warn(connection, buffer, drbd_header_size(connection));
+ if (err)
+ return err;
+
+ err = decode_header(connection, buffer, pi);
+ connection->last_received = jiffies;
+
+ return err;
+}
+
+static void drbd_flush(struct drbd_connection *connection)
+{
+ int rv;
+ struct drbd_peer_device *peer_device;
+ int vnr;
+
+ if (connection->resource->write_ordering >= WO_bdev_flush) {
+ rcu_read_lock();
+ idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+ struct drbd_device *device = peer_device->device;
+
+ if (!get_ldev(device))
+ continue;
+ kref_get(&device->kref);
+ rcu_read_unlock();
+
+ /* Right now, we have only this one synchronous code path
+ * for flushes between request epochs.
+ * We may want to make those asynchronous,
+ * or at least parallelize the flushes to the volume devices.
+ */
+ device->flush_jif = jiffies;
+ set_bit(FLUSH_PENDING, &device->flags);
+ rv = blkdev_issue_flush(device->ldev->backing_bdev,
+ GFP_NOIO, NULL);
+ clear_bit(FLUSH_PENDING, &device->flags);
+ if (rv) {
+ drbd_info(device, "local disk flush failed with status %d\n", rv);
+ /* would rather check on EOPNOTSUPP, but that is not reliable.
+ * don't try again for ANY return value != 0
+ * if (rv == -EOPNOTSUPP) */
+ drbd_bump_write_ordering(connection->resource, NULL, WO_drain_io);
+ }
+ put_ldev(device);
+ kref_put(&device->kref, drbd_destroy_device);
+
+ rcu_read_lock();
+ if (rv)
+ break;
+ }
+ rcu_read_unlock();
+ }
+}
+
+/**
+ * drbd_may_finish_epoch() - Applies an epoch_event to the epoch's state, eventually finishes it.
+ * @device: DRBD device.
+ * @epoch: Epoch object.
+ * @ev: Epoch event.
+ */
+static enum finish_epoch drbd_may_finish_epoch(struct drbd_connection *connection,
+ struct drbd_epoch *epoch,
+ enum epoch_event ev)
+{
+ int epoch_size;
+ struct drbd_epoch *next_epoch;
+ enum finish_epoch rv = FE_STILL_LIVE;
+
+ spin_lock(&connection->epoch_lock);
+ do {
+ next_epoch = NULL;
+
+ epoch_size = atomic_read(&epoch->epoch_size);
+
+ switch (ev & ~EV_CLEANUP) {
+ case EV_PUT:
+ atomic_dec(&epoch->active);
+ break;
+ case EV_GOT_BARRIER_NR:
+ set_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags);
+ break;
+ case EV_BECAME_LAST:
+ /* nothing to do*/
+ break;
+ }
+
+ if (epoch_size != 0 &&
+ atomic_read(&epoch->active) == 0 &&
+ (test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags) || ev & EV_CLEANUP)) {
+ if (!(ev & EV_CLEANUP)) {
+ spin_unlock(&connection->epoch_lock);
+ drbd_send_b_ack(epoch->connection, epoch->barrier_nr, epoch_size);
+ spin_lock(&connection->epoch_lock);
+ }
+#if 0
+ /* FIXME: dec unacked on connection, once we have
+ * something to count pending connection packets in. */
+ if (test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags))
+ dec_unacked(epoch->connection);
+#endif
+
+ if (connection->current_epoch != epoch) {
+ next_epoch = list_entry(epoch->list.next, struct drbd_epoch, list);
+ list_del(&epoch->list);
+ ev = EV_BECAME_LAST | (ev & EV_CLEANUP);
+ connection->epochs--;
+ kfree(epoch);
+
+ if (rv == FE_STILL_LIVE)
+ rv = FE_DESTROYED;
+ } else {
+ epoch->flags = 0;
+ atomic_set(&epoch->epoch_size, 0);
+ /* atomic_set(&epoch->active, 0); is already zero */
+ if (rv == FE_STILL_LIVE)
+ rv = FE_RECYCLED;
+ }
+ }
+
+ if (!next_epoch)
+ break;
+
+ epoch = next_epoch;
+ } while (1);
+
+ spin_unlock(&connection->epoch_lock);
+
+ return rv;
+}
+
+static enum write_ordering_e
+max_allowed_wo(struct drbd_backing_dev *bdev, enum write_ordering_e wo)
+{
+ struct disk_conf *dc;
+
+ dc = rcu_dereference(bdev->disk_conf);
+
+ if (wo == WO_bdev_flush && !dc->disk_flushes)
+ wo = WO_drain_io;
+ if (wo == WO_drain_io && !dc->disk_drain)
+ wo = WO_none;
+
+ return wo;
+}
+
+/**
+ * drbd_bump_write_ordering() - Fall back to an other write ordering method
+ * @connection: DRBD connection.
+ * @wo: Write ordering method to try.
+ */
+void drbd_bump_write_ordering(struct drbd_resource *resource, struct drbd_backing_dev *bdev,
+ enum write_ordering_e wo)
+{
+ struct drbd_device *device;
+ enum write_ordering_e pwo;
+ int vnr;
+ static char *write_ordering_str[] = {
+ [WO_none] = "none",
+ [WO_drain_io] = "drain",
+ [WO_bdev_flush] = "flush",
+ };
+
+ pwo = resource->write_ordering;
+ if (wo != WO_bdev_flush)
+ wo = min(pwo, wo);
+ rcu_read_lock();
+ idr_for_each_entry(&resource->devices, device, vnr) {
+ if (get_ldev(device)) {
+ wo = max_allowed_wo(device->ldev, wo);
+ if (device->ldev == bdev)
+ bdev = NULL;
+ put_ldev(device);
+ }
+ }
+
+ if (bdev)
+ wo = max_allowed_wo(bdev, wo);
+
+ rcu_read_unlock();
+
+ resource->write_ordering = wo;
+ if (pwo != resource->write_ordering || wo == WO_bdev_flush)
+ drbd_info(resource, "Method to ensure write ordering: %s\n", write_ordering_str[resource->write_ordering]);
+}
+
+/**
+ * drbd_submit_peer_request()
+ * @device: DRBD device.
+ * @peer_req: peer request
+ * @rw: flag field, see bio->bi_rw
+ *
+ * May spread the pages to multiple bios,
+ * depending on bio_add_page restrictions.
+ *
+ * Returns 0 if all bios have been submitted,
+ * -ENOMEM if we could not allocate enough bios,
+ * -ENOSPC (any better suggestion?) if we have not been able to bio_add_page a
+ * single page to an empty bio (which should never happen and likely indicates
+ * that the lower level IO stack is in some way broken). This has been observed
+ * on certain Xen deployments.
+ */
+/* TODO allocate from our own bio_set. */
+int drbd_submit_peer_request(struct drbd_device *device,
+ struct drbd_peer_request *peer_req,
+ const unsigned rw, const int fault_type)
+{
+ struct bio *bios = NULL;
+ struct bio *bio;
+ struct page *page = peer_req->pages;
+ sector_t sector = peer_req->i.sector;
+ unsigned data_size = peer_req->i.size;
+ unsigned n_bios = 0;
+ unsigned nr_pages = (data_size + PAGE_SIZE -1) >> PAGE_SHIFT;
+ int err = -ENOMEM;
+
+ if (peer_req->flags & EE_IS_TRIM_USE_ZEROOUT) {
+ /* wait for all pending IO completions, before we start
+ * zeroing things out. */
+ conn_wait_active_ee_empty(first_peer_device(device)->connection);
+ /* add it to the active list now,
+ * so we can find it to present it in debugfs */
+ peer_req->submit_jif = jiffies;
+ peer_req->flags |= EE_SUBMITTED;
+ spin_lock_irq(&device->resource->req_lock);
+ list_add_tail(&peer_req->w.list, &device->active_ee);
+ spin_unlock_irq(&device->resource->req_lock);
+ if (blkdev_issue_zeroout(device->ldev->backing_bdev,
+ sector, data_size >> 9, GFP_NOIO, false))
+ peer_req->flags |= EE_WAS_ERROR;
+ drbd_endio_write_sec_final(peer_req);
+ return 0;
+ }
+
+ /* Discards don't have any payload.
+ * But the scsi layer still expects a bio_vec it can use internally,
+ * see sd_setup_discard_cmnd() and blk_add_request_payload(). */
+ if (peer_req->flags & EE_IS_TRIM)
+ nr_pages = 1;
+
+ /* In most cases, we will only need one bio. But in case the lower
+ * level restrictions happen to be different at this offset on this
+ * side than those of the sending peer, we may need to submit the
+ * request in more than one bio.
+ *
+ * Plain bio_alloc is good enough here, this is no DRBD internally
+ * generated bio, but a bio allocated on behalf of the peer.
+ */
+next_bio:
+ bio = bio_alloc(GFP_NOIO, nr_pages);
+ if (!bio) {
+ drbd_err(device, "submit_ee: Allocation of a bio failed (nr_pages=%u)\n", nr_pages);
+ goto fail;
+ }
+ /* > peer_req->i.sector, unless this is the first bio */
+ bio->bi_iter.bi_sector = sector;
+ bio->bi_bdev = device->ldev->backing_bdev;
+ bio->bi_rw = rw;
+ bio->bi_private = peer_req;
+ bio->bi_end_io = drbd_peer_request_endio;
+
+ bio->bi_next = bios;
+ bios = bio;
+ ++n_bios;
+
+ if (rw & REQ_DISCARD) {
+ bio->bi_iter.bi_size = data_size;
+ goto submit;
+ }
+
+ page_chain_for_each(page) {
+ unsigned len = min_t(unsigned, data_size, PAGE_SIZE);
+ if (!bio_add_page(bio, page, len, 0)) {
+ /* A single page must always be possible!
+ * But in case it fails anyways,
+ * we deal with it, and complain (below). */
+ if (bio->bi_vcnt == 0) {
+ drbd_err(device,
+ "bio_add_page failed for len=%u, "
+ "bi_vcnt=0 (bi_sector=%llu)\n",
+ len, (uint64_t)bio->bi_iter.bi_sector);
+ err = -ENOSPC;
+ goto fail;
+ }
+ goto next_bio;
+ }
+ data_size -= len;
+ sector += len >> 9;
+ --nr_pages;
+ }
+ D_ASSERT(device, data_size == 0);
+submit:
+ D_ASSERT(device, page == NULL);
+
+ atomic_set(&peer_req->pending_bios, n_bios);
+ /* for debugfs: update timestamp, mark as submitted */
+ peer_req->submit_jif = jiffies;
+ peer_req->flags |= EE_SUBMITTED;
+ do {
+ bio = bios;
+ bios = bios->bi_next;
+ bio->bi_next = NULL;
+
+ drbd_generic_make_request(device, fault_type, bio);
+ } while (bios);
+ return 0;
+
+fail:
+ while (bios) {
+ bio = bios;
+ bios = bios->bi_next;
+ bio_put(bio);
+ }
+ return err;
+}
+
+static void drbd_remove_epoch_entry_interval(struct drbd_device *device,
+ struct drbd_peer_request *peer_req)
+{
+ struct drbd_interval *i = &peer_req->i;
+
+ drbd_remove_interval(&device->write_requests, i);
+ drbd_clear_interval(i);
+
+ /* Wake up any processes waiting for this peer request to complete. */
+ if (i->waiting)
+ wake_up(&device->misc_wait);
+}
+
+static void conn_wait_active_ee_empty(struct drbd_connection *connection)
+{
+ struct drbd_peer_device *peer_device;
+ int vnr;
+
+ rcu_read_lock();
+ idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+ struct drbd_device *device = peer_device->device;
+
+ kref_get(&device->kref);
+ rcu_read_unlock();
+ drbd_wait_ee_list_empty(device, &device->active_ee);
+ kref_put(&device->kref, drbd_destroy_device);
+ rcu_read_lock();
+ }
+ rcu_read_unlock();
+}
+
+static struct drbd_peer_device *
+conn_peer_device(struct drbd_connection *connection, int volume_number)
+{
+ return idr_find(&connection->peer_devices, volume_number);
+}
+
+static int receive_Barrier(struct drbd_connection *connection, struct packet_info *pi)
+{
+ int rv;
+ struct p_barrier *p = pi->data;
+ struct drbd_epoch *epoch;
+
+ /* FIXME these are unacked on connection,
+ * not a specific (peer)device.
+ */
+ connection->current_epoch->barrier_nr = p->barrier;
+ connection->current_epoch->connection = connection;
+ rv = drbd_may_finish_epoch(connection, connection->current_epoch, EV_GOT_BARRIER_NR);
+
+ /* P_BARRIER_ACK may imply that the corresponding extent is dropped from
+ * the activity log, which means it would not be resynced in case the
+ * R_PRIMARY crashes now.
+ * Therefore we must send the barrier_ack after the barrier request was
+ * completed. */
+ switch (connection->resource->write_ordering) {
+ case WO_none:
+ if (rv == FE_RECYCLED)
+ return 0;
+
+ /* receiver context, in the writeout path of the other node.
+ * avoid potential distributed deadlock */
+ epoch = kmalloc(sizeof(struct drbd_epoch), GFP_NOIO);
+ if (epoch)
+ break;
+ else
+ drbd_warn(connection, "Allocation of an epoch failed, slowing down\n");
+ /* Fall through */
+
+ case WO_bdev_flush:
+ case WO_drain_io:
+ conn_wait_active_ee_empty(connection);
+ drbd_flush(connection);
+
+ if (atomic_read(&connection->current_epoch->epoch_size)) {
+ epoch = kmalloc(sizeof(struct drbd_epoch), GFP_NOIO);
+ if (epoch)
+ break;
+ }
+
+ return 0;
+ default:
+ drbd_err(connection, "Strangeness in connection->write_ordering %d\n",
+ connection->resource->write_ordering);
+ return -EIO;
+ }
+
+ epoch->flags = 0;
+ atomic_set(&epoch->epoch_size, 0);
+ atomic_set(&epoch->active, 0);
+
+ spin_lock(&connection->epoch_lock);
+ if (atomic_read(&connection->current_epoch->epoch_size)) {
+ list_add(&epoch->list, &connection->current_epoch->list);
+ connection->current_epoch = epoch;
+ connection->epochs++;
+ } else {
+ /* The current_epoch got recycled while we allocated this one... */
+ kfree(epoch);
+ }
+ spin_unlock(&connection->epoch_lock);
+
+ return 0;
+}
+
+/* used from receive_RSDataReply (recv_resync_read)
+ * and from receive_Data */
+static struct drbd_peer_request *
+read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector,
+ struct packet_info *pi) __must_hold(local)
+{
+ struct drbd_device *device = peer_device->device;
+ const sector_t capacity = drbd_get_capacity(device->this_bdev);
+ struct drbd_peer_request *peer_req;
+ struct page *page;
+ int digest_size, err;
+ unsigned int data_size = pi->size, ds;
+ void *dig_in = peer_device->connection->int_dig_in;
+ void *dig_vv = peer_device->connection->int_dig_vv;
+ unsigned long *data;
+ struct p_trim *trim = (pi->cmd == P_TRIM) ? pi->data : NULL;
+
+ digest_size = 0;
+ if (!trim && peer_device->connection->peer_integrity_tfm) {
+ digest_size = crypto_hash_digestsize(peer_device->connection->peer_integrity_tfm);
+ /*
+ * FIXME: Receive the incoming digest into the receive buffer
+ * here, together with its struct p_data?
+ */
+ err = drbd_recv_all_warn(peer_device->connection, dig_in, digest_size);
+ if (err)
+ return NULL;
+ data_size -= digest_size;
+ }
+
+ if (trim) {
+ D_ASSERT(peer_device, data_size == 0);
+ data_size = be32_to_cpu(trim->size);
+ }
+
+ if (!expect(IS_ALIGNED(data_size, 512)))
+ return NULL;
+ /* prepare for larger trim requests. */
+ if (!trim && !expect(data_size <= DRBD_MAX_BIO_SIZE))
+ return NULL;
+
+ /* even though we trust out peer,
+ * we sometimes have to double check. */
+ if (sector + (data_size>>9) > capacity) {
+ drbd_err(device, "request from peer beyond end of local disk: "
+ "capacity: %llus < sector: %llus + size: %u\n",
+ (unsigned long long)capacity,
+ (unsigned long long)sector, data_size);
+ return NULL;
+ }
+
+ /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD
+ * "criss-cross" setup, that might cause write-out on some other DRBD,
+ * which in turn might block on the other node at this very place. */
+ peer_req = drbd_alloc_peer_req(peer_device, id, sector, data_size, trim == NULL, GFP_NOIO);
+ if (!peer_req)
+ return NULL;
+
+ peer_req->flags |= EE_WRITE;
+ if (trim)
+ return peer_req;
+
+ ds = data_size;
+ page = peer_req->pages;
+ page_chain_for_each(page) {
+ unsigned len = min_t(int, ds, PAGE_SIZE);
+ data = kmap(page);
+ err = drbd_recv_all_warn(peer_device->connection, data, len);
+ if (drbd_insert_fault(device, DRBD_FAULT_RECEIVE)) {
+ drbd_err(device, "Fault injection: Corrupting data on receive\n");
+ data[0] = data[0] ^ (unsigned long)-1;
+ }
+ kunmap(page);
+ if (err) {
+ drbd_free_peer_req(device, peer_req);
+ return NULL;
+ }
+ ds -= len;
+ }
+
+ if (digest_size) {
+ drbd_csum_ee(peer_device->connection->peer_integrity_tfm, peer_req, dig_vv);
+ if (memcmp(dig_in, dig_vv, digest_size)) {
+ drbd_err(device, "Digest integrity check FAILED: %llus +%u\n",
+ (unsigned long long)sector, data_size);
+ drbd_free_peer_req(device, peer_req);
+ return NULL;
+ }
+ }
+ device->recv_cnt += data_size >> 9;
+ return peer_req;
+}
+
+/* drbd_drain_block() just takes a data block
+ * out of the socket input buffer, and discards it.
+ */
+static int drbd_drain_block(struct drbd_peer_device *peer_device, int data_size)
+{
+ struct page *page;
+ int err = 0;
+ void *data;
+
+ if (!data_size)
+ return 0;
+
+ page = drbd_alloc_pages(peer_device, 1, 1);
+
+ data = kmap(page);
+ while (data_size) {
+ unsigned int len = min_t(int, data_size, PAGE_SIZE);
+
+ err = drbd_recv_all_warn(peer_device->connection, data, len);
+ if (err)
+ break;
+ data_size -= len;
+ }
+ kunmap(page);
+ drbd_free_pages(peer_device->device, page, 0);
+ return err;
+}
+
+static int recv_dless_read(struct drbd_peer_device *peer_device, struct drbd_request *req,
+ sector_t sector, int data_size)
+{
+ struct bio_vec bvec;
+ struct bvec_iter iter;
+ struct bio *bio;
+ int digest_size, err, expect;
+ void *dig_in = peer_device->connection->int_dig_in;
+ void *dig_vv = peer_device->connection->int_dig_vv;
+
+ digest_size = 0;
+ if (peer_device->connection->peer_integrity_tfm) {
+ digest_size = crypto_hash_digestsize(peer_device->connection->peer_integrity_tfm);
+ err = drbd_recv_all_warn(peer_device->connection, dig_in, digest_size);
+ if (err)
+ return err;
+ data_size -= digest_size;
+ }
+
+ /* optimistically update recv_cnt. if receiving fails below,
+ * we disconnect anyways, and counters will be reset. */
+ peer_device->device->recv_cnt += data_size>>9;
+
+ bio = req->master_bio;
+ D_ASSERT(peer_device->device, sector == bio->bi_iter.bi_sector);
+
+ bio_for_each_segment(bvec, bio, iter) {
+ void *mapped = kmap(bvec.bv_page) + bvec.bv_offset;
+ expect = min_t(int, data_size, bvec.bv_len);
+ err = drbd_recv_all_warn(peer_device->connection, mapped, expect);
+ kunmap(bvec.bv_page);
+ if (err)
+ return err;
+ data_size -= expect;
+ }
+
+ if (digest_size) {
+ drbd_csum_bio(peer_device->connection->peer_integrity_tfm, bio, dig_vv);
+ if (memcmp(dig_in, dig_vv, digest_size)) {
+ drbd_err(peer_device, "Digest integrity check FAILED. Broken NICs?\n");
+ return -EINVAL;
+ }
+ }
+
+ D_ASSERT(peer_device->device, data_size == 0);
+ return 0;
+}
+
+/*
+ * e_end_resync_block() is called in asender context via
+ * drbd_finish_peer_reqs().
+ */
+static int e_end_resync_block(struct drbd_work *w, int unused)
+{
+ struct drbd_peer_request *peer_req =
+ container_of(w, struct drbd_peer_request, w);
+ struct drbd_peer_device *peer_device = peer_req->peer_device;
+ struct drbd_device *device = peer_device->device;
+ sector_t sector = peer_req->i.sector;
+ int err;
+
+ D_ASSERT(device, drbd_interval_empty(&peer_req->i));
+
+ if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
+ drbd_set_in_sync(device, sector, peer_req->i.size);
+ err = drbd_send_ack(peer_device, P_RS_WRITE_ACK, peer_req);
+ } else {
+ /* Record failure to sync */
+ drbd_rs_failed_io(device, sector, peer_req->i.size);
+
+ err = drbd_send_ack(peer_device, P_NEG_ACK, peer_req);
+ }
+ dec_unacked(device);
+
+ return err;
+}
+
+static int recv_resync_read(struct drbd_peer_device *peer_device, sector_t sector,
+ struct packet_info *pi) __releases(local)
+{
+ struct drbd_device *device = peer_device->device;
+ struct drbd_peer_request *peer_req;
+
+ peer_req = read_in_block(peer_device, ID_SYNCER, sector, pi);
+ if (!peer_req)
+ goto fail;
+
+ dec_rs_pending(device);
+
+ inc_unacked(device);
+ /* corresponding dec_unacked() in e_end_resync_block()
+ * respective _drbd_clear_done_ee */
+
+ peer_req->w.cb = e_end_resync_block;
+ peer_req->submit_jif = jiffies;
+
+ spin_lock_irq(&device->resource->req_lock);
+ list_add_tail(&peer_req->w.list, &device->sync_ee);
+ spin_unlock_irq(&device->resource->req_lock);
+
+ atomic_add(pi->size >> 9, &device->rs_sect_ev);
+ if (drbd_submit_peer_request(device, peer_req, WRITE, DRBD_FAULT_RS_WR) == 0)
+ return 0;
+
+ /* don't care for the reason here */
+ drbd_err(device, "submit failed, triggering re-connect\n");
+ spin_lock_irq(&device->resource->req_lock);
+ list_del(&peer_req->w.list);
+ spin_unlock_irq(&device->resource->req_lock);
+
+ drbd_free_peer_req(device, peer_req);
+fail:
+ put_ldev(device);
+ return -EIO;
+}
+
+static struct drbd_request *
+find_request(struct drbd_device *device, struct rb_root *root, u64 id,
+ sector_t sector, bool missing_ok, const char *func)
+{
+ struct drbd_request *req;
+
+ /* Request object according to our peer */
+ req = (struct drbd_request *)(unsigned long)id;
+ if (drbd_contains_interval(root, sector, &req->i) && req->i.local)
+ return req;
+ if (!missing_ok) {
+ drbd_err(device, "%s: failed to find request 0x%lx, sector %llus\n", func,
+ (unsigned long)id, (unsigned long long)sector);
+ }
+ return NULL;
+}
+
+static int receive_DataReply(struct drbd_connection *connection, struct packet_info *pi)
+{
+ struct drbd_peer_device *peer_device;
+ struct drbd_device *device;
+ struct drbd_request *req;
+ sector_t sector;
+ int err;
+ struct p_data *p = pi->data;
+
+ peer_device = conn_peer_device(connection, pi->vnr);
+ if (!peer_device)
+ return -EIO;
+ device = peer_device->device;
+
+ sector = be64_to_cpu(p->sector);
+
+ spin_lock_irq(&device->resource->req_lock);
+ req = find_request(device, &device->read_requests, p->block_id, sector, false, __func__);
+ spin_unlock_irq(&device->resource->req_lock);
+ if (unlikely(!req))
+ return -EIO;
+
+ /* hlist_del(&req->collision) is done in _req_may_be_done, to avoid
+ * special casing it there for the various failure cases.
+ * still no race with drbd_fail_pending_reads */
+ err = recv_dless_read(peer_device, req, sector, pi->size);
+ if (!err)
+ req_mod(req, DATA_RECEIVED);
+ /* else: nothing. handled from drbd_disconnect...
+ * I don't think we may complete this just yet
+ * in case we are "on-disconnect: freeze" */
+
+ return err;
+}
+
+static int receive_RSDataReply(struct drbd_connection *connection, struct packet_info *pi)
+{
+ struct drbd_peer_device *peer_device;
+ struct drbd_device *device;
+ sector_t sector;
+ int err;
+ struct p_data *p = pi->data;
+
+ peer_device = conn_peer_device(connection, pi->vnr);
+ if (!peer_device)
+ return -EIO;
+ device = peer_device->device;
+
+ sector = be64_to_cpu(p->sector);
+ D_ASSERT(device, p->block_id == ID_SYNCER);
+
+ if (get_ldev(device)) {
+ /* data is submitted to disk within recv_resync_read.
+ * corresponding put_ldev done below on error,
+ * or in drbd_peer_request_endio. */
+ err = recv_resync_read(peer_device, sector, pi);
+ } else {
+ if (__ratelimit(&drbd_ratelimit_state))
+ drbd_err(device, "Can not write resync data to local disk.\n");
+
+ err = drbd_drain_block(peer_device, pi->size);
+
+ drbd_send_ack_dp(peer_device, P_NEG_ACK, p, pi->size);
+ }
+
+ atomic_add(pi->size >> 9, &device->rs_sect_in);
+
+ return err;
+}
+
+static void restart_conflicting_writes(struct drbd_device *device,
+ sector_t sector, int size)
+{
+ struct drbd_interval *i;
+ struct drbd_request *req;
+
+ drbd_for_each_overlap(i, &device->write_requests, sector, size) {
+ if (!i->local)
+ continue;
+ req = container_of(i, struct drbd_request, i);
+ if (req->rq_state & RQ_LOCAL_PENDING ||
+ !(req->rq_state & RQ_POSTPONED))
+ continue;
+ /* as it is RQ_POSTPONED, this will cause it to
+ * be queued on the retry workqueue. */
+ __req_mod(req, CONFLICT_RESOLVED, NULL);
+ }
+}
+
+/*
+ * e_end_block() is called in asender context via drbd_finish_peer_reqs().
+ */
+static int e_end_block(struct drbd_work *w, int cancel)
+{
+ struct drbd_peer_request *peer_req =
+ container_of(w, struct drbd_peer_request, w);
+ struct drbd_peer_device *peer_device = peer_req->peer_device;
+ struct drbd_device *device = peer_device->device;
+ sector_t sector = peer_req->i.sector;
+ int err = 0, pcmd;
+
+ if (peer_req->flags & EE_SEND_WRITE_ACK) {
+ if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
+ pcmd = (device->state.conn >= C_SYNC_SOURCE &&
+ device->state.conn <= C_PAUSED_SYNC_T &&
+ peer_req->flags & EE_MAY_SET_IN_SYNC) ?
+ P_RS_WRITE_ACK : P_WRITE_ACK;
+ err = drbd_send_ack(peer_device, pcmd, peer_req);
+ if (pcmd == P_RS_WRITE_ACK)
+ drbd_set_in_sync(device, sector, peer_req->i.size);
+ } else {
+ err = drbd_send_ack(peer_device, P_NEG_ACK, peer_req);
+ /* we expect it to be marked out of sync anyways...
+ * maybe assert this? */
+ }
+ dec_unacked(device);
+ }
+
+ /* we delete from the conflict detection hash _after_ we sent out the
+ * P_WRITE_ACK / P_NEG_ACK, to get the sequence number right. */
+ if (peer_req->flags & EE_IN_INTERVAL_TREE) {
+ spin_lock_irq(&device->resource->req_lock);
+ D_ASSERT(device, !drbd_interval_empty(&peer_req->i));
+ drbd_remove_epoch_entry_interval(device, peer_req);
+ if (peer_req->flags & EE_RESTART_REQUESTS)
+ restart_conflicting_writes(device, sector, peer_req->i.size);
+ spin_unlock_irq(&device->resource->req_lock);
+ } else
+ D_ASSERT(device, drbd_interval_empty(&peer_req->i));
+
+ drbd_may_finish_epoch(first_peer_device(device)->connection, peer_req->epoch, EV_PUT + (cancel ? EV_CLEANUP : 0));
+
+ return err;
+}
+
+static int e_send_ack(struct drbd_work *w, enum drbd_packet ack)
+{
+ struct drbd_peer_request *peer_req =
+ container_of(w, struct drbd_peer_request, w);
+ struct drbd_peer_device *peer_device = peer_req->peer_device;
+ int err;
+
+ err = drbd_send_ack(peer_device, ack, peer_req);
+ dec_unacked(peer_device->device);
+
+ return err;
+}
+
+static int e_send_superseded(struct drbd_work *w, int unused)
+{
+ return e_send_ack(w, P_SUPERSEDED);
+}
+
+static int e_send_retry_write(struct drbd_work *w, int unused)
+{
+ struct drbd_peer_request *peer_req =
+ container_of(w, struct drbd_peer_request, w);
+ struct drbd_connection *connection = peer_req->peer_device->connection;
+
+ return e_send_ack(w, connection->agreed_pro_version >= 100 ?
+ P_RETRY_WRITE : P_SUPERSEDED);
+}
+
+static bool seq_greater(u32 a, u32 b)
+{
+ /*
+ * We assume 32-bit wrap-around here.
+ * For 24-bit wrap-around, we would have to shift:
+ * a <<= 8; b <<= 8;
+ */
+ return (s32)a - (s32)b > 0;
+}
+
+static u32 seq_max(u32 a, u32 b)
+{
+ return seq_greater(a, b) ? a : b;
+}
+
+static void update_peer_seq(struct drbd_peer_device *peer_device, unsigned int peer_seq)
+{
+ struct drbd_device *device = peer_device->device;
+ unsigned int newest_peer_seq;
+
+ if (test_bit(RESOLVE_CONFLICTS, &peer_device->connection->flags)) {
+ spin_lock(&device->peer_seq_lock);
+ newest_peer_seq = seq_max(device->peer_seq, peer_seq);
+ device->peer_seq = newest_peer_seq;
+ spin_unlock(&device->peer_seq_lock);
+ /* wake up only if we actually changed device->peer_seq */
+ if (peer_seq == newest_peer_seq)
+ wake_up(&device->seq_wait);
+ }
+}
+
+static inline int overlaps(sector_t s1, int l1, sector_t s2, int l2)
+{
+ return !((s1 + (l1>>9) <= s2) || (s1 >= s2 + (l2>>9)));
+}
+
+/* maybe change sync_ee into interval trees as well? */
+static bool overlapping_resync_write(struct drbd_device *device, struct drbd_peer_request *peer_req)
+{
+ struct drbd_peer_request *rs_req;
+ bool rv = 0;
+
+ spin_lock_irq(&device->resource->req_lock);
+ list_for_each_entry(rs_req, &device->sync_ee, w.list) {
+ if (overlaps(peer_req->i.sector, peer_req->i.size,
+ rs_req->i.sector, rs_req->i.size)) {
+ rv = 1;
+ break;
+ }
+ }
+ spin_unlock_irq(&device->resource->req_lock);
+
+ return rv;
+}
+
+/* Called from receive_Data.
+ * Synchronize packets on sock with packets on msock.
+ *
+ * This is here so even when a P_DATA packet traveling via sock overtook an Ack
+ * packet traveling on msock, they are still processed in the order they have
+ * been sent.
+ *
+ * Note: we don't care for Ack packets overtaking P_DATA packets.
+ *
+ * In case packet_seq is larger than device->peer_seq number, there are
+ * outstanding packets on the msock. We wait for them to arrive.
+ * In case we are the logically next packet, we update device->peer_seq
+ * ourselves. Correctly handles 32bit wrap around.
+ *
+ * Assume we have a 10 GBit connection, that is about 1<<30 byte per second,
+ * about 1<<21 sectors per second. So "worst" case, we have 1<<3 == 8 seconds
+ * for the 24bit wrap (historical atomic_t guarantee on some archs), and we have
+ * 1<<9 == 512 seconds aka ages for the 32bit wrap around...
+ *
+ * returns 0 if we may process the packet,
+ * -ERESTARTSYS if we were interrupted (by disconnect signal). */
+static int wait_for_and_update_peer_seq(struct drbd_peer_device *peer_device, const u32 peer_seq)
+{
+ struct drbd_device *device = peer_device->device;
+ DEFINE_WAIT(wait);
+ long timeout;
+ int ret = 0, tp;
+
+ if (!test_bit(RESOLVE_CONFLICTS, &peer_device->connection->flags))
+ return 0;
+
+ spin_lock(&device->peer_seq_lock);
+ for (;;) {
+ if (!seq_greater(peer_seq - 1, device->peer_seq)) {
+ device->peer_seq = seq_max(device->peer_seq, peer_seq);
+ break;
+ }
+
+ if (signal_pending(current)) {
+ ret = -ERESTARTSYS;
+ break;
+ }
+
+ rcu_read_lock();
+ tp = rcu_dereference(first_peer_device(device)->connection->net_conf)->two_primaries;
+ rcu_read_unlock();
+
+ if (!tp)
+ break;
+
+ /* Only need to wait if two_primaries is enabled */
+ prepare_to_wait(&device->seq_wait, &wait, TASK_INTERRUPTIBLE);
+ spin_unlock(&device->peer_seq_lock);
+ rcu_read_lock();
+ timeout = rcu_dereference(peer_device->connection->net_conf)->ping_timeo*HZ/10;
+ rcu_read_unlock();
+ timeout = schedule_timeout(timeout);
+ spin_lock(&device->peer_seq_lock);
+ if (!timeout) {
+ ret = -ETIMEDOUT;
+ drbd_err(device, "Timed out waiting for missing ack packets; disconnecting\n");
+ break;
+ }
+ }
+ spin_unlock(&device->peer_seq_lock);
+ finish_wait(&device->seq_wait, &wait);
+ return ret;
+}
+
+/* see also bio_flags_to_wire()
+ * DRBD_REQ_*, because we need to semantically map the flags to data packet
+ * flags and back. We may replicate to other kernel versions. */
+static unsigned long wire_flags_to_bio(u32 dpf)
+{
+ return (dpf & DP_RW_SYNC ? REQ_SYNC : 0) |
+ (dpf & DP_FUA ? REQ_FUA : 0) |
+ (dpf & DP_FLUSH ? REQ_FLUSH : 0) |
+ (dpf & DP_DISCARD ? REQ_DISCARD : 0);
+}
+
+static void fail_postponed_requests(struct drbd_device *device, sector_t sector,
+ unsigned int size)
+{
+ struct drbd_interval *i;
+
+ repeat:
+ drbd_for_each_overlap(i, &device->write_requests, sector, size) {
+ struct drbd_request *req;
+ struct bio_and_error m;
+
+ if (!i->local)
+ continue;
+ req = container_of(i, struct drbd_request, i);
+ if (!(req->rq_state & RQ_POSTPONED))
+ continue;
+ req->rq_state &= ~RQ_POSTPONED;
+ __req_mod(req, NEG_ACKED, &m);
+ spin_unlock_irq(&device->resource->req_lock);
+ if (m.bio)
+ complete_master_bio(device, &m);
+ spin_lock_irq(&device->resource->req_lock);
+ goto repeat;
+ }
+}
+
+static int handle_write_conflicts(struct drbd_device *device,
+ struct drbd_peer_request *peer_req)
+{
+ struct drbd_connection *connection = peer_req->peer_device->connection;
+ bool resolve_conflicts = test_bit(RESOLVE_CONFLICTS, &connection->flags);
+ sector_t sector = peer_req->i.sector;
+ const unsigned int size = peer_req->i.size;
+ struct drbd_interval *i;
+ bool equal;
+ int err;
+
+ /*
+ * Inserting the peer request into the write_requests tree will prevent
+ * new conflicting local requests from being added.
+ */
+ drbd_insert_interval(&device->write_requests, &peer_req->i);
+
+ repeat:
+ drbd_for_each_overlap(i, &device->write_requests, sector, size) {
+ if (i == &peer_req->i)
+ continue;
+ if (i->completed)
+ continue;
+
+ if (!i->local) {
+ /*
+ * Our peer has sent a conflicting remote request; this
+ * should not happen in a two-node setup. Wait for the
+ * earlier peer request to complete.
+ */
+ err = drbd_wait_misc(device, i);
+ if (err)
+ goto out;
+ goto repeat;
+ }
+
+ equal = i->sector == sector && i->size == size;
+ if (resolve_conflicts) {
+ /*
+ * If the peer request is fully contained within the
+ * overlapping request, it can be considered overwritten
+ * and thus superseded; otherwise, it will be retried
+ * once all overlapping requests have completed.
+ */
+ bool superseded = i->sector <= sector && i->sector +
+ (i->size >> 9) >= sector + (size >> 9);
+
+ if (!equal)
+ drbd_alert(device, "Concurrent writes detected: "
+ "local=%llus +%u, remote=%llus +%u, "
+ "assuming %s came first\n",
+ (unsigned long long)i->sector, i->size,
+ (unsigned long long)sector, size,
+ superseded ? "local" : "remote");
+
+ peer_req->w.cb = superseded ? e_send_superseded :
+ e_send_retry_write;
+ list_add_tail(&peer_req->w.list, &device->done_ee);
+ wake_asender(connection);
+
+ err = -ENOENT;
+ goto out;
+ } else {
+ struct drbd_request *req =
+ container_of(i, struct drbd_request, i);
+
+ if (!equal)
+ drbd_alert(device, "Concurrent writes detected: "
+ "local=%llus +%u, remote=%llus +%u\n",
+ (unsigned long long)i->sector, i->size,
+ (unsigned long long)sector, size);
+
+ if (req->rq_state & RQ_LOCAL_PENDING ||
+ !(req->rq_state & RQ_POSTPONED)) {
+ /*
+ * Wait for the node with the discard flag to
+ * decide if this request has been superseded
+ * or needs to be retried.
+ * Requests that have been superseded will
+ * disappear from the write_requests tree.
+ *
+ * In addition, wait for the conflicting
+ * request to finish locally before submitting
+ * the conflicting peer request.
+ */
+ err = drbd_wait_misc(device, &req->i);
+ if (err) {
+ _conn_request_state(connection, NS(conn, C_TIMEOUT), CS_HARD);
+ fail_postponed_requests(device, sector, size);
+ goto out;
+ }
+ goto repeat;
+ }
+ /*
+ * Remember to restart the conflicting requests after
+ * the new peer request has completed.
+ */
+ peer_req->flags |= EE_RESTART_REQUESTS;
+ }
+ }
+ err = 0;
+
+ out:
+ if (err)
+ drbd_remove_epoch_entry_interval(device, peer_req);
+ return err;
+}
+
+/* mirrored write */
+static int receive_Data(struct drbd_connection *connection, struct packet_info *pi)
+{
+ struct drbd_peer_device *peer_device;
+ struct drbd_device *device;
+ struct net_conf *nc;
+ sector_t sector;
+ struct drbd_peer_request *peer_req;
+ struct p_data *p = pi->data;
+ u32 peer_seq = be32_to_cpu(p->seq_num);
+ int rw = WRITE;
+ u32 dp_flags;
+ int err, tp;
+
+ peer_device = conn_peer_device(connection, pi->vnr);
+ if (!peer_device)
+ return -EIO;
+ device = peer_device->device;
+
+ if (!get_ldev(device)) {
+ int err2;
+
+ err = wait_for_and_update_peer_seq(peer_device, peer_seq);
+ drbd_send_ack_dp(peer_device, P_NEG_ACK, p, pi->size);
+ atomic_inc(&connection->current_epoch->epoch_size);
+ err2 = drbd_drain_block(peer_device, pi->size);
+ if (!err)
+ err = err2;
+ return err;
+ }
+
+ /*
+ * Corresponding put_ldev done either below (on various errors), or in
+ * drbd_peer_request_endio, if we successfully submit the data at the
+ * end of this function.
+ */
+
+ sector = be64_to_cpu(p->sector);
+ peer_req = read_in_block(peer_device, p->block_id, sector, pi);
+ if (!peer_req) {
+ put_ldev(device);
+ return -EIO;
+ }
+
+ peer_req->w.cb = e_end_block;
+ peer_req->submit_jif = jiffies;
+ peer_req->flags |= EE_APPLICATION;
+
+ dp_flags = be32_to_cpu(p->dp_flags);
+ rw |= wire_flags_to_bio(dp_flags);
+ if (pi->cmd == P_TRIM) {
+ struct request_queue *q = bdev_get_queue(device->ldev->backing_bdev);
+ peer_req->flags |= EE_IS_TRIM;
+ if (!blk_queue_discard(q))
+ peer_req->flags |= EE_IS_TRIM_USE_ZEROOUT;
+ D_ASSERT(peer_device, peer_req->i.size > 0);
+ D_ASSERT(peer_device, rw & REQ_DISCARD);
+ D_ASSERT(peer_device, peer_req->pages == NULL);
+ } else if (peer_req->pages == NULL) {
+ D_ASSERT(device, peer_req->i.size == 0);
+ D_ASSERT(device, dp_flags & DP_FLUSH);
+ }
+
+ if (dp_flags & DP_MAY_SET_IN_SYNC)
+ peer_req->flags |= EE_MAY_SET_IN_SYNC;
+
+ spin_lock(&connection->epoch_lock);
+ peer_req->epoch = connection->current_epoch;
+ atomic_inc(&peer_req->epoch->epoch_size);
+ atomic_inc(&peer_req->epoch->active);
+ spin_unlock(&connection->epoch_lock);
+
+ rcu_read_lock();
+ nc = rcu_dereference(peer_device->connection->net_conf);
+ tp = nc->two_primaries;
+ if (peer_device->connection->agreed_pro_version < 100) {
+ switch (nc->wire_protocol) {
+ case DRBD_PROT_C:
+ dp_flags |= DP_SEND_WRITE_ACK;
+ break;
+ case DRBD_PROT_B:
+ dp_flags |= DP_SEND_RECEIVE_ACK;
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ if (dp_flags & DP_SEND_WRITE_ACK) {
+ peer_req->flags |= EE_SEND_WRITE_ACK;
+ inc_unacked(device);
+ /* corresponding dec_unacked() in e_end_block()
+ * respective _drbd_clear_done_ee */
+ }
+
+ if (dp_flags & DP_SEND_RECEIVE_ACK) {
+ /* I really don't like it that the receiver thread
+ * sends on the msock, but anyways */
+ drbd_send_ack(first_peer_device(device), P_RECV_ACK, peer_req);
+ }
+
+ if (tp) {
+ /* two primaries implies protocol C */
+ D_ASSERT(device, dp_flags & DP_SEND_WRITE_ACK);
+ peer_req->flags |= EE_IN_INTERVAL_TREE;
+ err = wait_for_and_update_peer_seq(peer_device, peer_seq);
+ if (err)
+ goto out_interrupted;
+ spin_lock_irq(&device->resource->req_lock);
+ err = handle_write_conflicts(device, peer_req);
+ if (err) {
+ spin_unlock_irq(&device->resource->req_lock);
+ if (err == -ENOENT) {
+ put_ldev(device);
+ return 0;
+ }
+ goto out_interrupted;
+ }
+ } else {
+ update_peer_seq(peer_device, peer_seq);
+ spin_lock_irq(&device->resource->req_lock);
+ }
+ /* if we use the zeroout fallback code, we process synchronously
+ * and we wait for all pending requests, respectively wait for
+ * active_ee to become empty in drbd_submit_peer_request();
+ * better not add ourselves here. */
+ if ((peer_req->flags & EE_IS_TRIM_USE_ZEROOUT) == 0)
+ list_add_tail(&peer_req->w.list, &device->active_ee);
+ spin_unlock_irq(&device->resource->req_lock);
+
+ if (device->state.conn == C_SYNC_TARGET)
+ wait_event(device->ee_wait, !overlapping_resync_write(device, peer_req));
+
+ if (device->state.pdsk < D_INCONSISTENT) {
+ /* In case we have the only disk of the cluster, */
+ drbd_set_out_of_sync(device, peer_req->i.sector, peer_req->i.size);
+ peer_req->flags &= ~EE_MAY_SET_IN_SYNC;
+ drbd_al_begin_io(device, &peer_req->i);
+ peer_req->flags |= EE_CALL_AL_COMPLETE_IO;
+ }
+
+ err = drbd_submit_peer_request(device, peer_req, rw, DRBD_FAULT_DT_WR);
+ if (!err)
+ return 0;
+
+ /* don't care for the reason here */
+ drbd_err(device, "submit failed, triggering re-connect\n");
+ spin_lock_irq(&device->resource->req_lock);
+ list_del(&peer_req->w.list);
+ drbd_remove_epoch_entry_interval(device, peer_req);
+ spin_unlock_irq(&device->resource->req_lock);
+ if (peer_req->flags & EE_CALL_AL_COMPLETE_IO) {
+ peer_req->flags &= ~EE_CALL_AL_COMPLETE_IO;
+ drbd_al_complete_io(device, &peer_req->i);
+ }
+
+out_interrupted:
+ drbd_may_finish_epoch(connection, peer_req->epoch, EV_PUT + EV_CLEANUP);
+ put_ldev(device);
+ drbd_free_peer_req(device, peer_req);
+ return err;
+}
+
+/* We may throttle resync, if the lower device seems to be busy,
+ * and current sync rate is above c_min_rate.
+ *
+ * To decide whether or not the lower device is busy, we use a scheme similar
+ * to MD RAID is_mddev_idle(): if the partition stats reveal "significant"
+ * (more than 64 sectors) of activity we cannot account for with our own resync
+ * activity, it obviously is "busy".
+ *
+ * The current sync rate used here uses only the most recent two step marks,
+ * to have a short time average so we can react faster.
+ */
+bool drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector,
+ bool throttle_if_app_is_waiting)
+{
+ struct lc_element *tmp;
+ bool throttle = drbd_rs_c_min_rate_throttle(device);
+
+ if (!throttle || throttle_if_app_is_waiting)
+ return throttle;
+
+ spin_lock_irq(&device->al_lock);
+ tmp = lc_find(device->resync, BM_SECT_TO_EXT(sector));
+ if (tmp) {
+ struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce);
+ if (test_bit(BME_PRIORITY, &bm_ext->flags))
+ throttle = false;
+ /* Do not slow down if app IO is already waiting for this extent,
+ * and our progress is necessary for application IO to complete. */
+ }
+ spin_unlock_irq(&device->al_lock);
+
+ return throttle;
+}
+
+bool drbd_rs_c_min_rate_throttle(struct drbd_device *device)
+{
+ struct gendisk *disk = device->ldev->backing_bdev->bd_contains->bd_disk;
+ unsigned long db, dt, dbdt;
+ unsigned int c_min_rate;
+ int curr_events;
+
+ rcu_read_lock();
+ c_min_rate = rcu_dereference(device->ldev->disk_conf)->c_min_rate;
+ rcu_read_unlock();
+
+ /* feature disabled? */
+ if (c_min_rate == 0)
+ return false;
+
+ curr_events = (int)part_stat_read(&disk->part0, sectors[0]) +
+ (int)part_stat_read(&disk->part0, sectors[1]) -
+ atomic_read(&device->rs_sect_ev);
+
+ if (atomic_read(&device->ap_actlog_cnt)
+ || curr_events - device->rs_last_events > 64) {
+ unsigned long rs_left;
+ int i;
+
+ device->rs_last_events = curr_events;
+
+ /* sync speed average over the last 2*DRBD_SYNC_MARK_STEP,
+ * approx. */
+ i = (device->rs_last_mark + DRBD_SYNC_MARKS-1) % DRBD_SYNC_MARKS;
+
+ if (device->state.conn == C_VERIFY_S || device->state.conn == C_VERIFY_T)
+ rs_left = device->ov_left;
+ else
+ rs_left = drbd_bm_total_weight(device) - device->rs_failed;
+
+ dt = ((long)jiffies - (long)device->rs_mark_time[i]) / HZ;
+ if (!dt)
+ dt++;
+ db = device->rs_mark_left[i] - rs_left;
+ dbdt = Bit2KB(db/dt);
+
+ if (dbdt > c_min_rate)
+ return true;
+ }
+ return false;
+}
+
+static int receive_DataRequest(struct drbd_connection *connection, struct packet_info *pi)
+{
+ struct drbd_peer_device *peer_device;
+ struct drbd_device *device;
+ sector_t sector;
+ sector_t capacity;
+ struct drbd_peer_request *peer_req;
+ struct digest_info *di = NULL;
+ int size, verb;
+ unsigned int fault_type;
+ struct p_block_req *p = pi->data;
+
+ peer_device = conn_peer_device(connection, pi->vnr);
+ if (!peer_device)
+ return -EIO;
+ device = peer_device->device;
+ capacity = drbd_get_capacity(device->this_bdev);
+
+ sector = be64_to_cpu(p->sector);
+ size = be32_to_cpu(p->blksize);
+
+ if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_BIO_SIZE) {
+ drbd_err(device, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__,
+ (unsigned long long)sector, size);
+ return -EINVAL;
+ }
+ if (sector + (size>>9) > capacity) {
+ drbd_err(device, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__,
+ (unsigned long long)sector, size);
+ return -EINVAL;
+ }
+
+ if (!get_ldev_if_state(device, D_UP_TO_DATE)) {
+ verb = 1;
+ switch (pi->cmd) {
+ case P_DATA_REQUEST:
+ drbd_send_ack_rp(peer_device, P_NEG_DREPLY, p);
+ break;
+ case P_RS_DATA_REQUEST:
+ case P_CSUM_RS_REQUEST:
+ case P_OV_REQUEST:
+ drbd_send_ack_rp(peer_device, P_NEG_RS_DREPLY , p);
+ break;
+ case P_OV_REPLY:
+ verb = 0;
+ dec_rs_pending(device);
+ drbd_send_ack_ex(peer_device, P_OV_RESULT, sector, size, ID_IN_SYNC);
+ break;
+ default:
+ BUG();
+ }
+ if (verb && __ratelimit(&drbd_ratelimit_state))
+ drbd_err(device, "Can not satisfy peer's read request, "
+ "no local data.\n");
+
+ /* drain possibly payload */
+ return drbd_drain_block(peer_device, pi->size);
+ }
+
+ /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD
+ * "criss-cross" setup, that might cause write-out on some other DRBD,
+ * which in turn might block on the other node at this very place. */
+ peer_req = drbd_alloc_peer_req(peer_device, p->block_id, sector, size,
+ true /* has real payload */, GFP_NOIO);
+ if (!peer_req) {
+ put_ldev(device);
+ return -ENOMEM;
+ }
+
+ switch (pi->cmd) {
+ case P_DATA_REQUEST:
+ peer_req->w.cb = w_e_end_data_req;
+ fault_type = DRBD_FAULT_DT_RD;
+ /* application IO, don't drbd_rs_begin_io */
+ peer_req->flags |= EE_APPLICATION;
+ goto submit;
+
+ case P_RS_DATA_REQUEST:
+ peer_req->w.cb = w_e_end_rsdata_req;
+ fault_type = DRBD_FAULT_RS_RD;
+ /* used in the sector offset progress display */
+ device->bm_resync_fo = BM_SECT_TO_BIT(sector);
+ break;
+
+ case P_OV_REPLY:
+ case P_CSUM_RS_REQUEST:
+ fault_type = DRBD_FAULT_RS_RD;
+ di = kmalloc(sizeof(*di) + pi->size, GFP_NOIO);
+ if (!di)
+ goto out_free_e;
+
+ di->digest_size = pi->size;
+ di->digest = (((char *)di)+sizeof(struct digest_info));
+
+ peer_req->digest = di;
+ peer_req->flags |= EE_HAS_DIGEST;
+
+ if (drbd_recv_all(peer_device->connection, di->digest, pi->size))
+ goto out_free_e;
+
+ if (pi->cmd == P_CSUM_RS_REQUEST) {
+ D_ASSERT(device, peer_device->connection->agreed_pro_version >= 89);
+ peer_req->w.cb = w_e_end_csum_rs_req;
+ /* used in the sector offset progress display */
+ device->bm_resync_fo = BM_SECT_TO_BIT(sector);
+ /* remember to report stats in drbd_resync_finished */
+ device->use_csums = true;
+ } else if (pi->cmd == P_OV_REPLY) {
+ /* track progress, we may need to throttle */
+ atomic_add(size >> 9, &device->rs_sect_in);
+ peer_req->w.cb = w_e_end_ov_reply;
+ dec_rs_pending(device);
+ /* drbd_rs_begin_io done when we sent this request,
+ * but accounting still needs to be done. */
+ goto submit_for_resync;
+ }
+ break;
+
+ case P_OV_REQUEST:
+ if (device->ov_start_sector == ~(sector_t)0 &&
+ peer_device->connection->agreed_pro_version >= 90) {
+ unsigned long now = jiffies;
+ int i;
+ device->ov_start_sector = sector;
+ device->ov_position = sector;
+ device->ov_left = drbd_bm_bits(device) - BM_SECT_TO_BIT(sector);
+ device->rs_total = device->ov_left;
+ for (i = 0; i < DRBD_SYNC_MARKS; i++) {
+ device->rs_mark_left[i] = device->ov_left;
+ device->rs_mark_time[i] = now;
+ }
+ drbd_info(device, "Online Verify start sector: %llu\n",
+ (unsigned long long)sector);
+ }
+ peer_req->w.cb = w_e_end_ov_req;
+ fault_type = DRBD_FAULT_RS_RD;
+ break;
+
+ default:
+ BUG();
+ }
+
+ /* Throttle, drbd_rs_begin_io and submit should become asynchronous
+ * wrt the receiver, but it is not as straightforward as it may seem.
+ * Various places in the resync start and stop logic assume resync
+ * requests are processed in order, requeuing this on the worker thread
+ * introduces a bunch of new code for synchronization between threads.
+ *
+ * Unlimited throttling before drbd_rs_begin_io may stall the resync
+ * "forever", throttling after drbd_rs_begin_io will lock that extent
+ * for application writes for the same time. For now, just throttle
+ * here, where the rest of the code expects the receiver to sleep for
+ * a while, anyways.
+ */
+
+ /* Throttle before drbd_rs_begin_io, as that locks out application IO;
+ * this defers syncer requests for some time, before letting at least
+ * on request through. The resync controller on the receiving side
+ * will adapt to the incoming rate accordingly.
+ *
+ * We cannot throttle here if remote is Primary/SyncTarget:
+ * we would also throttle its application reads.
+ * In that case, throttling is done on the SyncTarget only.
+ */
+
+ /* Even though this may be a resync request, we do add to "read_ee";
+ * "sync_ee" is only used for resync WRITEs.
+ * Add to list early, so debugfs can find this request
+ * even if we have to sleep below. */
+ spin_lock_irq(&device->resource->req_lock);
+ list_add_tail(&peer_req->w.list, &device->read_ee);
+ spin_unlock_irq(&device->resource->req_lock);
+
+ update_receiver_timing_details(connection, drbd_rs_should_slow_down);
+ if (device->state.peer != R_PRIMARY
+ && drbd_rs_should_slow_down(device, sector, false))
+ schedule_timeout_uninterruptible(HZ/10);
+ update_receiver_timing_details(connection, drbd_rs_begin_io);
+ if (drbd_rs_begin_io(device, sector))
+ goto out_free_e;
+
+submit_for_resync:
+ atomic_add(size >> 9, &device->rs_sect_ev);
+
+submit:
+ update_receiver_timing_details(connection, drbd_submit_peer_request);
+ inc_unacked(device);
+ if (drbd_submit_peer_request(device, peer_req, READ, fault_type) == 0)
+ return 0;
+
+ /* don't care for the reason here */
+ drbd_err(device, "submit failed, triggering re-connect\n");
+
+out_free_e:
+ spin_lock_irq(&device->resource->req_lock);
+ list_del(&peer_req->w.list);
+ spin_unlock_irq(&device->resource->req_lock);
+ /* no drbd_rs_complete_io(), we are dropping the connection anyways */
+
+ put_ldev(device);
+ drbd_free_peer_req(device, peer_req);
+ return -EIO;
+}
+
+/**
+ * drbd_asb_recover_0p - Recover after split-brain with no remaining primaries
+ */
+static int drbd_asb_recover_0p(struct drbd_peer_device *peer_device) __must_hold(local)
+{
+ struct drbd_device *device = peer_device->device;
+ int self, peer, rv = -100;
+ unsigned long ch_self, ch_peer;
+ enum drbd_after_sb_p after_sb_0p;
+
+ self = device->ldev->md.uuid[UI_BITMAP] & 1;
+ peer = device->p_uuid[UI_BITMAP] & 1;
+
+ ch_peer = device->p_uuid[UI_SIZE];
+ ch_self = device->comm_bm_set;
+
+ rcu_read_lock();
+ after_sb_0p = rcu_dereference(peer_device->connection->net_conf)->after_sb_0p;
+ rcu_read_unlock();
+ switch (after_sb_0p) {
+ case ASB_CONSENSUS:
+ case ASB_DISCARD_SECONDARY:
+ case ASB_CALL_HELPER:
+ case ASB_VIOLENTLY:
+ drbd_err(device, "Configuration error.\n");
+ break;
+ case ASB_DISCONNECT:
+ break;
+ case ASB_DISCARD_YOUNGER_PRI:
+ if (self == 0 && peer == 1) {
+ rv = -1;
+ break;
+ }
+ if (self == 1 && peer == 0) {
+ rv = 1;
+ break;
+ }
+ /* Else fall through to one of the other strategies... */
+ case ASB_DISCARD_OLDER_PRI:
+ if (self == 0 && peer == 1) {
+ rv = 1;
+ break;
+ }
+ if (self == 1 && peer == 0) {
+ rv = -1;
+ break;
+ }
+ /* Else fall through to one of the other strategies... */
+ drbd_warn(device, "Discard younger/older primary did not find a decision\n"
+ "Using discard-least-changes instead\n");
+ case ASB_DISCARD_ZERO_CHG:
+ if (ch_peer == 0 && ch_self == 0) {
+ rv = test_bit(RESOLVE_CONFLICTS, &peer_device->connection->flags)
+ ? -1 : 1;
+ break;
+ } else {
+ if (ch_peer == 0) { rv = 1; break; }
+ if (ch_self == 0) { rv = -1; break; }
+ }
+ if (after_sb_0p == ASB_DISCARD_ZERO_CHG)
+ break;
+ case ASB_DISCARD_LEAST_CHG:
+ if (ch_self < ch_peer)
+ rv = -1;
+ else if (ch_self > ch_peer)
+ rv = 1;
+ else /* ( ch_self == ch_peer ) */
+ /* Well, then use something else. */
+ rv = test_bit(RESOLVE_CONFLICTS, &peer_device->connection->flags)
+ ? -1 : 1;
+ break;
+ case ASB_DISCARD_LOCAL:
+ rv = -1;
+ break;
+ case ASB_DISCARD_REMOTE:
+ rv = 1;
+ }
+
+ return rv;
+}
+
+/**
+ * drbd_asb_recover_1p - Recover after split-brain with one remaining primary
+ */
+static int drbd_asb_recover_1p(struct drbd_peer_device *peer_device) __must_hold(local)
+{
+ struct drbd_device *device = peer_device->device;
+ int hg, rv = -100;
+ enum drbd_after_sb_p after_sb_1p;
+
+ rcu_read_lock();
+ after_sb_1p = rcu_dereference(peer_device->connection->net_conf)->after_sb_1p;
+ rcu_read_unlock();
+ switch (after_sb_1p) {
+ case ASB_DISCARD_YOUNGER_PRI:
+ case ASB_DISCARD_OLDER_PRI:
+ case ASB_DISCARD_LEAST_CHG:
+ case ASB_DISCARD_LOCAL:
+ case ASB_DISCARD_REMOTE:
+ case ASB_DISCARD_ZERO_CHG:
+ drbd_err(device, "Configuration error.\n");
+ break;
+ case ASB_DISCONNECT:
+ break;
+ case ASB_CONSENSUS:
+ hg = drbd_asb_recover_0p(peer_device);
+ if (hg == -1 && device->state.role == R_SECONDARY)
+ rv = hg;
+ if (hg == 1 && device->state.role == R_PRIMARY)
+ rv = hg;
+ break;
+ case ASB_VIOLENTLY:
+ rv = drbd_asb_recover_0p(peer_device);
+ break;
+ case ASB_DISCARD_SECONDARY:
+ return device->state.role == R_PRIMARY ? 1 : -1;
+ case ASB_CALL_HELPER:
+ hg = drbd_asb_recover_0p(peer_device);
+ if (hg == -1 && device->state.role == R_PRIMARY) {
+ enum drbd_state_rv rv2;
+
+ /* drbd_change_state() does not sleep while in SS_IN_TRANSIENT_STATE,
+ * we might be here in C_WF_REPORT_PARAMS which is transient.
+ * we do not need to wait for the after state change work either. */
+ rv2 = drbd_change_state(device, CS_VERBOSE, NS(role, R_SECONDARY));
+ if (rv2 != SS_SUCCESS) {
+ drbd_khelper(device, "pri-lost-after-sb");
+ } else {
+ drbd_warn(device, "Successfully gave up primary role.\n");
+ rv = hg;
+ }
+ } else
+ rv = hg;
+ }
+
+ return rv;
+}
+
+/**
+ * drbd_asb_recover_2p - Recover after split-brain with two remaining primaries
+ */
+static int drbd_asb_recover_2p(struct drbd_peer_device *peer_device) __must_hold(local)
+{
+ struct drbd_device *device = peer_device->device;
+ int hg, rv = -100;
+ enum drbd_after_sb_p after_sb_2p;
+
+ rcu_read_lock();
+ after_sb_2p = rcu_dereference(peer_device->connection->net_conf)->after_sb_2p;
+ rcu_read_unlock();
+ switch (after_sb_2p) {
+ case ASB_DISCARD_YOUNGER_PRI:
+ case ASB_DISCARD_OLDER_PRI:
+ case ASB_DISCARD_LEAST_CHG:
+ case ASB_DISCARD_LOCAL:
+ case ASB_DISCARD_REMOTE:
+ case ASB_CONSENSUS:
+ case ASB_DISCARD_SECONDARY:
+ case ASB_DISCARD_ZERO_CHG:
+ drbd_err(device, "Configuration error.\n");
+ break;
+ case ASB_VIOLENTLY:
+ rv = drbd_asb_recover_0p(peer_device);
+ break;
+ case ASB_DISCONNECT:
+ break;
+ case ASB_CALL_HELPER:
+ hg = drbd_asb_recover_0p(peer_device);
+ if (hg == -1) {
+ enum drbd_state_rv rv2;
+
+ /* drbd_change_state() does not sleep while in SS_IN_TRANSIENT_STATE,
+ * we might be here in C_WF_REPORT_PARAMS which is transient.
+ * we do not need to wait for the after state change work either. */
+ rv2 = drbd_change_state(device, CS_VERBOSE, NS(role, R_SECONDARY));
+ if (rv2 != SS_SUCCESS) {
+ drbd_khelper(device, "pri-lost-after-sb");
+ } else {
+ drbd_warn(device, "Successfully gave up primary role.\n");
+ rv = hg;
+ }
+ } else
+ rv = hg;
+ }
+
+ return rv;
+}
+
+static void drbd_uuid_dump(struct drbd_device *device, char *text, u64 *uuid,
+ u64 bits, u64 flags)
+{
+ if (!uuid) {
+ drbd_info(device, "%s uuid info vanished while I was looking!\n", text);
+ return;
+ }
+ drbd_info(device, "%s %016llX:%016llX:%016llX:%016llX bits:%llu flags:%llX\n",
+ text,
+ (unsigned long long)uuid[UI_CURRENT],
+ (unsigned long long)uuid[UI_BITMAP],
+ (unsigned long long)uuid[UI_HISTORY_START],
+ (unsigned long long)uuid[UI_HISTORY_END],
+ (unsigned long long)bits,
+ (unsigned long long)flags);
+}
+
+/*
+ 100 after split brain try auto recover
+ 2 C_SYNC_SOURCE set BitMap
+ 1 C_SYNC_SOURCE use BitMap
+ 0 no Sync
+ -1 C_SYNC_TARGET use BitMap
+ -2 C_SYNC_TARGET set BitMap
+ -100 after split brain, disconnect
+-1000 unrelated data
+-1091 requires proto 91
+-1096 requires proto 96
+ */
+static int drbd_uuid_compare(struct drbd_device *const device, int *rule_nr) __must_hold(local)
+{
+ struct drbd_peer_device *const peer_device = first_peer_device(device);
+ struct drbd_connection *const connection = peer_device ? peer_device->connection : NULL;
+ u64 self, peer;
+ int i, j;
+
+ self = device->ldev->md.uuid[UI_CURRENT] & ~((u64)1);
+ peer = device->p_uuid[UI_CURRENT] & ~((u64)1);
+
+ *rule_nr = 10;
+ if (self == UUID_JUST_CREATED && peer == UUID_JUST_CREATED)
+ return 0;
+
+ *rule_nr = 20;
+ if ((self == UUID_JUST_CREATED || self == (u64)0) &&
+ peer != UUID_JUST_CREATED)
+ return -2;
+
+ *rule_nr = 30;
+ if (self != UUID_JUST_CREATED &&
+ (peer == UUID_JUST_CREATED || peer == (u64)0))
+ return 2;
+
+ if (self == peer) {
+ int rct, dc; /* roles at crash time */
+
+ if (device->p_uuid[UI_BITMAP] == (u64)0 && device->ldev->md.uuid[UI_BITMAP] != (u64)0) {
+
+ if (connection->agreed_pro_version < 91)
+ return -1091;
+
+ if ((device->ldev->md.uuid[UI_BITMAP] & ~((u64)1)) == (device->p_uuid[UI_HISTORY_START] & ~((u64)1)) &&
+ (device->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (device->p_uuid[UI_HISTORY_START + 1] & ~((u64)1))) {
+ drbd_info(device, "was SyncSource, missed the resync finished event, corrected myself:\n");
+ drbd_uuid_move_history(device);
+ device->ldev->md.uuid[UI_HISTORY_START] = device->ldev->md.uuid[UI_BITMAP];
+ device->ldev->md.uuid[UI_BITMAP] = 0;
+
+ drbd_uuid_dump(device, "self", device->ldev->md.uuid,
+ device->state.disk >= D_NEGOTIATING ? drbd_bm_total_weight(device) : 0, 0);
+ *rule_nr = 34;
+ } else {
+ drbd_info(device, "was SyncSource (peer failed to write sync_uuid)\n");
+ *rule_nr = 36;
+ }
+
+ return 1;
+ }
+
+ if (device->ldev->md.uuid[UI_BITMAP] == (u64)0 && device->p_uuid[UI_BITMAP] != (u64)0) {
+
+ if (connection->agreed_pro_version < 91)
+ return -1091;
+
+ if ((device->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (device->p_uuid[UI_BITMAP] & ~((u64)1)) &&
+ (device->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1)) == (device->p_uuid[UI_HISTORY_START] & ~((u64)1))) {
+ drbd_info(device, "was SyncTarget, peer missed the resync finished event, corrected peer:\n");
+
+ device->p_uuid[UI_HISTORY_START + 1] = device->p_uuid[UI_HISTORY_START];
+ device->p_uuid[UI_HISTORY_START] = device->p_uuid[UI_BITMAP];
+ device->p_uuid[UI_BITMAP] = 0UL;
+
+ drbd_uuid_dump(device, "peer", device->p_uuid, device->p_uuid[UI_SIZE], device->p_uuid[UI_FLAGS]);
+ *rule_nr = 35;
+ } else {
+ drbd_info(device, "was SyncTarget (failed to write sync_uuid)\n");
+ *rule_nr = 37;
+ }
+
+ return -1;
+ }
+
+ /* Common power [off|failure] */
+ rct = (test_bit(CRASHED_PRIMARY, &device->flags) ? 1 : 0) +
+ (device->p_uuid[UI_FLAGS] & 2);
+ /* lowest bit is set when we were primary,
+ * next bit (weight 2) is set when peer was primary */
+ *rule_nr = 40;
+
+ switch (rct) {
+ case 0: /* !self_pri && !peer_pri */ return 0;
+ case 1: /* self_pri && !peer_pri */ return 1;
+ case 2: /* !self_pri && peer_pri */ return -1;
+ case 3: /* self_pri && peer_pri */
+ dc = test_bit(RESOLVE_CONFLICTS, &connection->flags);
+ return dc ? -1 : 1;
+ }
+ }
+
+ *rule_nr = 50;
+ peer = device->p_uuid[UI_BITMAP] & ~((u64)1);
+ if (self == peer)
+ return -1;
+
+ *rule_nr = 51;
+ peer = device->p_uuid[UI_HISTORY_START] & ~((u64)1);
+ if (self == peer) {
+ if (connection->agreed_pro_version < 96 ?
+ (device->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) ==
+ (device->p_uuid[UI_HISTORY_START + 1] & ~((u64)1)) :
+ peer + UUID_NEW_BM_OFFSET == (device->p_uuid[UI_BITMAP] & ~((u64)1))) {
+ /* The last P_SYNC_UUID did not get though. Undo the last start of
+ resync as sync source modifications of the peer's UUIDs. */
+
+ if (connection->agreed_pro_version < 91)
+ return -1091;
+
+ device->p_uuid[UI_BITMAP] = device->p_uuid[UI_HISTORY_START];
+ device->p_uuid[UI_HISTORY_START] = device->p_uuid[UI_HISTORY_START + 1];
+
+ drbd_info(device, "Lost last syncUUID packet, corrected:\n");
+ drbd_uuid_dump(device, "peer", device->p_uuid, device->p_uuid[UI_SIZE], device->p_uuid[UI_FLAGS]);
+
+ return -1;
+ }
+ }
+
+ *rule_nr = 60;
+ self = device->ldev->md.uuid[UI_CURRENT] & ~((u64)1);
+ for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) {
+ peer = device->p_uuid[i] & ~((u64)1);
+ if (self == peer)
+ return -2;
+ }
+
+ *rule_nr = 70;
+ self = device->ldev->md.uuid[UI_BITMAP] & ~((u64)1);
+ peer = device->p_uuid[UI_CURRENT] & ~((u64)1);
+ if (self == peer)
+ return 1;
+
+ *rule_nr = 71;
+ self = device->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1);
+ if (self == peer) {
+ if (connection->agreed_pro_version < 96 ?
+ (device->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1)) ==
+ (device->p_uuid[UI_HISTORY_START] & ~((u64)1)) :
+ self + UUID_NEW_BM_OFFSET == (device->ldev->md.uuid[UI_BITMAP] & ~((u64)1))) {
+ /* The last P_SYNC_UUID did not get though. Undo the last start of
+ resync as sync source modifications of our UUIDs. */
+
+ if (connection->agreed_pro_version < 91)
+ return -1091;
+
+ __drbd_uuid_set(device, UI_BITMAP, device->ldev->md.uuid[UI_HISTORY_START]);
+ __drbd_uuid_set(device, UI_HISTORY_START, device->ldev->md.uuid[UI_HISTORY_START + 1]);
+
+ drbd_info(device, "Last syncUUID did not get through, corrected:\n");
+ drbd_uuid_dump(device, "self", device->ldev->md.uuid,
+ device->state.disk >= D_NEGOTIATING ? drbd_bm_total_weight(device) : 0, 0);
+
+ return 1;
+ }
+ }
+
+
+ *rule_nr = 80;
+ peer = device->p_uuid[UI_CURRENT] & ~((u64)1);
+ for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) {
+ self = device->ldev->md.uuid[i] & ~((u64)1);
+ if (self == peer)
+ return 2;
+ }
+
+ *rule_nr = 90;
+ self = device->ldev->md.uuid[UI_BITMAP] & ~((u64)1);
+ peer = device->p_uuid[UI_BITMAP] & ~((u64)1);
+ if (self == peer && self != ((u64)0))
+ return 100;
+
+ *rule_nr = 100;
+ for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) {
+ self = device->ldev->md.uuid[i] & ~((u64)1);
+ for (j = UI_HISTORY_START; j <= UI_HISTORY_END; j++) {
+ peer = device->p_uuid[j] & ~((u64)1);
+ if (self == peer)
+ return -100;
+ }
+ }
+
+ return -1000;
+}
+
+/* drbd_sync_handshake() returns the new conn state on success, or
+ CONN_MASK (-1) on failure.
+ */
+static enum drbd_conns drbd_sync_handshake(struct drbd_peer_device *peer_device,
+ enum drbd_role peer_role,
+ enum drbd_disk_state peer_disk) __must_hold(local)
+{
+ struct drbd_device *device = peer_device->device;
+ enum drbd_conns rv = C_MASK;
+ enum drbd_disk_state mydisk;
+ struct net_conf *nc;
+ int hg, rule_nr, rr_conflict, tentative;
+
+ mydisk = device->state.disk;
+ if (mydisk == D_NEGOTIATING)
+ mydisk = device->new_state_tmp.disk;
+
+ drbd_info(device, "drbd_sync_handshake:\n");
+
+ spin_lock_irq(&device->ldev->md.uuid_lock);
+ drbd_uuid_dump(device, "self", device->ldev->md.uuid, device->comm_bm_set, 0);
+ drbd_uuid_dump(device, "peer", device->p_uuid,
+ device->p_uuid[UI_SIZE], device->p_uuid[UI_FLAGS]);
+
+ hg = drbd_uuid_compare(device, &rule_nr);
+ spin_unlock_irq(&device->ldev->md.uuid_lock);
+
+ drbd_info(device, "uuid_compare()=%d by rule %d\n", hg, rule_nr);
+
+ if (hg == -1000) {
+ drbd_alert(device, "Unrelated data, aborting!\n");
+ return C_MASK;
+ }
+ if (hg < -1000) {
+ drbd_alert(device, "To resolve this both sides have to support at least protocol %d\n", -hg - 1000);
+ return C_MASK;
+ }
+
+ if ((mydisk == D_INCONSISTENT && peer_disk > D_INCONSISTENT) ||
+ (peer_disk == D_INCONSISTENT && mydisk > D_INCONSISTENT)) {
+ int f = (hg == -100) || abs(hg) == 2;
+ hg = mydisk > D_INCONSISTENT ? 1 : -1;
+ if (f)
+ hg = hg*2;
+ drbd_info(device, "Becoming sync %s due to disk states.\n",
+ hg > 0 ? "source" : "target");
+ }
+
+ if (abs(hg) == 100)
+ drbd_khelper(device, "initial-split-brain");
+
+ rcu_read_lock();
+ nc = rcu_dereference(peer_device->connection->net_conf);
+
+ if (hg == 100 || (hg == -100 && nc->always_asbp)) {
+ int pcount = (device->state.role == R_PRIMARY)
+ + (peer_role == R_PRIMARY);
+ int forced = (hg == -100);
+
+ switch (pcount) {
+ case 0:
+ hg = drbd_asb_recover_0p(peer_device);
+ break;
+ case 1:
+ hg = drbd_asb_recover_1p(peer_device);
+ break;
+ case 2:
+ hg = drbd_asb_recover_2p(peer_device);
+ break;
+ }
+ if (abs(hg) < 100) {
+ drbd_warn(device, "Split-Brain detected, %d primaries, "
+ "automatically solved. Sync from %s node\n",
+ pcount, (hg < 0) ? "peer" : "this");
+ if (forced) {
+ drbd_warn(device, "Doing a full sync, since"
+ " UUIDs where ambiguous.\n");
+ hg = hg*2;
+ }
+ }
+ }
+
+ if (hg == -100) {
+ if (test_bit(DISCARD_MY_DATA, &device->flags) && !(device->p_uuid[UI_FLAGS]&1))
+ hg = -1;
+ if (!test_bit(DISCARD_MY_DATA, &device->flags) && (device->p_uuid[UI_FLAGS]&1))
+ hg = 1;
+
+ if (abs(hg) < 100)
+ drbd_warn(device, "Split-Brain detected, manually solved. "
+ "Sync from %s node\n",
+ (hg < 0) ? "peer" : "this");
+ }
+ rr_conflict = nc->rr_conflict;
+ tentative = nc->tentative;
+ rcu_read_unlock();
+
+ if (hg == -100) {
+ /* FIXME this log message is not correct if we end up here
+ * after an attempted attach on a diskless node.
+ * We just refuse to attach -- well, we drop the "connection"
+ * to that disk, in a way... */
+ drbd_alert(device, "Split-Brain detected but unresolved, dropping connection!\n");
+ drbd_khelper(device, "split-brain");
+ return C_MASK;
+ }
+
+ if (hg > 0 && mydisk <= D_INCONSISTENT) {
+ drbd_err(device, "I shall become SyncSource, but I am inconsistent!\n");
+ return C_MASK;
+ }
+
+ if (hg < 0 && /* by intention we do not use mydisk here. */
+ device->state.role == R_PRIMARY && device->state.disk >= D_CONSISTENT) {
+ switch (rr_conflict) {
+ case ASB_CALL_HELPER:
+ drbd_khelper(device, "pri-lost");
+ /* fall through */
+ case ASB_DISCONNECT:
+ drbd_err(device, "I shall become SyncTarget, but I am primary!\n");
+ return C_MASK;
+ case ASB_VIOLENTLY:
+ drbd_warn(device, "Becoming SyncTarget, violating the stable-data"
+ "assumption\n");
+ }
+ }
+
+ if (tentative || test_bit(CONN_DRY_RUN, &peer_device->connection->flags)) {
+ if (hg == 0)
+ drbd_info(device, "dry-run connect: No resync, would become Connected immediately.\n");
+ else
+ drbd_info(device, "dry-run connect: Would become %s, doing a %s resync.",
+ drbd_conn_str(hg > 0 ? C_SYNC_SOURCE : C_SYNC_TARGET),
+ abs(hg) >= 2 ? "full" : "bit-map based");
+ return C_MASK;
+ }
+
+ if (abs(hg) >= 2) {
+ drbd_info(device, "Writing the whole bitmap, full sync required after drbd_sync_handshake.\n");
+ if (drbd_bitmap_io(device, &drbd_bmio_set_n_write, "set_n_write from sync_handshake",
+ BM_LOCKED_SET_ALLOWED))
+ return C_MASK;
+ }
+
+ if (hg > 0) { /* become sync source. */
+ rv = C_WF_BITMAP_S;
+ } else if (hg < 0) { /* become sync target */
+ rv = C_WF_BITMAP_T;
+ } else {
+ rv = C_CONNECTED;
+ if (drbd_bm_total_weight(device)) {
+ drbd_info(device, "No resync, but %lu bits in bitmap!\n",
+ drbd_bm_total_weight(device));
+ }
+ }
+
+ return rv;
+}
+
+static enum drbd_after_sb_p convert_after_sb(enum drbd_after_sb_p peer)
+{
+ /* ASB_DISCARD_REMOTE - ASB_DISCARD_LOCAL is valid */
+ if (peer == ASB_DISCARD_REMOTE)
+ return ASB_DISCARD_LOCAL;
+
+ /* any other things with ASB_DISCARD_REMOTE or ASB_DISCARD_LOCAL are invalid */
+ if (peer == ASB_DISCARD_LOCAL)
+ return ASB_DISCARD_REMOTE;
+
+ /* everything else is valid if they are equal on both sides. */
+ return peer;
+}
+
+static int receive_protocol(struct drbd_connection *connection, struct packet_info *pi)
+{
+ struct p_protocol *p = pi->data;
+ enum drbd_after_sb_p p_after_sb_0p, p_after_sb_1p, p_after_sb_2p;
+ int p_proto, p_discard_my_data, p_two_primaries, cf;
+ struct net_conf *nc, *old_net_conf, *new_net_conf = NULL;
+ char integrity_alg[SHARED_SECRET_MAX] = "";
+ struct crypto_hash *peer_integrity_tfm = NULL;
+ void *int_dig_in = NULL, *int_dig_vv = NULL;
+
+ p_proto = be32_to_cpu(p->protocol);
+ p_after_sb_0p = be32_to_cpu(p->after_sb_0p);
+ p_after_sb_1p = be32_to_cpu(p->after_sb_1p);
+ p_after_sb_2p = be32_to_cpu(p->after_sb_2p);
+ p_two_primaries = be32_to_cpu(p->two_primaries);
+ cf = be32_to_cpu(p->conn_flags);
+ p_discard_my_data = cf & CF_DISCARD_MY_DATA;
+
+ if (connection->agreed_pro_version >= 87) {
+ int err;
+
+ if (pi->size > sizeof(integrity_alg))
+ return -EIO;
+ err = drbd_recv_all(connection, integrity_alg, pi->size);
+ if (err)
+ return err;
+ integrity_alg[SHARED_SECRET_MAX - 1] = 0;
+ }
+
+ if (pi->cmd != P_PROTOCOL_UPDATE) {
+ clear_bit(CONN_DRY_RUN, &connection->flags);
+
+ if (cf & CF_DRY_RUN)
+ set_bit(CONN_DRY_RUN, &connection->flags);
+
+ rcu_read_lock();
+ nc = rcu_dereference(connection->net_conf);
+
+ if (p_proto != nc->wire_protocol) {
+ drbd_err(connection, "incompatible %s settings\n", "protocol");
+ goto disconnect_rcu_unlock;
+ }
+
+ if (convert_after_sb(p_after_sb_0p) != nc->after_sb_0p) {
+ drbd_err(connection, "incompatible %s settings\n", "after-sb-0pri");
+ goto disconnect_rcu_unlock;
+ }
+
+ if (convert_after_sb(p_after_sb_1p) != nc->after_sb_1p) {
+ drbd_err(connection, "incompatible %s settings\n", "after-sb-1pri");
+ goto disconnect_rcu_unlock;
+ }
+
+ if (convert_after_sb(p_after_sb_2p) != nc->after_sb_2p) {
+ drbd_err(connection, "incompatible %s settings\n", "after-sb-2pri");
+ goto disconnect_rcu_unlock;
+ }
+
+ if (p_discard_my_data && nc->discard_my_data) {
+ drbd_err(connection, "incompatible %s settings\n", "discard-my-data");
+ goto disconnect_rcu_unlock;
+ }
+
+ if (p_two_primaries != nc->two_primaries) {
+ drbd_err(connection, "incompatible %s settings\n", "allow-two-primaries");
+ goto disconnect_rcu_unlock;
+ }
+
+ if (strcmp(integrity_alg, nc->integrity_alg)) {
+ drbd_err(connection, "incompatible %s settings\n", "data-integrity-alg");
+ goto disconnect_rcu_unlock;
+ }
+
+ rcu_read_unlock();
+ }
+
+ if (integrity_alg[0]) {
+ int hash_size;
+
+ /*
+ * We can only change the peer data integrity algorithm
+ * here. Changing our own data integrity algorithm
+ * requires that we send a P_PROTOCOL_UPDATE packet at
+ * the same time; otherwise, the peer has no way to
+ * tell between which packets the algorithm should
+ * change.
+ */
+
+ peer_integrity_tfm = crypto_alloc_hash(integrity_alg, 0, CRYPTO_ALG_ASYNC);
+ if (!peer_integrity_tfm) {
+ drbd_err(connection, "peer data-integrity-alg %s not supported\n",
+ integrity_alg);
+ goto disconnect;
+ }
+
+ hash_size = crypto_hash_digestsize(peer_integrity_tfm);
+ int_dig_in = kmalloc(hash_size, GFP_KERNEL);
+ int_dig_vv = kmalloc(hash_size, GFP_KERNEL);
+ if (!(int_dig_in && int_dig_vv)) {
+ drbd_err(connection, "Allocation of buffers for data integrity checking failed\n");
+ goto disconnect;
+ }
+ }
+
+ new_net_conf = kmalloc(sizeof(struct net_conf), GFP_KERNEL);
+ if (!new_net_conf) {
+ drbd_err(connection, "Allocation of new net_conf failed\n");
+ goto disconnect;
+ }
+
+ mutex_lock(&connection->data.mutex);
+ mutex_lock(&connection->resource->conf_update);
+ old_net_conf = connection->net_conf;
+ *new_net_conf = *old_net_conf;
+
+ new_net_conf->wire_protocol = p_proto;
+ new_net_conf->after_sb_0p = convert_after_sb(p_after_sb_0p);
+ new_net_conf->after_sb_1p = convert_after_sb(p_after_sb_1p);
+ new_net_conf->after_sb_2p = convert_after_sb(p_after_sb_2p);
+ new_net_conf->two_primaries = p_two_primaries;
+
+ rcu_assign_pointer(connection->net_conf, new_net_conf);
+ mutex_unlock(&connection->resource->conf_update);
+ mutex_unlock(&connection->data.mutex);
+
+ crypto_free_hash(connection->peer_integrity_tfm);
+ kfree(connection->int_dig_in);
+ kfree(connection->int_dig_vv);
+ connection->peer_integrity_tfm = peer_integrity_tfm;
+ connection->int_dig_in = int_dig_in;
+ connection->int_dig_vv = int_dig_vv;
+
+ if (strcmp(old_net_conf->integrity_alg, integrity_alg))
+ drbd_info(connection, "peer data-integrity-alg: %s\n",
+ integrity_alg[0] ? integrity_alg : "(none)");
+
+ synchronize_rcu();
+ kfree(old_net_conf);
+ return 0;
+
+disconnect_rcu_unlock:
+ rcu_read_unlock();
+disconnect:
+ crypto_free_hash(peer_integrity_tfm);
+ kfree(int_dig_in);
+ kfree(int_dig_vv);
+ conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD);
+ return -EIO;
+}
+
+/* helper function
+ * input: alg name, feature name
+ * return: NULL (alg name was "")
+ * ERR_PTR(error) if something goes wrong
+ * or the crypto hash ptr, if it worked out ok. */
+static struct crypto_hash *drbd_crypto_alloc_digest_safe(const struct drbd_device *device,
+ const char *alg, const char *name)
+{
+ struct crypto_hash *tfm;
+
+ if (!alg[0])
+ return NULL;
+
+ tfm = crypto_alloc_hash(alg, 0, CRYPTO_ALG_ASYNC);
+ if (IS_ERR(tfm)) {
+ drbd_err(device, "Can not allocate \"%s\" as %s (reason: %ld)\n",
+ alg, name, PTR_ERR(tfm));
+ return tfm;
+ }
+ return tfm;
+}
+
+static int ignore_remaining_packet(struct drbd_connection *connection, struct packet_info *pi)
+{
+ void *buffer = connection->data.rbuf;
+ int size = pi->size;
+
+ while (size) {
+ int s = min_t(int, size, DRBD_SOCKET_BUFFER_SIZE);
+ s = drbd_recv(connection, buffer, s);
+ if (s <= 0) {
+ if (s < 0)
+ return s;
+ break;
+ }
+ size -= s;
+ }
+ if (size)
+ return -EIO;
+ return 0;
+}
+
+/*
+ * config_unknown_volume - device configuration command for unknown volume
+ *
+ * When a device is added to an existing connection, the node on which the
+ * device is added first will send configuration commands to its peer but the
+ * peer will not know about the device yet. It will warn and ignore these
+ * commands. Once the device is added on the second node, the second node will
+ * send the same device configuration commands, but in the other direction.
+ *
+ * (We can also end up here if drbd is misconfigured.)
+ */
+static int config_unknown_volume(struct drbd_connection *connection, struct packet_info *pi)
+{
+ drbd_warn(connection, "%s packet received for volume %u, which is not configured locally\n",
+ cmdname(pi->cmd), pi->vnr);
+ return ignore_remaining_packet(connection, pi);
+}
+
+static int receive_SyncParam(struct drbd_connection *connection, struct packet_info *pi)
+{
+ struct drbd_peer_device *peer_device;
+ struct drbd_device *device;
+ struct p_rs_param_95 *p;
+ unsigned int header_size, data_size, exp_max_sz;
+ struct crypto_hash *verify_tfm = NULL;
+ struct crypto_hash *csums_tfm = NULL;
+ struct net_conf *old_net_conf, *new_net_conf = NULL;
+ struct disk_conf *old_disk_conf = NULL, *new_disk_conf = NULL;
+ const int apv = connection->agreed_pro_version;
+ struct fifo_buffer *old_plan = NULL, *new_plan = NULL;
+ int fifo_size = 0;
+ int err;
+
+ peer_device = conn_peer_device(connection, pi->vnr);
+ if (!peer_device)
+ return config_unknown_volume(connection, pi);
+ device = peer_device->device;
+
+ exp_max_sz = apv <= 87 ? sizeof(struct p_rs_param)
+ : apv == 88 ? sizeof(struct p_rs_param)
+ + SHARED_SECRET_MAX
+ : apv <= 94 ? sizeof(struct p_rs_param_89)
+ : /* apv >= 95 */ sizeof(struct p_rs_param_95);
+
+ if (pi->size > exp_max_sz) {
+ drbd_err(device, "SyncParam packet too long: received %u, expected <= %u bytes\n",
+ pi->size, exp_max_sz);
+ return -EIO;
+ }
+
+ if (apv <= 88) {
+ header_size = sizeof(struct p_rs_param);
+ data_size = pi->size - header_size;
+ } else if (apv <= 94) {
+ header_size = sizeof(struct p_rs_param_89);
+ data_size = pi->size - header_size;
+ D_ASSERT(device, data_size == 0);
+ } else {
+ header_size = sizeof(struct p_rs_param_95);
+ data_size = pi->size - header_size;
+ D_ASSERT(device, data_size == 0);
+ }
+
+ /* initialize verify_alg and csums_alg */
+ p = pi->data;
+ memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
+
+ err = drbd_recv_all(peer_device->connection, p, header_size);
+ if (err)
+ return err;
+
+ mutex_lock(&connection->resource->conf_update);
+ old_net_conf = peer_device->connection->net_conf;
+ if (get_ldev(device)) {
+ new_disk_conf = kzalloc(sizeof(struct disk_conf), GFP_KERNEL);
+ if (!new_disk_conf) {
+ put_ldev(device);
+ mutex_unlock(&connection->resource->conf_update);
+ drbd_err(device, "Allocation of new disk_conf failed\n");
+ return -ENOMEM;
+ }
+
+ old_disk_conf = device->ldev->disk_conf;
+ *new_disk_conf = *old_disk_conf;
+
+ new_disk_conf->resync_rate = be32_to_cpu(p->resync_rate);
+ }
+
+ if (apv >= 88) {
+ if (apv == 88) {
+ if (data_size > SHARED_SECRET_MAX || data_size == 0) {
+ drbd_err(device, "verify-alg of wrong size, "
+ "peer wants %u, accepting only up to %u byte\n",
+ data_size, SHARED_SECRET_MAX);
+ err = -EIO;
+ goto reconnect;
+ }
+
+ err = drbd_recv_all(peer_device->connection, p->verify_alg, data_size);
+ if (err)
+ goto reconnect;
+ /* we expect NUL terminated string */
+ /* but just in case someone tries to be evil */
+ D_ASSERT(device, p->verify_alg[data_size-1] == 0);
+ p->verify_alg[data_size-1] = 0;
+
+ } else /* apv >= 89 */ {
+ /* we still expect NUL terminated strings */
+ /* but just in case someone tries to be evil */
+ D_ASSERT(device, p->verify_alg[SHARED_SECRET_MAX-1] == 0);
+ D_ASSERT(device, p->csums_alg[SHARED_SECRET_MAX-1] == 0);
+ p->verify_alg[SHARED_SECRET_MAX-1] = 0;
+ p->csums_alg[SHARED_SECRET_MAX-1] = 0;
+ }
+
+ if (strcmp(old_net_conf->verify_alg, p->verify_alg)) {
+ if (device->state.conn == C_WF_REPORT_PARAMS) {
+ drbd_err(device, "Different verify-alg settings. me=\"%s\" peer=\"%s\"\n",
+ old_net_conf->verify_alg, p->verify_alg);
+ goto disconnect;
+ }
+ verify_tfm = drbd_crypto_alloc_digest_safe(device,
+ p->verify_alg, "verify-alg");
+ if (IS_ERR(verify_tfm)) {
+ verify_tfm = NULL;
+ goto disconnect;
+ }
+ }
+
+ if (apv >= 89 && strcmp(old_net_conf->csums_alg, p->csums_alg)) {
+ if (device->state.conn == C_WF_REPORT_PARAMS) {
+ drbd_err(device, "Different csums-alg settings. me=\"%s\" peer=\"%s\"\n",
+ old_net_conf->csums_alg, p->csums_alg);
+ goto disconnect;
+ }
+ csums_tfm = drbd_crypto_alloc_digest_safe(device,
+ p->csums_alg, "csums-alg");
+ if (IS_ERR(csums_tfm)) {
+ csums_tfm = NULL;
+ goto disconnect;
+ }
+ }
+
+ if (apv > 94 && new_disk_conf) {
+ new_disk_conf->c_plan_ahead = be32_to_cpu(p->c_plan_ahead);
+ new_disk_conf->c_delay_target = be32_to_cpu(p->c_delay_target);
+ new_disk_conf->c_fill_target = be32_to_cpu(p->c_fill_target);
+ new_disk_conf->c_max_rate = be32_to_cpu(p->c_max_rate);
+
+ fifo_size = (new_disk_conf->c_plan_ahead * 10 * SLEEP_TIME) / HZ;
+ if (fifo_size != device->rs_plan_s->size) {
+ new_plan = fifo_alloc(fifo_size);
+ if (!new_plan) {
+ drbd_err(device, "kmalloc of fifo_buffer failed");
+ put_ldev(device);
+ goto disconnect;
+ }
+ }
+ }
+
+ if (verify_tfm || csums_tfm) {
+ new_net_conf = kzalloc(sizeof(struct net_conf), GFP_KERNEL);
+ if (!new_net_conf) {
+ drbd_err(device, "Allocation of new net_conf failed\n");
+ goto disconnect;
+ }
+
+ *new_net_conf = *old_net_conf;
+
+ if (verify_tfm) {
+ strcpy(new_net_conf->verify_alg, p->verify_alg);
+ new_net_conf->verify_alg_len = strlen(p->verify_alg) + 1;
+ crypto_free_hash(peer_device->connection->verify_tfm);
+ peer_device->connection->verify_tfm = verify_tfm;
+ drbd_info(device, "using verify-alg: \"%s\"\n", p->verify_alg);
+ }
+ if (csums_tfm) {
+ strcpy(new_net_conf->csums_alg, p->csums_alg);
+ new_net_conf->csums_alg_len = strlen(p->csums_alg) + 1;
+ crypto_free_hash(peer_device->connection->csums_tfm);
+ peer_device->connection->csums_tfm = csums_tfm;
+ drbd_info(device, "using csums-alg: \"%s\"\n", p->csums_alg);
+ }
+ rcu_assign_pointer(connection->net_conf, new_net_conf);
+ }
+ }
+
+ if (new_disk_conf) {
+ rcu_assign_pointer(device->ldev->disk_conf, new_disk_conf);
+ put_ldev(device);
+ }
+
+ if (new_plan) {
+ old_plan = device->rs_plan_s;
+ rcu_assign_pointer(device->rs_plan_s, new_plan);
+ }
+
+ mutex_unlock(&connection->resource->conf_update);
+ synchronize_rcu();
+ if (new_net_conf)
+ kfree(old_net_conf);
+ kfree(old_disk_conf);
+ kfree(old_plan);
+
+ return 0;
+
+reconnect:
+ if (new_disk_conf) {
+ put_ldev(device);
+ kfree(new_disk_conf);
+ }
+ mutex_unlock(&connection->resource->conf_update);
+ return -EIO;
+
+disconnect:
+ kfree(new_plan);
+ if (new_disk_conf) {
+ put_ldev(device);
+ kfree(new_disk_conf);
+ }
+ mutex_unlock(&connection->resource->conf_update);
+ /* just for completeness: actually not needed,
+ * as this is not reached if csums_tfm was ok. */
+ crypto_free_hash(csums_tfm);
+ /* but free the verify_tfm again, if csums_tfm did not work out */
+ crypto_free_hash(verify_tfm);
+ conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD);
+ return -EIO;
+}
+
+/* warn if the arguments differ by more than 12.5% */
+static void warn_if_differ_considerably(struct drbd_device *device,
+ const char *s, sector_t a, sector_t b)
+{
+ sector_t d;
+ if (a == 0 || b == 0)
+ return;
+ d = (a > b) ? (a - b) : (b - a);
+ if (d > (a>>3) || d > (b>>3))
+ drbd_warn(device, "Considerable difference in %s: %llus vs. %llus\n", s,
+ (unsigned long long)a, (unsigned long long)b);
+}
+
+static int receive_sizes(struct drbd_connection *connection, struct packet_info *pi)
+{
+ struct drbd_peer_device *peer_device;
+ struct drbd_device *device;
+ struct p_sizes *p = pi->data;
+ enum determine_dev_size dd = DS_UNCHANGED;
+ sector_t p_size, p_usize, p_csize, my_usize;
+ int ldsc = 0; /* local disk size changed */
+ enum dds_flags ddsf;
+
+ peer_device = conn_peer_device(connection, pi->vnr);
+ if (!peer_device)
+ return config_unknown_volume(connection, pi);
+ device = peer_device->device;
+
+ p_size = be64_to_cpu(p->d_size);
+ p_usize = be64_to_cpu(p->u_size);
+ p_csize = be64_to_cpu(p->c_size);
+
+ /* just store the peer's disk size for now.
+ * we still need to figure out whether we accept that. */
+ device->p_size = p_size;
+
+ if (get_ldev(device)) {
+ rcu_read_lock();
+ my_usize = rcu_dereference(device->ldev->disk_conf)->disk_size;
+ rcu_read_unlock();
+
+ warn_if_differ_considerably(device, "lower level device sizes",
+ p_size, drbd_get_max_capacity(device->ldev));
+ warn_if_differ_considerably(device, "user requested size",
+ p_usize, my_usize);
+
+ /* if this is the first connect, or an otherwise expected
+ * param exchange, choose the minimum */
+ if (device->state.conn == C_WF_REPORT_PARAMS)
+ p_usize = min_not_zero(my_usize, p_usize);
+
+ /* Never shrink a device with usable data during connect.
+ But allow online shrinking if we are connected. */
+ if (drbd_new_dev_size(device, device->ldev, p_usize, 0) <
+ drbd_get_capacity(device->this_bdev) &&
+ device->state.disk >= D_OUTDATED &&
+ device->state.conn < C_CONNECTED) {
+ drbd_err(device, "The peer's disk size is too small!\n");
+ conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD);
+ put_ldev(device);
+ return -EIO;
+ }
+
+ if (my_usize != p_usize) {
+ struct disk_conf *old_disk_conf, *new_disk_conf = NULL;
+
+ new_disk_conf = kzalloc(sizeof(struct disk_conf), GFP_KERNEL);
+ if (!new_disk_conf) {
+ drbd_err(device, "Allocation of new disk_conf failed\n");
+ put_ldev(device);
+ return -ENOMEM;
+ }
+
+ mutex_lock(&connection->resource->conf_update);
+ old_disk_conf = device->ldev->disk_conf;
+ *new_disk_conf = *old_disk_conf;
+ new_disk_conf->disk_size = p_usize;
+
+ rcu_assign_pointer(device->ldev->disk_conf, new_disk_conf);
+ mutex_unlock(&connection->resource->conf_update);
+ synchronize_rcu();
+ kfree(old_disk_conf);
+
+ drbd_info(device, "Peer sets u_size to %lu sectors\n",
+ (unsigned long)my_usize);
+ }
+
+ put_ldev(device);
+ }
+
+ device->peer_max_bio_size = be32_to_cpu(p->max_bio_size);
+ /* Leave drbd_reconsider_max_bio_size() before drbd_determine_dev_size().
+ In case we cleared the QUEUE_FLAG_DISCARD from our queue in
+ drbd_reconsider_max_bio_size(), we can be sure that after
+ drbd_determine_dev_size() no REQ_DISCARDs are in the queue. */
+
+ ddsf = be16_to_cpu(p->dds_flags);
+ if (get_ldev(device)) {
+ drbd_reconsider_max_bio_size(device, device->ldev);
+ dd = drbd_determine_dev_size(device, ddsf, NULL);
+ put_ldev(device);
+ if (dd == DS_ERROR)
+ return -EIO;
+ drbd_md_sync(device);
+ } else {
+ /*
+ * I am diskless, need to accept the peer's *current* size.
+ * I must NOT accept the peers backing disk size,
+ * it may have been larger than mine all along...
+ *
+ * At this point, the peer knows more about my disk, or at
+ * least about what we last agreed upon, than myself.
+ * So if his c_size is less than his d_size, the most likely
+ * reason is that *my* d_size was smaller last time we checked.
+ *
+ * However, if he sends a zero current size,
+ * take his (user-capped or) backing disk size anyways.
+ */
+ drbd_reconsider_max_bio_size(device, NULL);
+ drbd_set_my_capacity(device, p_csize ?: p_usize ?: p_size);
+ }
+
+ if (get_ldev(device)) {
+ if (device->ldev->known_size != drbd_get_capacity(device->ldev->backing_bdev)) {
+ device->ldev->known_size = drbd_get_capacity(device->ldev->backing_bdev);
+ ldsc = 1;
+ }
+
+ put_ldev(device);
+ }
+
+ if (device->state.conn > C_WF_REPORT_PARAMS) {
+ if (be64_to_cpu(p->c_size) !=
+ drbd_get_capacity(device->this_bdev) || ldsc) {
+ /* we have different sizes, probably peer
+ * needs to know my new size... */
+ drbd_send_sizes(peer_device, 0, ddsf);
+ }
+ if (test_and_clear_bit(RESIZE_PENDING, &device->flags) ||
+ (dd == DS_GREW && device->state.conn == C_CONNECTED)) {
+ if (device->state.pdsk >= D_INCONSISTENT &&
+ device->state.disk >= D_INCONSISTENT) {
+ if (ddsf & DDSF_NO_RESYNC)
+ drbd_info(device, "Resync of new storage suppressed with --assume-clean\n");
+ else
+ resync_after_online_grow(device);
+ } else
+ set_bit(RESYNC_AFTER_NEG, &device->flags);
+ }
+ }
+
+ return 0;
+}
+
+static int receive_uuids(struct drbd_connection *connection, struct packet_info *pi)
+{
+ struct drbd_peer_device *peer_device;
+ struct drbd_device *device;
+ struct p_uuids *p = pi->data;
+ u64 *p_uuid;
+ int i, updated_uuids = 0;
+
+ peer_device = conn_peer_device(connection, pi->vnr);
+ if (!peer_device)
+ return config_unknown_volume(connection, pi);
+ device = peer_device->device;
+
+ p_uuid = kmalloc(sizeof(u64)*UI_EXTENDED_SIZE, GFP_NOIO);
+ if (!p_uuid) {
+ drbd_err(device, "kmalloc of p_uuid failed\n");
+ return false;
+ }
+
+ for (i = UI_CURRENT; i < UI_EXTENDED_SIZE; i++)
+ p_uuid[i] = be64_to_cpu(p->uuid[i]);
+
+ kfree(device->p_uuid);
+ device->p_uuid = p_uuid;
+
+ if (device->state.conn < C_CONNECTED &&
+ device->state.disk < D_INCONSISTENT &&
+ device->state.role == R_PRIMARY &&
+ (device->ed_uuid & ~((u64)1)) != (p_uuid[UI_CURRENT] & ~((u64)1))) {
+ drbd_err(device, "Can only connect to data with current UUID=%016llX\n",
+ (unsigned long long)device->ed_uuid);
+ conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD);
+ return -EIO;
+ }
+
+ if (get_ldev(device)) {
+ int skip_initial_sync =
+ device->state.conn == C_CONNECTED &&
+ peer_device->connection->agreed_pro_version >= 90 &&
+ device->ldev->md.uuid[UI_CURRENT] == UUID_JUST_CREATED &&
+ (p_uuid[UI_FLAGS] & 8);
+ if (skip_initial_sync) {
+ drbd_info(device, "Accepted new current UUID, preparing to skip initial sync\n");
+ drbd_bitmap_io(device, &drbd_bmio_clear_n_write,
+ "clear_n_write from receive_uuids",
+ BM_LOCKED_TEST_ALLOWED);
+ _drbd_uuid_set(device, UI_CURRENT, p_uuid[UI_CURRENT]);
+ _drbd_uuid_set(device, UI_BITMAP, 0);
+ _drbd_set_state(_NS2(device, disk, D_UP_TO_DATE, pdsk, D_UP_TO_DATE),
+ CS_VERBOSE, NULL);
+ drbd_md_sync(device);
+ updated_uuids = 1;
+ }
+ put_ldev(device);
+ } else if (device->state.disk < D_INCONSISTENT &&
+ device->state.role == R_PRIMARY) {
+ /* I am a diskless primary, the peer just created a new current UUID
+ for me. */
+ updated_uuids = drbd_set_ed_uuid(device, p_uuid[UI_CURRENT]);
+ }
+
+ /* Before we test for the disk state, we should wait until an eventually
+ ongoing cluster wide state change is finished. That is important if
+ we are primary and are detaching from our disk. We need to see the
+ new disk state... */
+ mutex_lock(device->state_mutex);
+ mutex_unlock(device->state_mutex);
+ if (device->state.conn >= C_CONNECTED && device->state.disk < D_INCONSISTENT)
+ updated_uuids |= drbd_set_ed_uuid(device, p_uuid[UI_CURRENT]);
+
+ if (updated_uuids)
+ drbd_print_uuids(device, "receiver updated UUIDs to");
+
+ return 0;
+}
+
+/**
+ * convert_state() - Converts the peer's view of the cluster state to our point of view
+ * @ps: The state as seen by the peer.
+ */
+static union drbd_state convert_state(union drbd_state ps)
+{
+ union drbd_state ms;
+
+ static enum drbd_conns c_tab[] = {
+ [C_WF_REPORT_PARAMS] = C_WF_REPORT_PARAMS,
+ [C_CONNECTED] = C_CONNECTED,
+
+ [C_STARTING_SYNC_S] = C_STARTING_SYNC_T,
+ [C_STARTING_SYNC_T] = C_STARTING_SYNC_S,
+ [C_DISCONNECTING] = C_TEAR_DOWN, /* C_NETWORK_FAILURE, */
+ [C_VERIFY_S] = C_VERIFY_T,
+ [C_MASK] = C_MASK,
+ };
+
+ ms.i = ps.i;
+
+ ms.conn = c_tab[ps.conn];
+ ms.peer = ps.role;
+ ms.role = ps.peer;
+ ms.pdsk = ps.disk;
+ ms.disk = ps.pdsk;
+ ms.peer_isp = (ps.aftr_isp | ps.user_isp);
+
+ return ms;
+}
+
+static int receive_req_state(struct drbd_connection *connection, struct packet_info *pi)
+{
+ struct drbd_peer_device *peer_device;
+ struct drbd_device *device;
+ struct p_req_state *p = pi->data;
+ union drbd_state mask, val;
+ enum drbd_state_rv rv;
+
+ peer_device = conn_peer_device(connection, pi->vnr);
+ if (!peer_device)
+ return -EIO;
+ device = peer_device->device;
+
+ mask.i = be32_to_cpu(p->mask);
+ val.i = be32_to_cpu(p->val);
+
+ if (test_bit(RESOLVE_CONFLICTS, &peer_device->connection->flags) &&
+ mutex_is_locked(device->state_mutex)) {
+ drbd_send_sr_reply(peer_device, SS_CONCURRENT_ST_CHG);
+ return 0;
+ }
+
+ mask = convert_state(mask);
+ val = convert_state(val);
+
+ rv = drbd_change_state(device, CS_VERBOSE, mask, val);
+ drbd_send_sr_reply(peer_device, rv);
+
+ drbd_md_sync(device);
+
+ return 0;
+}
+
+static int receive_req_conn_state(struct drbd_connection *connection, struct packet_info *pi)
+{
+ struct p_req_state *p = pi->data;
+ union drbd_state mask, val;
+ enum drbd_state_rv rv;
+
+ mask.i = be32_to_cpu(p->mask);
+ val.i = be32_to_cpu(p->val);
+
+ if (test_bit(RESOLVE_CONFLICTS, &connection->flags) &&
+ mutex_is_locked(&connection->cstate_mutex)) {
+ conn_send_sr_reply(connection, SS_CONCURRENT_ST_CHG);
+ return 0;
+ }
+
+ mask = convert_state(mask);
+ val = convert_state(val);
+
+ rv = conn_request_state(connection, mask, val, CS_VERBOSE | CS_LOCAL_ONLY | CS_IGN_OUTD_FAIL);
+ conn_send_sr_reply(connection, rv);
+
+ return 0;
+}
+
+static int receive_state(struct drbd_connection *connection, struct packet_info *pi)
+{
+ struct drbd_peer_device *peer_device;
+ struct drbd_device *device;
+ struct p_state *p = pi->data;
+ union drbd_state os, ns, peer_state;
+ enum drbd_disk_state real_peer_disk;
+ enum chg_state_flags cs_flags;
+ int rv;
+
+ peer_device = conn_peer_device(connection, pi->vnr);
+ if (!peer_device)
+ return config_unknown_volume(connection, pi);
+ device = peer_device->device;
+
+ peer_state.i = be32_to_cpu(p->state);
+
+ real_peer_disk = peer_state.disk;
+ if (peer_state.disk == D_NEGOTIATING) {
+ real_peer_disk = device->p_uuid[UI_FLAGS] & 4 ? D_INCONSISTENT : D_CONSISTENT;
+ drbd_info(device, "real peer disk state = %s\n", drbd_disk_str(real_peer_disk));
+ }
+
+ spin_lock_irq(&device->resource->req_lock);
+ retry:
+ os = ns = drbd_read_state(device);
+ spin_unlock_irq(&device->resource->req_lock);
+
+ /* If some other part of the code (asender thread, timeout)
+ * already decided to close the connection again,
+ * we must not "re-establish" it here. */
+ if (os.conn <= C_TEAR_DOWN)
+ return -ECONNRESET;
+
+ /* If this is the "end of sync" confirmation, usually the peer disk
+ * transitions from D_INCONSISTENT to D_UP_TO_DATE. For empty (0 bits
+ * set) resync started in PausedSyncT, or if the timing of pause-/
+ * unpause-sync events has been "just right", the peer disk may
+ * transition from D_CONSISTENT to D_UP_TO_DATE as well.
+ */
+ if ((os.pdsk == D_INCONSISTENT || os.pdsk == D_CONSISTENT) &&
+ real_peer_disk == D_UP_TO_DATE &&
+ os.conn > C_CONNECTED && os.disk == D_UP_TO_DATE) {
+ /* If we are (becoming) SyncSource, but peer is still in sync
+ * preparation, ignore its uptodate-ness to avoid flapping, it
+ * will change to inconsistent once the peer reaches active
+ * syncing states.
+ * It may have changed syncer-paused flags, however, so we
+ * cannot ignore this completely. */
+ if (peer_state.conn > C_CONNECTED &&
+ peer_state.conn < C_SYNC_SOURCE)
+ real_peer_disk = D_INCONSISTENT;
+
+ /* if peer_state changes to connected at the same time,
+ * it explicitly notifies us that it finished resync.
+ * Maybe we should finish it up, too? */
+ else if (os.conn >= C_SYNC_SOURCE &&
+ peer_state.conn == C_CONNECTED) {
+ if (drbd_bm_total_weight(device) <= device->rs_failed)
+ drbd_resync_finished(device);
+ return 0;
+ }
+ }
+
+ /* explicit verify finished notification, stop sector reached. */
+ if (os.conn == C_VERIFY_T && os.disk == D_UP_TO_DATE &&
+ peer_state.conn == C_CONNECTED && real_peer_disk == D_UP_TO_DATE) {
+ ov_out_of_sync_print(device);
+ drbd_resync_finished(device);
+ return 0;
+ }
+
+ /* peer says his disk is inconsistent, while we think it is uptodate,
+ * and this happens while the peer still thinks we have a sync going on,
+ * but we think we are already done with the sync.
+ * We ignore this to avoid flapping pdsk.
+ * This should not happen, if the peer is a recent version of drbd. */
+ if (os.pdsk == D_UP_TO_DATE && real_peer_disk == D_INCONSISTENT &&
+ os.conn == C_CONNECTED && peer_state.conn > C_SYNC_SOURCE)
+ real_peer_disk = D_UP_TO_DATE;
+
+ if (ns.conn == C_WF_REPORT_PARAMS)
+ ns.conn = C_CONNECTED;
+
+ if (peer_state.conn == C_AHEAD)
+ ns.conn = C_BEHIND;
+
+ if (device->p_uuid && peer_state.disk >= D_NEGOTIATING &&
+ get_ldev_if_state(device, D_NEGOTIATING)) {
+ int cr; /* consider resync */
+
+ /* if we established a new connection */
+ cr = (os.conn < C_CONNECTED);
+ /* if we had an established connection
+ * and one of the nodes newly attaches a disk */
+ cr |= (os.conn == C_CONNECTED &&
+ (peer_state.disk == D_NEGOTIATING ||
+ os.disk == D_NEGOTIATING));
+ /* if we have both been inconsistent, and the peer has been
+ * forced to be UpToDate with --overwrite-data */
+ cr |= test_bit(CONSIDER_RESYNC, &device->flags);
+ /* if we had been plain connected, and the admin requested to
+ * start a sync by "invalidate" or "invalidate-remote" */
+ cr |= (os.conn == C_CONNECTED &&
+ (peer_state.conn >= C_STARTING_SYNC_S &&
+ peer_state.conn <= C_WF_BITMAP_T));
+
+ if (cr)
+ ns.conn = drbd_sync_handshake(peer_device, peer_state.role, real_peer_disk);
+
+ put_ldev(device);
+ if (ns.conn == C_MASK) {
+ ns.conn = C_CONNECTED;
+ if (device->state.disk == D_NEGOTIATING) {
+ drbd_force_state(device, NS(disk, D_FAILED));
+ } else if (peer_state.disk == D_NEGOTIATING) {
+ drbd_err(device, "Disk attach process on the peer node was aborted.\n");
+ peer_state.disk = D_DISKLESS;
+ real_peer_disk = D_DISKLESS;
+ } else {
+ if (test_and_clear_bit(CONN_DRY_RUN, &peer_device->connection->flags))
+ return -EIO;
+ D_ASSERT(device, os.conn == C_WF_REPORT_PARAMS);
+ conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD);
+ return -EIO;
+ }
+ }
+ }
+
+ spin_lock_irq(&device->resource->req_lock);
+ if (os.i != drbd_read_state(device).i)
+ goto retry;
+ clear_bit(CONSIDER_RESYNC, &device->flags);
+ ns.peer = peer_state.role;
+ ns.pdsk = real_peer_disk;
+ ns.peer_isp = (peer_state.aftr_isp | peer_state.user_isp);
+ if ((ns.conn == C_CONNECTED || ns.conn == C_WF_BITMAP_S) && ns.disk == D_NEGOTIATING)
+ ns.disk = device->new_state_tmp.disk;
+ cs_flags = CS_VERBOSE + (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED ? 0 : CS_HARD);
+ if (ns.pdsk == D_CONSISTENT && drbd_suspended(device) && ns.conn == C_CONNECTED && os.conn < C_CONNECTED &&
+ test_bit(NEW_CUR_UUID, &device->flags)) {
+ /* Do not allow tl_restart(RESEND) for a rebooted peer. We can only allow this
+ for temporal network outages! */
+ spin_unlock_irq(&device->resource->req_lock);
+ drbd_err(device, "Aborting Connect, can not thaw IO with an only Consistent peer\n");
+ tl_clear(peer_device->connection);
+ drbd_uuid_new_current(device);
+ clear_bit(NEW_CUR_UUID, &device->flags);
+ conn_request_state(peer_device->connection, NS2(conn, C_PROTOCOL_ERROR, susp, 0), CS_HARD);
+ return -EIO;
+ }
+ rv = _drbd_set_state(device, ns, cs_flags, NULL);
+ ns = drbd_read_state(device);
+ spin_unlock_irq(&device->resource->req_lock);
+
+ if (rv < SS_SUCCESS) {
+ conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD);
+ return -EIO;
+ }
+
+ if (os.conn > C_WF_REPORT_PARAMS) {
+ if (ns.conn > C_CONNECTED && peer_state.conn <= C_CONNECTED &&
+ peer_state.disk != D_NEGOTIATING ) {
+ /* we want resync, peer has not yet decided to sync... */
+ /* Nowadays only used when forcing a node into primary role and
+ setting its disk to UpToDate with that */
+ drbd_send_uuids(peer_device);
+ drbd_send_current_state(peer_device);
+ }
+ }
+
+ clear_bit(DISCARD_MY_DATA, &device->flags);
+
+ drbd_md_sync(device); /* update connected indicator, la_size_sect, ... */
+
+ return 0;
+}
+
+static int receive_sync_uuid(struct drbd_connection *connection, struct packet_info *pi)
+{
+ struct drbd_peer_device *peer_device;
+ struct drbd_device *device;
+ struct p_rs_uuid *p = pi->data;
+
+ peer_device = conn_peer_device(connection, pi->vnr);
+ if (!peer_device)
+ return -EIO;
+ device = peer_device->device;
+
+ wait_event(device->misc_wait,
+ device->state.conn == C_WF_SYNC_UUID ||
+ device->state.conn == C_BEHIND ||
+ device->state.conn < C_CONNECTED ||
+ device->state.disk < D_NEGOTIATING);
+
+ /* D_ASSERT(device, device->state.conn == C_WF_SYNC_UUID ); */
+
+ /* Here the _drbd_uuid_ functions are right, current should
+ _not_ be rotated into the history */
+ if (get_ldev_if_state(device, D_NEGOTIATING)) {
+ _drbd_uuid_set(device, UI_CURRENT, be64_to_cpu(p->uuid));
+ _drbd_uuid_set(device, UI_BITMAP, 0UL);
+
+ drbd_print_uuids(device, "updated sync uuid");
+ drbd_start_resync(device, C_SYNC_TARGET);
+
+ put_ldev(device);
+ } else
+ drbd_err(device, "Ignoring SyncUUID packet!\n");
+
+ return 0;
+}
+
+/**
+ * receive_bitmap_plain
+ *
+ * Return 0 when done, 1 when another iteration is needed, and a negative error
+ * code upon failure.
+ */
+static int
+receive_bitmap_plain(struct drbd_peer_device *peer_device, unsigned int size,
+ unsigned long *p, struct bm_xfer_ctx *c)
+{
+ unsigned int data_size = DRBD_SOCKET_BUFFER_SIZE -
+ drbd_header_size(peer_device->connection);
+ unsigned int num_words = min_t(size_t, data_size / sizeof(*p),
+ c->bm_words - c->word_offset);
+ unsigned int want = num_words * sizeof(*p);
+ int err;
+
+ if (want != size) {
+ drbd_err(peer_device, "%s:want (%u) != size (%u)\n", __func__, want, size);
+ return -EIO;
+ }
+ if (want == 0)
+ return 0;
+ err = drbd_recv_all(peer_device->connection, p, want);
+ if (err)
+ return err;
+
+ drbd_bm_merge_lel(peer_device->device, c->word_offset, num_words, p);
+
+ c->word_offset += num_words;
+ c->bit_offset = c->word_offset * BITS_PER_LONG;
+ if (c->bit_offset > c->bm_bits)
+ c->bit_offset = c->bm_bits;
+
+ return 1;
+}
+
+static enum drbd_bitmap_code dcbp_get_code(struct p_compressed_bm *p)
+{
+ return (enum drbd_bitmap_code)(p->encoding & 0x0f);
+}
+
+static int dcbp_get_start(struct p_compressed_bm *p)
+{
+ return (p->encoding & 0x80) != 0;
+}
+
+static int dcbp_get_pad_bits(struct p_compressed_bm *p)
+{
+ return (p->encoding >> 4) & 0x7;
+}
+
+/**
+ * recv_bm_rle_bits
+ *
+ * Return 0 when done, 1 when another iteration is needed, and a negative error
+ * code upon failure.
+ */
+static int
+recv_bm_rle_bits(struct drbd_peer_device *peer_device,
+ struct p_compressed_bm *p,
+ struct bm_xfer_ctx *c,
+ unsigned int len)
+{
+ struct bitstream bs;
+ u64 look_ahead;
+ u64 rl;
+ u64 tmp;
+ unsigned long s = c->bit_offset;
+ unsigned long e;
+ int toggle = dcbp_get_start(p);
+ int have;
+ int bits;
+
+ bitstream_init(&bs, p->code, len, dcbp_get_pad_bits(p));
+
+ bits = bitstream_get_bits(&bs, &look_ahead, 64);
+ if (bits < 0)
+ return -EIO;
+
+ for (have = bits; have > 0; s += rl, toggle = !toggle) {
+ bits = vli_decode_bits(&rl, look_ahead);
+ if (bits <= 0)
+ return -EIO;
+
+ if (toggle) {
+ e = s + rl -1;
+ if (e >= c->bm_bits) {
+ drbd_err(peer_device, "bitmap overflow (e:%lu) while decoding bm RLE packet\n", e);
+ return -EIO;
+ }
+ _drbd_bm_set_bits(peer_device->device, s, e);
+ }
+
+ if (have < bits) {
+ drbd_err(peer_device, "bitmap decoding error: h:%d b:%d la:0x%08llx l:%u/%u\n",
+ have, bits, look_ahead,
+ (unsigned int)(bs.cur.b - p->code),
+ (unsigned int)bs.buf_len);
+ return -EIO;
+ }
+ /* if we consumed all 64 bits, assign 0; >> 64 is "undefined"; */
+ if (likely(bits < 64))
+ look_ahead >>= bits;
+ else
+ look_ahead = 0;
+ have -= bits;
+
+ bits = bitstream_get_bits(&bs, &tmp, 64 - have);
+ if (bits < 0)
+ return -EIO;
+ look_ahead |= tmp << have;
+ have += bits;
+ }
+
+ c->bit_offset = s;
+ bm_xfer_ctx_bit_to_word_offset(c);
+
+ return (s != c->bm_bits);
+}
+
+/**
+ * decode_bitmap_c
+ *
+ * Return 0 when done, 1 when another iteration is needed, and a negative error
+ * code upon failure.
+ */
+static int
+decode_bitmap_c(struct drbd_peer_device *peer_device,
+ struct p_compressed_bm *p,
+ struct bm_xfer_ctx *c,
+ unsigned int len)
+{
+ if (dcbp_get_code(p) == RLE_VLI_Bits)
+ return recv_bm_rle_bits(peer_device, p, c, len - sizeof(*p));
+
+ /* other variants had been implemented for evaluation,
+ * but have been dropped as this one turned out to be "best"
+ * during all our tests. */
+
+ drbd_err(peer_device, "receive_bitmap_c: unknown encoding %u\n", p->encoding);
+ conn_request_state(peer_device->connection, NS(conn, C_PROTOCOL_ERROR), CS_HARD);
+ return -EIO;
+}
+
+void INFO_bm_xfer_stats(struct drbd_device *device,
+ const char *direction, struct bm_xfer_ctx *c)
+{
+ /* what would it take to transfer it "plaintext" */
+ unsigned int header_size = drbd_header_size(first_peer_device(device)->connection);
+ unsigned int data_size = DRBD_SOCKET_BUFFER_SIZE - header_size;
+ unsigned int plain =
+ header_size * (DIV_ROUND_UP(c->bm_words, data_size) + 1) +
+ c->bm_words * sizeof(unsigned long);
+ unsigned int total = c->bytes[0] + c->bytes[1];
+ unsigned int r;
+
+ /* total can not be zero. but just in case: */
+ if (total == 0)
+ return;
+
+ /* don't report if not compressed */
+ if (total >= plain)
+ return;
+
+ /* total < plain. check for overflow, still */
+ r = (total > UINT_MAX/1000) ? (total / (plain/1000))
+ : (1000 * total / plain);
+
+ if (r > 1000)
+ r = 1000;
+
+ r = 1000 - r;
+ drbd_info(device, "%s bitmap stats [Bytes(packets)]: plain %u(%u), RLE %u(%u), "
+ "total %u; compression: %u.%u%%\n",
+ direction,
+ c->bytes[1], c->packets[1],
+ c->bytes[0], c->packets[0],
+ total, r/10, r % 10);
+}
+
+/* Since we are processing the bitfield from lower addresses to higher,
+ it does not matter if the process it in 32 bit chunks or 64 bit
+ chunks as long as it is little endian. (Understand it as byte stream,
+ beginning with the lowest byte...) If we would use big endian
+ we would need to process it from the highest address to the lowest,
+ in order to be agnostic to the 32 vs 64 bits issue.
+
+ returns 0 on failure, 1 if we successfully received it. */
+static int receive_bitmap(struct drbd_connection *connection, struct packet_info *pi)
+{
+ struct drbd_peer_device *peer_device;
+ struct drbd_device *device;
+ struct bm_xfer_ctx c;
+ int err;
+
+ peer_device = conn_peer_device(connection, pi->vnr);
+ if (!peer_device)
+ return -EIO;
+ device = peer_device->device;
+
+ drbd_bm_lock(device, "receive bitmap", BM_LOCKED_SET_ALLOWED);
+ /* you are supposed to send additional out-of-sync information
+ * if you actually set bits during this phase */
+
+ c = (struct bm_xfer_ctx) {
+ .bm_bits = drbd_bm_bits(device),
+ .bm_words = drbd_bm_words(device),
+ };
+
+ for(;;) {
+ if (pi->cmd == P_BITMAP)
+ err = receive_bitmap_plain(peer_device, pi->size, pi->data, &c);
+ else if (pi->cmd == P_COMPRESSED_BITMAP) {
+ /* MAYBE: sanity check that we speak proto >= 90,
+ * and the feature is enabled! */
+ struct p_compressed_bm *p = pi->data;
+
+ if (pi->size > DRBD_SOCKET_BUFFER_SIZE - drbd_header_size(connection)) {
+ drbd_err(device, "ReportCBitmap packet too large\n");
+ err = -EIO;
+ goto out;
+ }
+ if (pi->size <= sizeof(*p)) {
+ drbd_err(device, "ReportCBitmap packet too small (l:%u)\n", pi->size);
+ err = -EIO;
+ goto out;
+ }
+ err = drbd_recv_all(peer_device->connection, p, pi->size);
+ if (err)
+ goto out;
+ err = decode_bitmap_c(peer_device, p, &c, pi->size);
+ } else {
+ drbd_warn(device, "receive_bitmap: cmd neither ReportBitMap nor ReportCBitMap (is 0x%x)", pi->cmd);
+ err = -EIO;
+ goto out;
+ }
+
+ c.packets[pi->cmd == P_BITMAP]++;
+ c.bytes[pi->cmd == P_BITMAP] += drbd_header_size(connection) + pi->size;
+
+ if (err <= 0) {
+ if (err < 0)
+ goto out;
+ break;
+ }
+ err = drbd_recv_header(peer_device->connection, pi);
+ if (err)
+ goto out;
+ }
+
+ INFO_bm_xfer_stats(device, "receive", &c);
+
+ if (device->state.conn == C_WF_BITMAP_T) {
+ enum drbd_state_rv rv;
+
+ err = drbd_send_bitmap(device);
+ if (err)
+ goto out;
+ /* Omit CS_ORDERED with this state transition to avoid deadlocks. */
+ rv = _drbd_request_state(device, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE);
+ D_ASSERT(device, rv == SS_SUCCESS);
+ } else if (device->state.conn != C_WF_BITMAP_S) {
+ /* admin may have requested C_DISCONNECTING,
+ * other threads may have noticed network errors */
+ drbd_info(device, "unexpected cstate (%s) in receive_bitmap\n",
+ drbd_conn_str(device->state.conn));
+ }
+ err = 0;
+
+ out:
+ drbd_bm_unlock(device);
+ if (!err && device->state.conn == C_WF_BITMAP_S)
+ drbd_start_resync(device, C_SYNC_SOURCE);
+ return err;
+}
+
+static int receive_skip(struct drbd_connection *connection, struct packet_info *pi)
+{
+ drbd_warn(connection, "skipping unknown optional packet type %d, l: %d!\n",
+ pi->cmd, pi->size);
+
+ return ignore_remaining_packet(connection, pi);
+}
+
+static int receive_UnplugRemote(struct drbd_connection *connection, struct packet_info *pi)
+{
+ /* Make sure we've acked all the TCP data associated
+ * with the data requests being unplugged */
+ drbd_tcp_quickack(connection->data.socket);
+
+ return 0;
+}
+
+static int receive_out_of_sync(struct drbd_connection *connection, struct packet_info *pi)
+{
+ struct drbd_peer_device *peer_device;
+ struct drbd_device *device;
+ struct p_block_desc *p = pi->data;
+
+ peer_device = conn_peer_device(connection, pi->vnr);
+ if (!peer_device)
+ return -EIO;
+ device = peer_device->device;
+
+ switch (device->state.conn) {
+ case C_WF_SYNC_UUID:
+ case C_WF_BITMAP_T:
+ case C_BEHIND:
+ break;
+ default:
+ drbd_err(device, "ASSERT FAILED cstate = %s, expected: WFSyncUUID|WFBitMapT|Behind\n",
+ drbd_conn_str(device->state.conn));
+ }
+
+ drbd_set_out_of_sync(device, be64_to_cpu(p->sector), be32_to_cpu(p->blksize));
+
+ return 0;
+}
+
+struct data_cmd {
+ int expect_payload;
+ size_t pkt_size;
+ int (*fn)(struct drbd_connection *, struct packet_info *);
+};
+
+static struct data_cmd drbd_cmd_handler[] = {
+ [P_DATA] = { 1, sizeof(struct p_data), receive_Data },
+ [P_DATA_REPLY] = { 1, sizeof(struct p_data), receive_DataReply },
+ [P_RS_DATA_REPLY] = { 1, sizeof(struct p_data), receive_RSDataReply } ,
+ [P_BARRIER] = { 0, sizeof(struct p_barrier), receive_Barrier } ,
+ [P_BITMAP] = { 1, 0, receive_bitmap } ,
+ [P_COMPRESSED_BITMAP] = { 1, 0, receive_bitmap } ,
+ [P_UNPLUG_REMOTE] = { 0, 0, receive_UnplugRemote },
+ [P_DATA_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest },
+ [P_RS_DATA_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest },
+ [P_SYNC_PARAM] = { 1, 0, receive_SyncParam },
+ [P_SYNC_PARAM89] = { 1, 0, receive_SyncParam },
+ [P_PROTOCOL] = { 1, sizeof(struct p_protocol), receive_protocol },
+ [P_UUIDS] = { 0, sizeof(struct p_uuids), receive_uuids },
+ [P_SIZES] = { 0, sizeof(struct p_sizes), receive_sizes },
+ [P_STATE] = { 0, sizeof(struct p_state), receive_state },
+ [P_STATE_CHG_REQ] = { 0, sizeof(struct p_req_state), receive_req_state },
+ [P_SYNC_UUID] = { 0, sizeof(struct p_rs_uuid), receive_sync_uuid },
+ [P_OV_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest },
+ [P_OV_REPLY] = { 1, sizeof(struct p_block_req), receive_DataRequest },
+ [P_CSUM_RS_REQUEST] = { 1, sizeof(struct p_block_req), receive_DataRequest },
+ [P_DELAY_PROBE] = { 0, sizeof(struct p_delay_probe93), receive_skip },
+ [P_OUT_OF_SYNC] = { 0, sizeof(struct p_block_desc), receive_out_of_sync },
+ [P_CONN_ST_CHG_REQ] = { 0, sizeof(struct p_req_state), receive_req_conn_state },
+ [P_PROTOCOL_UPDATE] = { 1, sizeof(struct p_protocol), receive_protocol },
+ [P_TRIM] = { 0, sizeof(struct p_trim), receive_Data },
+};
+
+static void drbdd(struct drbd_connection *connection)
+{
+ struct packet_info pi;
+ size_t shs; /* sub header size */
+ int err;
+
+ while (get_t_state(&connection->receiver) == RUNNING) {
+ struct data_cmd *cmd;
+
+ drbd_thread_current_set_cpu(&connection->receiver);
+ update_receiver_timing_details(connection, drbd_recv_header);
+ if (drbd_recv_header(connection, &pi))
+ goto err_out;
+
+ cmd = &drbd_cmd_handler[pi.cmd];
+ if (unlikely(pi.cmd >= ARRAY_SIZE(drbd_cmd_handler) || !cmd->fn)) {
+ drbd_err(connection, "Unexpected data packet %s (0x%04x)",
+ cmdname(pi.cmd), pi.cmd);
+ goto err_out;
+ }
+
+ shs = cmd->pkt_size;
+ if (pi.size > shs && !cmd->expect_payload) {
+ drbd_err(connection, "No payload expected %s l:%d\n",
+ cmdname(pi.cmd), pi.size);
+ goto err_out;
+ }
+
+ if (shs) {
+ update_receiver_timing_details(connection, drbd_recv_all_warn);
+ err = drbd_recv_all_warn(connection, pi.data, shs);
+ if (err)
+ goto err_out;
+ pi.size -= shs;
+ }
+
+ update_receiver_timing_details(connection, cmd->fn);
+ err = cmd->fn(connection, &pi);
+ if (err) {
+ drbd_err(connection, "error receiving %s, e: %d l: %d!\n",
+ cmdname(pi.cmd), err, pi.size);
+ goto err_out;
+ }
+ }
+ return;
+
+ err_out:
+ conn_request_state(connection, NS(conn, C_PROTOCOL_ERROR), CS_HARD);
+}
+
+static void conn_disconnect(struct drbd_connection *connection)
+{
+ struct drbd_peer_device *peer_device;
+ enum drbd_conns oc;
+ int vnr;
+
+ if (connection->cstate == C_STANDALONE)
+ return;
+
+ /* We are about to start the cleanup after connection loss.
+ * Make sure drbd_make_request knows about that.
+ * Usually we should be in some network failure state already,
+ * but just in case we are not, we fix it up here.
+ */
+ conn_request_state(connection, NS(conn, C_NETWORK_FAILURE), CS_HARD);
+
+ /* asender does not clean up anything. it must not interfere, either */
+ drbd_thread_stop(&connection->asender);
+ drbd_free_sock(connection);
+
+ rcu_read_lock();
+ idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+ struct drbd_device *device = peer_device->device;
+ kref_get(&device->kref);
+ rcu_read_unlock();
+ drbd_disconnected(peer_device);
+ kref_put(&device->kref, drbd_destroy_device);
+ rcu_read_lock();
+ }
+ rcu_read_unlock();
+
+ if (!list_empty(&connection->current_epoch->list))
+ drbd_err(connection, "ASSERTION FAILED: connection->current_epoch->list not empty\n");
+ /* ok, no more ee's on the fly, it is safe to reset the epoch_size */
+ atomic_set(&connection->current_epoch->epoch_size, 0);
+ connection->send.seen_any_write_yet = false;
+
+ drbd_info(connection, "Connection closed\n");
+
+ if (conn_highest_role(connection) == R_PRIMARY && conn_highest_pdsk(connection) >= D_UNKNOWN)
+ conn_try_outdate_peer_async(connection);
+
+ spin_lock_irq(&connection->resource->req_lock);
+ oc = connection->cstate;
+ if (oc >= C_UNCONNECTED)
+ _conn_request_state(connection, NS(conn, C_UNCONNECTED), CS_VERBOSE);
+
+ spin_unlock_irq(&connection->resource->req_lock);
+
+ if (oc == C_DISCONNECTING)
+ conn_request_state(connection, NS(conn, C_STANDALONE), CS_VERBOSE | CS_HARD);
+}
+
+static int drbd_disconnected(struct drbd_peer_device *peer_device)
+{
+ struct drbd_device *device = peer_device->device;
+ unsigned int i;
+
+ /* wait for current activity to cease. */
+ spin_lock_irq(&device->resource->req_lock);
+ _drbd_wait_ee_list_empty(device, &device->active_ee);
+ _drbd_wait_ee_list_empty(device, &device->sync_ee);
+ _drbd_wait_ee_list_empty(device, &device->read_ee);
+ spin_unlock_irq(&device->resource->req_lock);
+
+ /* We do not have data structures that would allow us to
+ * get the rs_pending_cnt down to 0 again.
+ * * On C_SYNC_TARGET we do not have any data structures describing
+ * the pending RSDataRequest's we have sent.
+ * * On C_SYNC_SOURCE there is no data structure that tracks
+ * the P_RS_DATA_REPLY blocks that we sent to the SyncTarget.
+ * And no, it is not the sum of the reference counts in the
+ * resync_LRU. The resync_LRU tracks the whole operation including
+ * the disk-IO, while the rs_pending_cnt only tracks the blocks
+ * on the fly. */
+ drbd_rs_cancel_all(device);
+ device->rs_total = 0;
+ device->rs_failed = 0;
+ atomic_set(&device->rs_pending_cnt, 0);
+ wake_up(&device->misc_wait);
+
+ del_timer_sync(&device->resync_timer);
+ resync_timer_fn((unsigned long)device);
+
+ /* wait for all w_e_end_data_req, w_e_end_rsdata_req, w_send_barrier,
+ * w_make_resync_request etc. which may still be on the worker queue
+ * to be "canceled" */
+ drbd_flush_workqueue(&peer_device->connection->sender_work);
+
+ drbd_finish_peer_reqs(device);
+
+ /* This second workqueue flush is necessary, since drbd_finish_peer_reqs()
+ might have issued a work again. The one before drbd_finish_peer_reqs() is
+ necessary to reclain net_ee in drbd_finish_peer_reqs(). */
+ drbd_flush_workqueue(&peer_device->connection->sender_work);
+
+ /* need to do it again, drbd_finish_peer_reqs() may have populated it
+ * again via drbd_try_clear_on_disk_bm(). */
+ drbd_rs_cancel_all(device);
+
+ kfree(device->p_uuid);
+ device->p_uuid = NULL;
+
+ if (!drbd_suspended(device))
+ tl_clear(peer_device->connection);
+
+ drbd_md_sync(device);
+
+ /* serialize with bitmap writeout triggered by the state change,
+ * if any. */
+ wait_event(device->misc_wait, !test_bit(BITMAP_IO, &device->flags));
+
+ /* tcp_close and release of sendpage pages can be deferred. I don't
+ * want to use SO_LINGER, because apparently it can be deferred for
+ * more than 20 seconds (longest time I checked).
+ *
+ * Actually we don't care for exactly when the network stack does its
+ * put_page(), but release our reference on these pages right here.
+ */
+ i = drbd_free_peer_reqs(device, &device->net_ee);
+ if (i)
+ drbd_info(device, "net_ee not empty, killed %u entries\n", i);
+ i = atomic_read(&device->pp_in_use_by_net);
+ if (i)
+ drbd_info(device, "pp_in_use_by_net = %d, expected 0\n", i);
+ i = atomic_read(&device->pp_in_use);
+ if (i)
+ drbd_info(device, "pp_in_use = %d, expected 0\n", i);
+
+ D_ASSERT(device, list_empty(&device->read_ee));
+ D_ASSERT(device, list_empty(&device->active_ee));
+ D_ASSERT(device, list_empty(&device->sync_ee));
+ D_ASSERT(device, list_empty(&device->done_ee));
+
+ return 0;
+}
+
+/*
+ * We support PRO_VERSION_MIN to PRO_VERSION_MAX. The protocol version
+ * we can agree on is stored in agreed_pro_version.
+ *
+ * feature flags and the reserved array should be enough room for future
+ * enhancements of the handshake protocol, and possible plugins...
+ *
+ * for now, they are expected to be zero, but ignored.
+ */
+static int drbd_send_features(struct drbd_connection *connection)
+{
+ struct drbd_socket *sock;
+ struct p_connection_features *p;
+
+ sock = &connection->data;
+ p = conn_prepare_command(connection, sock);
+ if (!p)
+ return -EIO;
+ memset(p, 0, sizeof(*p));
+ p->protocol_min = cpu_to_be32(PRO_VERSION_MIN);
+ p->protocol_max = cpu_to_be32(PRO_VERSION_MAX);
+ p->feature_flags = cpu_to_be32(PRO_FEATURES);
+ return conn_send_command(connection, sock, P_CONNECTION_FEATURES, sizeof(*p), NULL, 0);
+}
+
+/*
+ * return values:
+ * 1 yes, we have a valid connection
+ * 0 oops, did not work out, please try again
+ * -1 peer talks different language,
+ * no point in trying again, please go standalone.
+ */
+static int drbd_do_features(struct drbd_connection *connection)
+{
+ /* ASSERT current == connection->receiver ... */
+ struct p_connection_features *p;
+ const int expect = sizeof(struct p_connection_features);
+ struct packet_info pi;
+ int err;
+
+ err = drbd_send_features(connection);
+ if (err)
+ return 0;
+
+ err = drbd_recv_header(connection, &pi);
+ if (err)
+ return 0;
+
+ if (pi.cmd != P_CONNECTION_FEATURES) {
+ drbd_err(connection, "expected ConnectionFeatures packet, received: %s (0x%04x)\n",
+ cmdname(pi.cmd), pi.cmd);
+ return -1;
+ }
+
+ if (pi.size != expect) {
+ drbd_err(connection, "expected ConnectionFeatures length: %u, received: %u\n",
+ expect, pi.size);
+ return -1;
+ }
+
+ p = pi.data;
+ err = drbd_recv_all_warn(connection, p, expect);
+ if (err)
+ return 0;
+
+ p->protocol_min = be32_to_cpu(p->protocol_min);
+ p->protocol_max = be32_to_cpu(p->protocol_max);
+ if (p->protocol_max == 0)
+ p->protocol_max = p->protocol_min;
+
+ if (PRO_VERSION_MAX < p->protocol_min ||
+ PRO_VERSION_MIN > p->protocol_max)
+ goto incompat;
+
+ connection->agreed_pro_version = min_t(int, PRO_VERSION_MAX, p->protocol_max);
+ connection->agreed_features = PRO_FEATURES & be32_to_cpu(p->feature_flags);
+
+ drbd_info(connection, "Handshake successful: "
+ "Agreed network protocol version %d\n", connection->agreed_pro_version);
+
+ drbd_info(connection, "Agreed to%ssupport TRIM on protocol level\n",
+ connection->agreed_features & FF_TRIM ? " " : " not ");
+
+ return 1;
+
+ incompat:
+ drbd_err(connection, "incompatible DRBD dialects: "
+ "I support %d-%d, peer supports %d-%d\n",
+ PRO_VERSION_MIN, PRO_VERSION_MAX,
+ p->protocol_min, p->protocol_max);
+ return -1;
+}
+
+#if !defined(CONFIG_CRYPTO_HMAC) && !defined(CONFIG_CRYPTO_HMAC_MODULE)
+static int drbd_do_auth(struct drbd_connection *connection)
+{
+ drbd_err(connection, "This kernel was build without CONFIG_CRYPTO_HMAC.\n");
+ drbd_err(connection, "You need to disable 'cram-hmac-alg' in drbd.conf.\n");
+ return -1;
+}
+#else
+#define CHALLENGE_LEN 64
+
+/* Return value:
+ 1 - auth succeeded,
+ 0 - failed, try again (network error),
+ -1 - auth failed, don't try again.
+*/
+
+static int drbd_do_auth(struct drbd_connection *connection)
+{
+ struct drbd_socket *sock;
+ char my_challenge[CHALLENGE_LEN]; /* 64 Bytes... */
+ struct scatterlist sg;
+ char *response = NULL;
+ char *right_response = NULL;
+ char *peers_ch = NULL;
+ unsigned int key_len;
+ char secret[SHARED_SECRET_MAX]; /* 64 byte */
+ unsigned int resp_size;
+ struct hash_desc desc;
+ struct packet_info pi;
+ struct net_conf *nc;
+ int err, rv;
+
+ /* FIXME: Put the challenge/response into the preallocated socket buffer. */
+
+ rcu_read_lock();
+ nc = rcu_dereference(connection->net_conf);
+ key_len = strlen(nc->shared_secret);
+ memcpy(secret, nc->shared_secret, key_len);
+ rcu_read_unlock();
+
+ desc.tfm = connection->cram_hmac_tfm;
+ desc.flags = 0;
+
+ rv = crypto_hash_setkey(connection->cram_hmac_tfm, (u8 *)secret, key_len);
+ if (rv) {
+ drbd_err(connection, "crypto_hash_setkey() failed with %d\n", rv);
+ rv = -1;
+ goto fail;
+ }
+
+ get_random_bytes(my_challenge, CHALLENGE_LEN);
+
+ sock = &connection->data;
+ if (!conn_prepare_command(connection, sock)) {
+ rv = 0;
+ goto fail;
+ }
+ rv = !conn_send_command(connection, sock, P_AUTH_CHALLENGE, 0,
+ my_challenge, CHALLENGE_LEN);
+ if (!rv)
+ goto fail;
+
+ err = drbd_recv_header(connection, &pi);
+ if (err) {
+ rv = 0;
+ goto fail;
+ }
+
+ if (pi.cmd != P_AUTH_CHALLENGE) {
+ drbd_err(connection, "expected AuthChallenge packet, received: %s (0x%04x)\n",
+ cmdname(pi.cmd), pi.cmd);
+ rv = 0;
+ goto fail;
+ }
+
+ if (pi.size > CHALLENGE_LEN * 2) {
+ drbd_err(connection, "expected AuthChallenge payload too big.\n");
+ rv = -1;
+ goto fail;
+ }
+
+ if (pi.size < CHALLENGE_LEN) {
+ drbd_err(connection, "AuthChallenge payload too small.\n");
+ rv = -1;
+ goto fail;
+ }
+
+ peers_ch = kmalloc(pi.size, GFP_NOIO);
+ if (peers_ch == NULL) {
+ drbd_err(connection, "kmalloc of peers_ch failed\n");
+ rv = -1;
+ goto fail;
+ }
+
+ err = drbd_recv_all_warn(connection, peers_ch, pi.size);
+ if (err) {
+ rv = 0;
+ goto fail;
+ }
+
+ if (!memcmp(my_challenge, peers_ch, CHALLENGE_LEN)) {
+ drbd_err(connection, "Peer presented the same challenge!\n");
+ rv = -1;
+ goto fail;
+ }
+
+ resp_size = crypto_hash_digestsize(connection->cram_hmac_tfm);
+ response = kmalloc(resp_size, GFP_NOIO);
+ if (response == NULL) {
+ drbd_err(connection, "kmalloc of response failed\n");
+ rv = -1;
+ goto fail;
+ }
+
+ sg_init_table(&sg, 1);
+ sg_set_buf(&sg, peers_ch, pi.size);
+
+ rv = crypto_hash_digest(&desc, &sg, sg.length, response);
+ if (rv) {
+ drbd_err(connection, "crypto_hash_digest() failed with %d\n", rv);
+ rv = -1;
+ goto fail;
+ }
+
+ if (!conn_prepare_command(connection, sock)) {
+ rv = 0;
+ goto fail;
+ }
+ rv = !conn_send_command(connection, sock, P_AUTH_RESPONSE, 0,
+ response, resp_size);
+ if (!rv)
+ goto fail;
+
+ err = drbd_recv_header(connection, &pi);
+ if (err) {
+ rv = 0;
+ goto fail;
+ }
+
+ if (pi.cmd != P_AUTH_RESPONSE) {
+ drbd_err(connection, "expected AuthResponse packet, received: %s (0x%04x)\n",
+ cmdname(pi.cmd), pi.cmd);
+ rv = 0;
+ goto fail;
+ }
+
+ if (pi.size != resp_size) {
+ drbd_err(connection, "expected AuthResponse payload of wrong size\n");
+ rv = 0;
+ goto fail;
+ }
+
+ err = drbd_recv_all_warn(connection, response , resp_size);
+ if (err) {
+ rv = 0;
+ goto fail;
+ }
+
+ right_response = kmalloc(resp_size, GFP_NOIO);
+ if (right_response == NULL) {
+ drbd_err(connection, "kmalloc of right_response failed\n");
+ rv = -1;
+ goto fail;
+ }
+
+ sg_set_buf(&sg, my_challenge, CHALLENGE_LEN);
+
+ rv = crypto_hash_digest(&desc, &sg, sg.length, right_response);
+ if (rv) {
+ drbd_err(connection, "crypto_hash_digest() failed with %d\n", rv);
+ rv = -1;
+ goto fail;
+ }
+
+ rv = !memcmp(response, right_response, resp_size);
+
+ if (rv)
+ drbd_info(connection, "Peer authenticated using %d bytes HMAC\n",
+ resp_size);
+ else
+ rv = -1;
+
+ fail:
+ kfree(peers_ch);
+ kfree(response);
+ kfree(right_response);
+
+ return rv;
+}
+#endif
+
+int drbd_receiver(struct drbd_thread *thi)
+{
+ struct drbd_connection *connection = thi->connection;
+ int h;
+
+ drbd_info(connection, "receiver (re)started\n");
+
+ do {
+ h = conn_connect(connection);
+ if (h == 0) {
+ conn_disconnect(connection);
+ schedule_timeout_interruptible(HZ);
+ }
+ if (h == -1) {
+ drbd_warn(connection, "Discarding network configuration.\n");
+ conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD);
+ }
+ } while (h == 0);
+
+ if (h > 0)
+ drbdd(connection);
+
+ conn_disconnect(connection);
+
+ drbd_info(connection, "receiver terminated\n");
+ return 0;
+}
+
+/* ********* acknowledge sender ******** */
+
+static int got_conn_RqSReply(struct drbd_connection *connection, struct packet_info *pi)
+{
+ struct p_req_state_reply *p = pi->data;
+ int retcode = be32_to_cpu(p->retcode);
+
+ if (retcode >= SS_SUCCESS) {
+ set_bit(CONN_WD_ST_CHG_OKAY, &connection->flags);
+ } else {
+ set_bit(CONN_WD_ST_CHG_FAIL, &connection->flags);
+ drbd_err(connection, "Requested state change failed by peer: %s (%d)\n",
+ drbd_set_st_err_str(retcode), retcode);
+ }
+ wake_up(&connection->ping_wait);
+
+ return 0;
+}
+
+static int got_RqSReply(struct drbd_connection *connection, struct packet_info *pi)
+{
+ struct drbd_peer_device *peer_device;
+ struct drbd_device *device;
+ struct p_req_state_reply *p = pi->data;
+ int retcode = be32_to_cpu(p->retcode);
+
+ peer_device = conn_peer_device(connection, pi->vnr);
+ if (!peer_device)
+ return -EIO;
+ device = peer_device->device;
+
+ if (test_bit(CONN_WD_ST_CHG_REQ, &connection->flags)) {
+ D_ASSERT(device, connection->agreed_pro_version < 100);
+ return got_conn_RqSReply(connection, pi);
+ }
+
+ if (retcode >= SS_SUCCESS) {
+ set_bit(CL_ST_CHG_SUCCESS, &device->flags);
+ } else {
+ set_bit(CL_ST_CHG_FAIL, &device->flags);
+ drbd_err(device, "Requested state change failed by peer: %s (%d)\n",
+ drbd_set_st_err_str(retcode), retcode);
+ }
+ wake_up(&device->state_wait);
+
+ return 0;
+}
+
+static int got_Ping(struct drbd_connection *connection, struct packet_info *pi)
+{
+ return drbd_send_ping_ack(connection);
+
+}
+
+static int got_PingAck(struct drbd_connection *connection, struct packet_info *pi)
+{
+ /* restore idle timeout */
+ connection->meta.socket->sk->sk_rcvtimeo = connection->net_conf->ping_int*HZ;
+ if (!test_and_set_bit(GOT_PING_ACK, &connection->flags))
+ wake_up(&connection->ping_wait);
+
+ return 0;
+}
+
+static int got_IsInSync(struct drbd_connection *connection, struct packet_info *pi)
+{
+ struct drbd_peer_device *peer_device;
+ struct drbd_device *device;
+ struct p_block_ack *p = pi->data;
+ sector_t sector = be64_to_cpu(p->sector);
+ int blksize = be32_to_cpu(p->blksize);
+
+ peer_device = conn_peer_device(connection, pi->vnr);
+ if (!peer_device)
+ return -EIO;
+ device = peer_device->device;
+
+ D_ASSERT(device, peer_device->connection->agreed_pro_version >= 89);
+
+ update_peer_seq(peer_device, be32_to_cpu(p->seq_num));
+
+ if (get_ldev(device)) {
+ drbd_rs_complete_io(device, sector);
+ drbd_set_in_sync(device, sector, blksize);
+ /* rs_same_csums is supposed to count in units of BM_BLOCK_SIZE */
+ device->rs_same_csum += (blksize >> BM_BLOCK_SHIFT);
+ put_ldev(device);
+ }
+ dec_rs_pending(device);
+ atomic_add(blksize >> 9, &device->rs_sect_in);
+
+ return 0;
+}
+
+static int
+validate_req_change_req_state(struct drbd_device *device, u64 id, sector_t sector,
+ struct rb_root *root, const char *func,
+ enum drbd_req_event what, bool missing_ok)
+{
+ struct drbd_request *req;
+ struct bio_and_error m;
+
+ spin_lock_irq(&device->resource->req_lock);
+ req = find_request(device, root, id, sector, missing_ok, func);
+ if (unlikely(!req)) {
+ spin_unlock_irq(&device->resource->req_lock);
+ return -EIO;
+ }
+ __req_mod(req, what, &m);
+ spin_unlock_irq(&device->resource->req_lock);
+
+ if (m.bio)
+ complete_master_bio(device, &m);
+ return 0;
+}
+
+static int got_BlockAck(struct drbd_connection *connection, struct packet_info *pi)
+{
+ struct drbd_peer_device *peer_device;
+ struct drbd_device *device;
+ struct p_block_ack *p = pi->data;
+ sector_t sector = be64_to_cpu(p->sector);
+ int blksize = be32_to_cpu(p->blksize);
+ enum drbd_req_event what;
+
+ peer_device = conn_peer_device(connection, pi->vnr);
+ if (!peer_device)
+ return -EIO;
+ device = peer_device->device;
+
+ update_peer_seq(peer_device, be32_to_cpu(p->seq_num));
+
+ if (p->block_id == ID_SYNCER) {
+ drbd_set_in_sync(device, sector, blksize);
+ dec_rs_pending(device);
+ return 0;
+ }
+ switch (pi->cmd) {
+ case P_RS_WRITE_ACK:
+ what = WRITE_ACKED_BY_PEER_AND_SIS;
+ break;
+ case P_WRITE_ACK:
+ what = WRITE_ACKED_BY_PEER;
+ break;
+ case P_RECV_ACK:
+ what = RECV_ACKED_BY_PEER;
+ break;
+ case P_SUPERSEDED:
+ what = CONFLICT_RESOLVED;
+ break;
+ case P_RETRY_WRITE:
+ what = POSTPONE_WRITE;
+ break;
+ default:
+ BUG();
+ }
+
+ return validate_req_change_req_state(device, p->block_id, sector,
+ &device->write_requests, __func__,
+ what, false);
+}
+
+static int got_NegAck(struct drbd_connection *connection, struct packet_info *pi)
+{
+ struct drbd_peer_device *peer_device;
+ struct drbd_device *device;
+ struct p_block_ack *p = pi->data;
+ sector_t sector = be64_to_cpu(p->sector);
+ int size = be32_to_cpu(p->blksize);
+ int err;
+
+ peer_device = conn_peer_device(connection, pi->vnr);
+ if (!peer_device)
+ return -EIO;
+ device = peer_device->device;
+
+ update_peer_seq(peer_device, be32_to_cpu(p->seq_num));
+
+ if (p->block_id == ID_SYNCER) {
+ dec_rs_pending(device);
+ drbd_rs_failed_io(device, sector, size);
+ return 0;
+ }
+
+ err = validate_req_change_req_state(device, p->block_id, sector,
+ &device->write_requests, __func__,
+ NEG_ACKED, true);
+ if (err) {
+ /* Protocol A has no P_WRITE_ACKs, but has P_NEG_ACKs.
+ The master bio might already be completed, therefore the
+ request is no longer in the collision hash. */
+ /* In Protocol B we might already have got a P_RECV_ACK
+ but then get a P_NEG_ACK afterwards. */
+ drbd_set_out_of_sync(device, sector, size);
+ }
+ return 0;
+}
+
+static int got_NegDReply(struct drbd_connection *connection, struct packet_info *pi)
+{
+ struct drbd_peer_device *peer_device;
+ struct drbd_device *device;
+ struct p_block_ack *p = pi->data;
+ sector_t sector = be64_to_cpu(p->sector);
+
+ peer_device = conn_peer_device(connection, pi->vnr);
+ if (!peer_device)
+ return -EIO;
+ device = peer_device->device;
+
+ update_peer_seq(peer_device, be32_to_cpu(p->seq_num));
+
+ drbd_err(device, "Got NegDReply; Sector %llus, len %u.\n",
+ (unsigned long long)sector, be32_to_cpu(p->blksize));
+
+ return validate_req_change_req_state(device, p->block_id, sector,
+ &device->read_requests, __func__,
+ NEG_ACKED, false);
+}
+
+static int got_NegRSDReply(struct drbd_connection *connection, struct packet_info *pi)
+{
+ struct drbd_peer_device *peer_device;
+ struct drbd_device *device;
+ sector_t sector;
+ int size;
+ struct p_block_ack *p = pi->data;
+
+ peer_device = conn_peer_device(connection, pi->vnr);
+ if (!peer_device)
+ return -EIO;
+ device = peer_device->device;
+
+ sector = be64_to_cpu(p->sector);
+ size = be32_to_cpu(p->blksize);
+
+ update_peer_seq(peer_device, be32_to_cpu(p->seq_num));
+
+ dec_rs_pending(device);
+
+ if (get_ldev_if_state(device, D_FAILED)) {
+ drbd_rs_complete_io(device, sector);
+ switch (pi->cmd) {
+ case P_NEG_RS_DREPLY:
+ drbd_rs_failed_io(device, sector, size);
+ case P_RS_CANCEL:
+ break;
+ default:
+ BUG();
+ }
+ put_ldev(device);
+ }
+
+ return 0;
+}
+
+static int got_BarrierAck(struct drbd_connection *connection, struct packet_info *pi)
+{
+ struct p_barrier_ack *p = pi->data;
+ struct drbd_peer_device *peer_device;
+ int vnr;
+
+ tl_release(connection, p->barrier, be32_to_cpu(p->set_size));
+
+ rcu_read_lock();
+ idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+ struct drbd_device *device = peer_device->device;
+
+ if (device->state.conn == C_AHEAD &&
+ atomic_read(&device->ap_in_flight) == 0 &&
+ !test_and_set_bit(AHEAD_TO_SYNC_SOURCE, &device->flags)) {
+ device->start_resync_timer.expires = jiffies + HZ;
+ add_timer(&device->start_resync_timer);
+ }
+ }
+ rcu_read_unlock();
+
+ return 0;
+}
+
+static int got_OVResult(struct drbd_connection *connection, struct packet_info *pi)
+{
+ struct drbd_peer_device *peer_device;
+ struct drbd_device *device;
+ struct p_block_ack *p = pi->data;
+ struct drbd_device_work *dw;
+ sector_t sector;
+ int size;
+
+ peer_device = conn_peer_device(connection, pi->vnr);
+ if (!peer_device)
+ return -EIO;
+ device = peer_device->device;
+
+ sector = be64_to_cpu(p->sector);
+ size = be32_to_cpu(p->blksize);
+
+ update_peer_seq(peer_device, be32_to_cpu(p->seq_num));
+
+ if (be64_to_cpu(p->block_id) == ID_OUT_OF_SYNC)
+ drbd_ov_out_of_sync_found(device, sector, size);
+ else
+ ov_out_of_sync_print(device);
+
+ if (!get_ldev(device))
+ return 0;
+
+ drbd_rs_complete_io(device, sector);
+ dec_rs_pending(device);
+
+ --device->ov_left;
+
+ /* let's advance progress step marks only for every other megabyte */
+ if ((device->ov_left & 0x200) == 0x200)
+ drbd_advance_rs_marks(device, device->ov_left);
+
+ if (device->ov_left == 0) {
+ dw = kmalloc(sizeof(*dw), GFP_NOIO);
+ if (dw) {
+ dw->w.cb = w_ov_finished;
+ dw->device = device;
+ drbd_queue_work(&peer_device->connection->sender_work, &dw->w);
+ } else {
+ drbd_err(device, "kmalloc(dw) failed.");
+ ov_out_of_sync_print(device);
+ drbd_resync_finished(device);
+ }
+ }
+ put_ldev(device);
+ return 0;
+}
+
+static int got_skip(struct drbd_connection *connection, struct packet_info *pi)
+{
+ return 0;
+}
+
+static int connection_finish_peer_reqs(struct drbd_connection *connection)
+{
+ struct drbd_peer_device *peer_device;
+ int vnr, not_empty = 0;
+
+ do {
+ clear_bit(SIGNAL_ASENDER, &connection->flags);
+ flush_signals(current);
+
+ rcu_read_lock();
+ idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+ struct drbd_device *device = peer_device->device;
+ kref_get(&device->kref);
+ rcu_read_unlock();
+ if (drbd_finish_peer_reqs(device)) {
+ kref_put(&device->kref, drbd_destroy_device);
+ return 1;
+ }
+ kref_put(&device->kref, drbd_destroy_device);
+ rcu_read_lock();
+ }
+ set_bit(SIGNAL_ASENDER, &connection->flags);
+
+ spin_lock_irq(&connection->resource->req_lock);
+ idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+ struct drbd_device *device = peer_device->device;
+ not_empty = !list_empty(&device->done_ee);
+ if (not_empty)
+ break;
+ }
+ spin_unlock_irq(&connection->resource->req_lock);
+ rcu_read_unlock();
+ } while (not_empty);
+
+ return 0;
+}
+
+struct asender_cmd {
+ size_t pkt_size;
+ int (*fn)(struct drbd_connection *connection, struct packet_info *);
+};
+
+static struct asender_cmd asender_tbl[] = {
+ [P_PING] = { 0, got_Ping },
+ [P_PING_ACK] = { 0, got_PingAck },
+ [P_RECV_ACK] = { sizeof(struct p_block_ack), got_BlockAck },
+ [P_WRITE_ACK] = { sizeof(struct p_block_ack), got_BlockAck },
+ [P_RS_WRITE_ACK] = { sizeof(struct p_block_ack), got_BlockAck },
+ [P_SUPERSEDED] = { sizeof(struct p_block_ack), got_BlockAck },
+ [P_NEG_ACK] = { sizeof(struct p_block_ack), got_NegAck },
+ [P_NEG_DREPLY] = { sizeof(struct p_block_ack), got_NegDReply },
+ [P_NEG_RS_DREPLY] = { sizeof(struct p_block_ack), got_NegRSDReply },
+ [P_OV_RESULT] = { sizeof(struct p_block_ack), got_OVResult },
+ [P_BARRIER_ACK] = { sizeof(struct p_barrier_ack), got_BarrierAck },
+ [P_STATE_CHG_REPLY] = { sizeof(struct p_req_state_reply), got_RqSReply },
+ [P_RS_IS_IN_SYNC] = { sizeof(struct p_block_ack), got_IsInSync },
+ [P_DELAY_PROBE] = { sizeof(struct p_delay_probe93), got_skip },
+ [P_RS_CANCEL] = { sizeof(struct p_block_ack), got_NegRSDReply },
+ [P_CONN_ST_CHG_REPLY]={ sizeof(struct p_req_state_reply), got_conn_RqSReply },
+ [P_RETRY_WRITE] = { sizeof(struct p_block_ack), got_BlockAck },
+};
+
+int drbd_asender(struct drbd_thread *thi)
+{
+ struct drbd_connection *connection = thi->connection;
+ struct asender_cmd *cmd = NULL;
+ struct packet_info pi;
+ int rv;
+ void *buf = connection->meta.rbuf;
+ int received = 0;
+ unsigned int header_size = drbd_header_size(connection);
+ int expect = header_size;
+ bool ping_timeout_active = false;
+ struct net_conf *nc;
+ int ping_timeo, tcp_cork, ping_int;
+ struct sched_param param = { .sched_priority = 2 };
+
+ rv = sched_setscheduler(current, SCHED_RR, &param);
+ if (rv < 0)
+ drbd_err(connection, "drbd_asender: ERROR set priority, ret=%d\n", rv);
+
+ while (get_t_state(thi) == RUNNING) {
+ drbd_thread_current_set_cpu(thi);
+
+ rcu_read_lock();
+ nc = rcu_dereference(connection->net_conf);
+ ping_timeo = nc->ping_timeo;
+ tcp_cork = nc->tcp_cork;
+ ping_int = nc->ping_int;
+ rcu_read_unlock();
+
+ if (test_and_clear_bit(SEND_PING, &connection->flags)) {
+ if (drbd_send_ping(connection)) {
+ drbd_err(connection, "drbd_send_ping has failed\n");
+ goto reconnect;
+ }
+ connection->meta.socket->sk->sk_rcvtimeo = ping_timeo * HZ / 10;
+ ping_timeout_active = true;
+ }
+
+ /* TODO: conditionally cork; it may hurt latency if we cork without
+ much to send */
+ if (tcp_cork)
+ drbd_tcp_cork(connection->meta.socket);
+ if (connection_finish_peer_reqs(connection)) {
+ drbd_err(connection, "connection_finish_peer_reqs() failed\n");
+ goto reconnect;
+ }
+ /* but unconditionally uncork unless disabled */
+ if (tcp_cork)
+ drbd_tcp_uncork(connection->meta.socket);
+
+ /* short circuit, recv_msg would return EINTR anyways. */
+ if (signal_pending(current))
+ continue;
+
+ rv = drbd_recv_short(connection->meta.socket, buf, expect-received, 0);
+ clear_bit(SIGNAL_ASENDER, &connection->flags);
+
+ flush_signals(current);
+
+ /* Note:
+ * -EINTR (on meta) we got a signal
+ * -EAGAIN (on meta) rcvtimeo expired
+ * -ECONNRESET other side closed the connection
+ * -ERESTARTSYS (on data) we got a signal
+ * rv < 0 other than above: unexpected error!
+ * rv == expected: full header or command
+ * rv < expected: "woken" by signal during receive
+ * rv == 0 : "connection shut down by peer"
+ */
+received_more:
+ if (likely(rv > 0)) {
+ received += rv;
+ buf += rv;
+ } else if (rv == 0) {
+ if (test_bit(DISCONNECT_SENT, &connection->flags)) {
+ long t;
+ rcu_read_lock();
+ t = rcu_dereference(connection->net_conf)->ping_timeo * HZ/10;
+ rcu_read_unlock();
+
+ t = wait_event_timeout(connection->ping_wait,
+ connection->cstate < C_WF_REPORT_PARAMS,
+ t);
+ if (t)
+ break;
+ }
+ drbd_err(connection, "meta connection shut down by peer.\n");
+ goto reconnect;
+ } else if (rv == -EAGAIN) {
+ /* If the data socket received something meanwhile,
+ * that is good enough: peer is still alive. */
+ if (time_after(connection->last_received,
+ jiffies - connection->meta.socket->sk->sk_rcvtimeo))
+ continue;
+ if (ping_timeout_active) {
+ drbd_err(connection, "PingAck did not arrive in time.\n");
+ goto reconnect;
+ }
+ set_bit(SEND_PING, &connection->flags);
+ continue;
+ } else if (rv == -EINTR) {
+ continue;
+ } else {
+ drbd_err(connection, "sock_recvmsg returned %d\n", rv);
+ goto reconnect;
+ }
+
+ if (received == expect && cmd == NULL) {
+ if (decode_header(connection, connection->meta.rbuf, &pi))
+ goto reconnect;
+ cmd = &asender_tbl[pi.cmd];
+ if (pi.cmd >= ARRAY_SIZE(asender_tbl) || !cmd->fn) {
+ drbd_err(connection, "Unexpected meta packet %s (0x%04x)\n",
+ cmdname(pi.cmd), pi.cmd);
+ goto disconnect;
+ }
+ expect = header_size + cmd->pkt_size;
+ if (pi.size != expect - header_size) {
+ drbd_err(connection, "Wrong packet size on meta (c: %d, l: %d)\n",
+ pi.cmd, pi.size);
+ goto reconnect;
+ }
+ }
+ if (received == expect) {
+ bool err;
+
+ err = cmd->fn(connection, &pi);
+ if (err) {
+ drbd_err(connection, "%pf failed\n", cmd->fn);
+ goto reconnect;
+ }
+
+ connection->last_received = jiffies;
+
+ if (cmd == &asender_tbl[P_PING_ACK]) {
+ /* restore idle timeout */
+ connection->meta.socket->sk->sk_rcvtimeo = ping_int * HZ;
+ ping_timeout_active = false;
+ }
+
+ buf = connection->meta.rbuf;
+ received = 0;
+ expect = header_size;
+ cmd = NULL;
+ }
+ if (test_bit(SEND_PING, &connection->flags))
+ continue;
+ rv = drbd_recv_short(connection->meta.socket, buf, expect-received, MSG_DONTWAIT);
+ if (rv > 0)
+ goto received_more;
+ }
+
+ if (0) {
+reconnect:
+ conn_request_state(connection, NS(conn, C_NETWORK_FAILURE), CS_HARD);
+ conn_md_sync(connection);
+ }
+ if (0) {
+disconnect:
+ conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD);
+ }
+ clear_bit(SIGNAL_ASENDER, &connection->flags);
+
+ drbd_info(connection, "asender terminated\n");
+
+ return 0;
+}
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
new file mode 100644
index 000000000..3907202fb
--- /dev/null
+++ b/drivers/block/drbd/drbd_req.c
@@ -0,0 +1,1638 @@
+/*
+ drbd_req.c
+
+ This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+ Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
+ Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+ Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+ drbd is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2, or (at your option)
+ any later version.
+
+ drbd is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with drbd; see the file COPYING. If not, write to
+ the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+
+ */
+
+#include <linux/module.h>
+
+#include <linux/slab.h>
+#include <linux/drbd.h>
+#include "drbd_int.h"
+#include "drbd_req.h"
+
+
+static bool drbd_may_do_local_read(struct drbd_device *device, sector_t sector, int size);
+
+/* Update disk stats at start of I/O request */
+static void _drbd_start_io_acct(struct drbd_device *device, struct drbd_request *req)
+{
+ generic_start_io_acct(bio_data_dir(req->master_bio), req->i.size >> 9,
+ &device->vdisk->part0);
+}
+
+/* Update disk stats when completing request upwards */
+static void _drbd_end_io_acct(struct drbd_device *device, struct drbd_request *req)
+{
+ generic_end_io_acct(bio_data_dir(req->master_bio),
+ &device->vdisk->part0, req->start_jif);
+}
+
+static struct drbd_request *drbd_req_new(struct drbd_device *device,
+ struct bio *bio_src)
+{
+ struct drbd_request *req;
+
+ req = mempool_alloc(drbd_request_mempool, GFP_NOIO);
+ if (!req)
+ return NULL;
+ memset(req, 0, sizeof(*req));
+
+ drbd_req_make_private_bio(req, bio_src);
+ req->rq_state = bio_data_dir(bio_src) == WRITE ? RQ_WRITE : 0;
+ req->device = device;
+ req->master_bio = bio_src;
+ req->epoch = 0;
+
+ drbd_clear_interval(&req->i);
+ req->i.sector = bio_src->bi_iter.bi_sector;
+ req->i.size = bio_src->bi_iter.bi_size;
+ req->i.local = true;
+ req->i.waiting = false;
+
+ INIT_LIST_HEAD(&req->tl_requests);
+ INIT_LIST_HEAD(&req->w.list);
+ INIT_LIST_HEAD(&req->req_pending_master_completion);
+ INIT_LIST_HEAD(&req->req_pending_local);
+
+ /* one reference to be put by __drbd_make_request */
+ atomic_set(&req->completion_ref, 1);
+ /* one kref as long as completion_ref > 0 */
+ kref_init(&req->kref);
+ return req;
+}
+
+static void drbd_remove_request_interval(struct rb_root *root,
+ struct drbd_request *req)
+{
+ struct drbd_device *device = req->device;
+ struct drbd_interval *i = &req->i;
+
+ drbd_remove_interval(root, i);
+
+ /* Wake up any processes waiting for this request to complete. */
+ if (i->waiting)
+ wake_up(&device->misc_wait);
+}
+
+void drbd_req_destroy(struct kref *kref)
+{
+ struct drbd_request *req = container_of(kref, struct drbd_request, kref);
+ struct drbd_device *device = req->device;
+ const unsigned s = req->rq_state;
+
+ if ((req->master_bio && !(s & RQ_POSTPONED)) ||
+ atomic_read(&req->completion_ref) ||
+ (s & RQ_LOCAL_PENDING) ||
+ ((s & RQ_NET_MASK) && !(s & RQ_NET_DONE))) {
+ drbd_err(device, "drbd_req_destroy: Logic BUG rq_state = 0x%x, completion_ref = %d\n",
+ s, atomic_read(&req->completion_ref));
+ return;
+ }
+
+ /* If called from mod_rq_state (expected normal case) or
+ * drbd_send_and_submit (the less likely normal path), this holds the
+ * req_lock, and req->tl_requests will typicaly be on ->transfer_log,
+ * though it may be still empty (never added to the transfer log).
+ *
+ * If called from do_retry(), we do NOT hold the req_lock, but we are
+ * still allowed to unconditionally list_del(&req->tl_requests),
+ * because it will be on a local on-stack list only. */
+ list_del_init(&req->tl_requests);
+
+ /* finally remove the request from the conflict detection
+ * respective block_id verification interval tree. */
+ if (!drbd_interval_empty(&req->i)) {
+ struct rb_root *root;
+
+ if (s & RQ_WRITE)
+ root = &device->write_requests;
+ else
+ root = &device->read_requests;
+ drbd_remove_request_interval(root, req);
+ } else if (s & (RQ_NET_MASK & ~RQ_NET_DONE) && req->i.size != 0)
+ drbd_err(device, "drbd_req_destroy: Logic BUG: interval empty, but: rq_state=0x%x, sect=%llu, size=%u\n",
+ s, (unsigned long long)req->i.sector, req->i.size);
+
+ /* if it was a write, we may have to set the corresponding
+ * bit(s) out-of-sync first. If it had a local part, we need to
+ * release the reference to the activity log. */
+ if (s & RQ_WRITE) {
+ /* Set out-of-sync unless both OK flags are set
+ * (local only or remote failed).
+ * Other places where we set out-of-sync:
+ * READ with local io-error */
+
+ /* There is a special case:
+ * we may notice late that IO was suspended,
+ * and postpone, or schedule for retry, a write,
+ * before it even was submitted or sent.
+ * In that case we do not want to touch the bitmap at all.
+ */
+ if ((s & (RQ_POSTPONED|RQ_LOCAL_MASK|RQ_NET_MASK)) != RQ_POSTPONED) {
+ if (!(s & RQ_NET_OK) || !(s & RQ_LOCAL_OK))
+ drbd_set_out_of_sync(device, req->i.sector, req->i.size);
+
+ if ((s & RQ_NET_OK) && (s & RQ_LOCAL_OK) && (s & RQ_NET_SIS))
+ drbd_set_in_sync(device, req->i.sector, req->i.size);
+ }
+
+ /* one might be tempted to move the drbd_al_complete_io
+ * to the local io completion callback drbd_request_endio.
+ * but, if this was a mirror write, we may only
+ * drbd_al_complete_io after this is RQ_NET_DONE,
+ * otherwise the extent could be dropped from the al
+ * before it has actually been written on the peer.
+ * if we crash before our peer knows about the request,
+ * but after the extent has been dropped from the al,
+ * we would forget to resync the corresponding extent.
+ */
+ if (s & RQ_IN_ACT_LOG) {
+ if (get_ldev_if_state(device, D_FAILED)) {
+ drbd_al_complete_io(device, &req->i);
+ put_ldev(device);
+ } else if (__ratelimit(&drbd_ratelimit_state)) {
+ drbd_warn(device, "Should have called drbd_al_complete_io(, %llu, %u), "
+ "but my Disk seems to have failed :(\n",
+ (unsigned long long) req->i.sector, req->i.size);
+ }
+ }
+ }
+
+ mempool_free(req, drbd_request_mempool);
+}
+
+static void wake_all_senders(struct drbd_connection *connection)
+{
+ wake_up(&connection->sender_work.q_wait);
+}
+
+/* must hold resource->req_lock */
+void start_new_tl_epoch(struct drbd_connection *connection)
+{
+ /* no point closing an epoch, if it is empty, anyways. */
+ if (connection->current_tle_writes == 0)
+ return;
+
+ connection->current_tle_writes = 0;
+ atomic_inc(&connection->current_tle_nr);
+ wake_all_senders(connection);
+}
+
+void complete_master_bio(struct drbd_device *device,
+ struct bio_and_error *m)
+{
+ bio_endio(m->bio, m->error);
+ dec_ap_bio(device);
+}
+
+
+/* Helper for __req_mod().
+ * Set m->bio to the master bio, if it is fit to be completed,
+ * or leave it alone (it is initialized to NULL in __req_mod),
+ * if it has already been completed, or cannot be completed yet.
+ * If m->bio is set, the error status to be returned is placed in m->error.
+ */
+static
+void drbd_req_complete(struct drbd_request *req, struct bio_and_error *m)
+{
+ const unsigned s = req->rq_state;
+ struct drbd_device *device = req->device;
+ int rw;
+ int error, ok;
+
+ /* we must not complete the master bio, while it is
+ * still being processed by _drbd_send_zc_bio (drbd_send_dblock)
+ * not yet acknowledged by the peer
+ * not yet completed by the local io subsystem
+ * these flags may get cleared in any order by
+ * the worker,
+ * the receiver,
+ * the bio_endio completion callbacks.
+ */
+ if ((s & RQ_LOCAL_PENDING && !(s & RQ_LOCAL_ABORTED)) ||
+ (s & RQ_NET_QUEUED) || (s & RQ_NET_PENDING) ||
+ (s & RQ_COMPLETION_SUSP)) {
+ drbd_err(device, "drbd_req_complete: Logic BUG rq_state = 0x%x\n", s);
+ return;
+ }
+
+ if (!req->master_bio) {
+ drbd_err(device, "drbd_req_complete: Logic BUG, master_bio == NULL!\n");
+ return;
+ }
+
+ rw = bio_rw(req->master_bio);
+
+ /*
+ * figure out whether to report success or failure.
+ *
+ * report success when at least one of the operations succeeded.
+ * or, to put the other way,
+ * only report failure, when both operations failed.
+ *
+ * what to do about the failures is handled elsewhere.
+ * what we need to do here is just: complete the master_bio.
+ *
+ * local completion error, if any, has been stored as ERR_PTR
+ * in private_bio within drbd_request_endio.
+ */
+ ok = (s & RQ_LOCAL_OK) || (s & RQ_NET_OK);
+ error = PTR_ERR(req->private_bio);
+
+ /* Before we can signal completion to the upper layers,
+ * we may need to close the current transfer log epoch.
+ * We are within the request lock, so we can simply compare
+ * the request epoch number with the current transfer log
+ * epoch number. If they match, increase the current_tle_nr,
+ * and reset the transfer log epoch write_cnt.
+ */
+ if (rw == WRITE &&
+ req->epoch == atomic_read(&first_peer_device(device)->connection->current_tle_nr))
+ start_new_tl_epoch(first_peer_device(device)->connection);
+
+ /* Update disk stats */
+ _drbd_end_io_acct(device, req);
+
+ /* If READ failed,
+ * have it be pushed back to the retry work queue,
+ * so it will re-enter __drbd_make_request(),
+ * and be re-assigned to a suitable local or remote path,
+ * or failed if we do not have access to good data anymore.
+ *
+ * Unless it was failed early by __drbd_make_request(),
+ * because no path was available, in which case
+ * it was not even added to the transfer_log.
+ *
+ * READA may fail, and will not be retried.
+ *
+ * WRITE should have used all available paths already.
+ */
+ if (!ok && rw == READ && !list_empty(&req->tl_requests))
+ req->rq_state |= RQ_POSTPONED;
+
+ if (!(req->rq_state & RQ_POSTPONED)) {
+ m->error = ok ? 0 : (error ?: -EIO);
+ m->bio = req->master_bio;
+ req->master_bio = NULL;
+ /* We leave it in the tree, to be able to verify later
+ * write-acks in protocol != C during resync.
+ * But we mark it as "complete", so it won't be counted as
+ * conflict in a multi-primary setup. */
+ req->i.completed = true;
+ }
+
+ if (req->i.waiting)
+ wake_up(&device->misc_wait);
+
+ /* Either we are about to complete to upper layers,
+ * or we will restart this request.
+ * In either case, the request object will be destroyed soon,
+ * so better remove it from all lists. */
+ list_del_init(&req->req_pending_master_completion);
+}
+
+/* still holds resource->req_lock */
+static int drbd_req_put_completion_ref(struct drbd_request *req, struct bio_and_error *m, int put)
+{
+ struct drbd_device *device = req->device;
+ D_ASSERT(device, m || (req->rq_state & RQ_POSTPONED));
+
+ if (!atomic_sub_and_test(put, &req->completion_ref))
+ return 0;
+
+ drbd_req_complete(req, m);
+
+ if (req->rq_state & RQ_POSTPONED) {
+ /* don't destroy the req object just yet,
+ * but queue it for retry */
+ drbd_restart_request(req);
+ return 0;
+ }
+
+ return 1;
+}
+
+static void set_if_null_req_next(struct drbd_peer_device *peer_device, struct drbd_request *req)
+{
+ struct drbd_connection *connection = peer_device ? peer_device->connection : NULL;
+ if (!connection)
+ return;
+ if (connection->req_next == NULL)
+ connection->req_next = req;
+}
+
+static void advance_conn_req_next(struct drbd_peer_device *peer_device, struct drbd_request *req)
+{
+ struct drbd_connection *connection = peer_device ? peer_device->connection : NULL;
+ if (!connection)
+ return;
+ if (connection->req_next != req)
+ return;
+ list_for_each_entry_continue(req, &connection->transfer_log, tl_requests) {
+ const unsigned s = req->rq_state;
+ if (s & RQ_NET_QUEUED)
+ break;
+ }
+ if (&req->tl_requests == &connection->transfer_log)
+ req = NULL;
+ connection->req_next = req;
+}
+
+static void set_if_null_req_ack_pending(struct drbd_peer_device *peer_device, struct drbd_request *req)
+{
+ struct drbd_connection *connection = peer_device ? peer_device->connection : NULL;
+ if (!connection)
+ return;
+ if (connection->req_ack_pending == NULL)
+ connection->req_ack_pending = req;
+}
+
+static void advance_conn_req_ack_pending(struct drbd_peer_device *peer_device, struct drbd_request *req)
+{
+ struct drbd_connection *connection = peer_device ? peer_device->connection : NULL;
+ if (!connection)
+ return;
+ if (connection->req_ack_pending != req)
+ return;
+ list_for_each_entry_continue(req, &connection->transfer_log, tl_requests) {
+ const unsigned s = req->rq_state;
+ if ((s & RQ_NET_SENT) && (s & RQ_NET_PENDING))
+ break;
+ }
+ if (&req->tl_requests == &connection->transfer_log)
+ req = NULL;
+ connection->req_ack_pending = req;
+}
+
+static void set_if_null_req_not_net_done(struct drbd_peer_device *peer_device, struct drbd_request *req)
+{
+ struct drbd_connection *connection = peer_device ? peer_device->connection : NULL;
+ if (!connection)
+ return;
+ if (connection->req_not_net_done == NULL)
+ connection->req_not_net_done = req;
+}
+
+static void advance_conn_req_not_net_done(struct drbd_peer_device *peer_device, struct drbd_request *req)
+{
+ struct drbd_connection *connection = peer_device ? peer_device->connection : NULL;
+ if (!connection)
+ return;
+ if (connection->req_not_net_done != req)
+ return;
+ list_for_each_entry_continue(req, &connection->transfer_log, tl_requests) {
+ const unsigned s = req->rq_state;
+ if ((s & RQ_NET_SENT) && !(s & RQ_NET_DONE))
+ break;
+ }
+ if (&req->tl_requests == &connection->transfer_log)
+ req = NULL;
+ connection->req_not_net_done = req;
+}
+
+/* I'd like this to be the only place that manipulates
+ * req->completion_ref and req->kref. */
+static void mod_rq_state(struct drbd_request *req, struct bio_and_error *m,
+ int clear, int set)
+{
+ struct drbd_device *device = req->device;
+ struct drbd_peer_device *peer_device = first_peer_device(device);
+ unsigned s = req->rq_state;
+ int c_put = 0;
+ int k_put = 0;
+
+ if (drbd_suspended(device) && !((s | clear) & RQ_COMPLETION_SUSP))
+ set |= RQ_COMPLETION_SUSP;
+
+ /* apply */
+
+ req->rq_state &= ~clear;
+ req->rq_state |= set;
+
+ /* no change? */
+ if (req->rq_state == s)
+ return;
+
+ /* intent: get references */
+
+ if (!(s & RQ_LOCAL_PENDING) && (set & RQ_LOCAL_PENDING))
+ atomic_inc(&req->completion_ref);
+
+ if (!(s & RQ_NET_PENDING) && (set & RQ_NET_PENDING)) {
+ inc_ap_pending(device);
+ atomic_inc(&req->completion_ref);
+ }
+
+ if (!(s & RQ_NET_QUEUED) && (set & RQ_NET_QUEUED)) {
+ atomic_inc(&req->completion_ref);
+ set_if_null_req_next(peer_device, req);
+ }
+
+ if (!(s & RQ_EXP_BARR_ACK) && (set & RQ_EXP_BARR_ACK))
+ kref_get(&req->kref); /* wait for the DONE */
+
+ if (!(s & RQ_NET_SENT) && (set & RQ_NET_SENT)) {
+ /* potentially already completed in the asender thread */
+ if (!(s & RQ_NET_DONE)) {
+ atomic_add(req->i.size >> 9, &device->ap_in_flight);
+ set_if_null_req_not_net_done(peer_device, req);
+ }
+ if (s & RQ_NET_PENDING)
+ set_if_null_req_ack_pending(peer_device, req);
+ }
+
+ if (!(s & RQ_COMPLETION_SUSP) && (set & RQ_COMPLETION_SUSP))
+ atomic_inc(&req->completion_ref);
+
+ /* progress: put references */
+
+ if ((s & RQ_COMPLETION_SUSP) && (clear & RQ_COMPLETION_SUSP))
+ ++c_put;
+
+ if (!(s & RQ_LOCAL_ABORTED) && (set & RQ_LOCAL_ABORTED)) {
+ D_ASSERT(device, req->rq_state & RQ_LOCAL_PENDING);
+ /* local completion may still come in later,
+ * we need to keep the req object around. */
+ kref_get(&req->kref);
+ ++c_put;
+ }
+
+ if ((s & RQ_LOCAL_PENDING) && (clear & RQ_LOCAL_PENDING)) {
+ if (req->rq_state & RQ_LOCAL_ABORTED)
+ ++k_put;
+ else
+ ++c_put;
+ list_del_init(&req->req_pending_local);
+ }
+
+ if ((s & RQ_NET_PENDING) && (clear & RQ_NET_PENDING)) {
+ dec_ap_pending(device);
+ ++c_put;
+ req->acked_jif = jiffies;
+ advance_conn_req_ack_pending(peer_device, req);
+ }
+
+ if ((s & RQ_NET_QUEUED) && (clear & RQ_NET_QUEUED)) {
+ ++c_put;
+ advance_conn_req_next(peer_device, req);
+ }
+
+ if (!(s & RQ_NET_DONE) && (set & RQ_NET_DONE)) {
+ if (s & RQ_NET_SENT)
+ atomic_sub(req->i.size >> 9, &device->ap_in_flight);
+ if (s & RQ_EXP_BARR_ACK)
+ ++k_put;
+ req->net_done_jif = jiffies;
+
+ /* in ahead/behind mode, or just in case,
+ * before we finally destroy this request,
+ * the caching pointers must not reference it anymore */
+ advance_conn_req_next(peer_device, req);
+ advance_conn_req_ack_pending(peer_device, req);
+ advance_conn_req_not_net_done(peer_device, req);
+ }
+
+ /* potentially complete and destroy */
+
+ if (k_put || c_put) {
+ /* Completion does it's own kref_put. If we are going to
+ * kref_sub below, we need req to be still around then. */
+ int at_least = k_put + !!c_put;
+ int refcount = atomic_read(&req->kref.refcount);
+ if (refcount < at_least)
+ drbd_err(device,
+ "mod_rq_state: Logic BUG: %x -> %x: refcount = %d, should be >= %d\n",
+ s, req->rq_state, refcount, at_least);
+ }
+
+ /* If we made progress, retry conflicting peer requests, if any. */
+ if (req->i.waiting)
+ wake_up(&device->misc_wait);
+
+ if (c_put)
+ k_put += drbd_req_put_completion_ref(req, m, c_put);
+ if (k_put)
+ kref_sub(&req->kref, k_put, drbd_req_destroy);
+}
+
+static void drbd_report_io_error(struct drbd_device *device, struct drbd_request *req)
+{
+ char b[BDEVNAME_SIZE];
+
+ if (!__ratelimit(&drbd_ratelimit_state))
+ return;
+
+ drbd_warn(device, "local %s IO error sector %llu+%u on %s\n",
+ (req->rq_state & RQ_WRITE) ? "WRITE" : "READ",
+ (unsigned long long)req->i.sector,
+ req->i.size >> 9,
+ bdevname(device->ldev->backing_bdev, b));
+}
+
+/* Helper for HANDED_OVER_TO_NETWORK.
+ * Is this a protocol A write (neither WRITE_ACK nor RECEIVE_ACK expected)?
+ * Is it also still "PENDING"?
+ * --> If so, clear PENDING and set NET_OK below.
+ * If it is a protocol A write, but not RQ_PENDING anymore, neg-ack was faster
+ * (and we must not set RQ_NET_OK) */
+static inline bool is_pending_write_protocol_A(struct drbd_request *req)
+{
+ return (req->rq_state &
+ (RQ_WRITE|RQ_NET_PENDING|RQ_EXP_WRITE_ACK|RQ_EXP_RECEIVE_ACK))
+ == (RQ_WRITE|RQ_NET_PENDING);
+}
+
+/* obviously this could be coded as many single functions
+ * instead of one huge switch,
+ * or by putting the code directly in the respective locations
+ * (as it has been before).
+ *
+ * but having it this way
+ * enforces that it is all in this one place, where it is easier to audit,
+ * it makes it obvious that whatever "event" "happens" to a request should
+ * happen "atomically" within the req_lock,
+ * and it enforces that we have to think in a very structured manner
+ * about the "events" that may happen to a request during its life time ...
+ */
+int __req_mod(struct drbd_request *req, enum drbd_req_event what,
+ struct bio_and_error *m)
+{
+ struct drbd_device *const device = req->device;
+ struct drbd_peer_device *const peer_device = first_peer_device(device);
+ struct drbd_connection *const connection = peer_device ? peer_device->connection : NULL;
+ struct net_conf *nc;
+ int p, rv = 0;
+
+ if (m)
+ m->bio = NULL;
+
+ switch (what) {
+ default:
+ drbd_err(device, "LOGIC BUG in %s:%u\n", __FILE__ , __LINE__);
+ break;
+
+ /* does not happen...
+ * initialization done in drbd_req_new
+ case CREATED:
+ break;
+ */
+
+ case TO_BE_SENT: /* via network */
+ /* reached via __drbd_make_request
+ * and from w_read_retry_remote */
+ D_ASSERT(device, !(req->rq_state & RQ_NET_MASK));
+ rcu_read_lock();
+ nc = rcu_dereference(connection->net_conf);
+ p = nc->wire_protocol;
+ rcu_read_unlock();
+ req->rq_state |=
+ p == DRBD_PROT_C ? RQ_EXP_WRITE_ACK :
+ p == DRBD_PROT_B ? RQ_EXP_RECEIVE_ACK : 0;
+ mod_rq_state(req, m, 0, RQ_NET_PENDING);
+ break;
+
+ case TO_BE_SUBMITTED: /* locally */
+ /* reached via __drbd_make_request */
+ D_ASSERT(device, !(req->rq_state & RQ_LOCAL_MASK));
+ mod_rq_state(req, m, 0, RQ_LOCAL_PENDING);
+ break;
+
+ case COMPLETED_OK:
+ if (req->rq_state & RQ_WRITE)
+ device->writ_cnt += req->i.size >> 9;
+ else
+ device->read_cnt += req->i.size >> 9;
+
+ mod_rq_state(req, m, RQ_LOCAL_PENDING,
+ RQ_LOCAL_COMPLETED|RQ_LOCAL_OK);
+ break;
+
+ case ABORT_DISK_IO:
+ mod_rq_state(req, m, 0, RQ_LOCAL_ABORTED);
+ break;
+
+ case WRITE_COMPLETED_WITH_ERROR:
+ drbd_report_io_error(device, req);
+ __drbd_chk_io_error(device, DRBD_WRITE_ERROR);
+ mod_rq_state(req, m, RQ_LOCAL_PENDING, RQ_LOCAL_COMPLETED);
+ break;
+
+ case READ_COMPLETED_WITH_ERROR:
+ drbd_set_out_of_sync(device, req->i.sector, req->i.size);
+ drbd_report_io_error(device, req);
+ __drbd_chk_io_error(device, DRBD_READ_ERROR);
+ /* fall through. */
+ case READ_AHEAD_COMPLETED_WITH_ERROR:
+ /* it is legal to fail READA, no __drbd_chk_io_error in that case. */
+ mod_rq_state(req, m, RQ_LOCAL_PENDING, RQ_LOCAL_COMPLETED);
+ break;
+
+ case DISCARD_COMPLETED_NOTSUPP:
+ case DISCARD_COMPLETED_WITH_ERROR:
+ /* I'd rather not detach from local disk just because it
+ * failed a REQ_DISCARD. */
+ mod_rq_state(req, m, RQ_LOCAL_PENDING, RQ_LOCAL_COMPLETED);
+ break;
+
+ case QUEUE_FOR_NET_READ:
+ /* READ or READA, and
+ * no local disk,
+ * or target area marked as invalid,
+ * or just got an io-error. */
+ /* from __drbd_make_request
+ * or from bio_endio during read io-error recovery */
+
+ /* So we can verify the handle in the answer packet.
+ * Corresponding drbd_remove_request_interval is in
+ * drbd_req_complete() */
+ D_ASSERT(device, drbd_interval_empty(&req->i));
+ drbd_insert_interval(&device->read_requests, &req->i);
+
+ set_bit(UNPLUG_REMOTE, &device->flags);
+
+ D_ASSERT(device, req->rq_state & RQ_NET_PENDING);
+ D_ASSERT(device, (req->rq_state & RQ_LOCAL_MASK) == 0);
+ mod_rq_state(req, m, 0, RQ_NET_QUEUED);
+ req->w.cb = w_send_read_req;
+ drbd_queue_work(&connection->sender_work,
+ &req->w);
+ break;
+
+ case QUEUE_FOR_NET_WRITE:
+ /* assert something? */
+ /* from __drbd_make_request only */
+
+ /* Corresponding drbd_remove_request_interval is in
+ * drbd_req_complete() */
+ D_ASSERT(device, drbd_interval_empty(&req->i));
+ drbd_insert_interval(&device->write_requests, &req->i);
+
+ /* NOTE
+ * In case the req ended up on the transfer log before being
+ * queued on the worker, it could lead to this request being
+ * missed during cleanup after connection loss.
+ * So we have to do both operations here,
+ * within the same lock that protects the transfer log.
+ *
+ * _req_add_to_epoch(req); this has to be after the
+ * _maybe_start_new_epoch(req); which happened in
+ * __drbd_make_request, because we now may set the bit
+ * again ourselves to close the current epoch.
+ *
+ * Add req to the (now) current epoch (barrier). */
+
+ /* otherwise we may lose an unplug, which may cause some remote
+ * io-scheduler timeout to expire, increasing maximum latency,
+ * hurting performance. */
+ set_bit(UNPLUG_REMOTE, &device->flags);
+
+ /* queue work item to send data */
+ D_ASSERT(device, req->rq_state & RQ_NET_PENDING);
+ mod_rq_state(req, m, 0, RQ_NET_QUEUED|RQ_EXP_BARR_ACK);
+ req->w.cb = w_send_dblock;
+ drbd_queue_work(&connection->sender_work,
+ &req->w);
+
+ /* close the epoch, in case it outgrew the limit */
+ rcu_read_lock();
+ nc = rcu_dereference(connection->net_conf);
+ p = nc->max_epoch_size;
+ rcu_read_unlock();
+ if (connection->current_tle_writes >= p)
+ start_new_tl_epoch(connection);
+
+ break;
+
+ case QUEUE_FOR_SEND_OOS:
+ mod_rq_state(req, m, 0, RQ_NET_QUEUED);
+ req->w.cb = w_send_out_of_sync;
+ drbd_queue_work(&connection->sender_work,
+ &req->w);
+ break;
+
+ case READ_RETRY_REMOTE_CANCELED:
+ case SEND_CANCELED:
+ case SEND_FAILED:
+ /* real cleanup will be done from tl_clear. just update flags
+ * so it is no longer marked as on the worker queue */
+ mod_rq_state(req, m, RQ_NET_QUEUED, 0);
+ break;
+
+ case HANDED_OVER_TO_NETWORK:
+ /* assert something? */
+ if (is_pending_write_protocol_A(req))
+ /* this is what is dangerous about protocol A:
+ * pretend it was successfully written on the peer. */
+ mod_rq_state(req, m, RQ_NET_QUEUED|RQ_NET_PENDING,
+ RQ_NET_SENT|RQ_NET_OK);
+ else
+ mod_rq_state(req, m, RQ_NET_QUEUED, RQ_NET_SENT);
+ /* It is still not yet RQ_NET_DONE until the
+ * corresponding epoch barrier got acked as well,
+ * so we know what to dirty on connection loss. */
+ break;
+
+ case OOS_HANDED_TO_NETWORK:
+ /* Was not set PENDING, no longer QUEUED, so is now DONE
+ * as far as this connection is concerned. */
+ mod_rq_state(req, m, RQ_NET_QUEUED, RQ_NET_DONE);
+ break;
+
+ case CONNECTION_LOST_WHILE_PENDING:
+ /* transfer log cleanup after connection loss */
+ mod_rq_state(req, m,
+ RQ_NET_OK|RQ_NET_PENDING|RQ_COMPLETION_SUSP,
+ RQ_NET_DONE);
+ break;
+
+ case CONFLICT_RESOLVED:
+ /* for superseded conflicting writes of multiple primaries,
+ * there is no need to keep anything in the tl, potential
+ * node crashes are covered by the activity log.
+ *
+ * If this request had been marked as RQ_POSTPONED before,
+ * it will actually not be completed, but "restarted",
+ * resubmitted from the retry worker context. */
+ D_ASSERT(device, req->rq_state & RQ_NET_PENDING);
+ D_ASSERT(device, req->rq_state & RQ_EXP_WRITE_ACK);
+ mod_rq_state(req, m, RQ_NET_PENDING, RQ_NET_DONE|RQ_NET_OK);
+ break;
+
+ case WRITE_ACKED_BY_PEER_AND_SIS:
+ req->rq_state |= RQ_NET_SIS;
+ case WRITE_ACKED_BY_PEER:
+ /* Normal operation protocol C: successfully written on peer.
+ * During resync, even in protocol != C,
+ * we requested an explicit write ack anyways.
+ * Which means we cannot even assert anything here.
+ * Nothing more to do here.
+ * We want to keep the tl in place for all protocols, to cater
+ * for volatile write-back caches on lower level devices. */
+ goto ack_common;
+ case RECV_ACKED_BY_PEER:
+ D_ASSERT(device, req->rq_state & RQ_EXP_RECEIVE_ACK);
+ /* protocol B; pretends to be successfully written on peer.
+ * see also notes above in HANDED_OVER_TO_NETWORK about
+ * protocol != C */
+ ack_common:
+ mod_rq_state(req, m, RQ_NET_PENDING, RQ_NET_OK);
+ break;
+
+ case POSTPONE_WRITE:
+ D_ASSERT(device, req->rq_state & RQ_EXP_WRITE_ACK);
+ /* If this node has already detected the write conflict, the
+ * worker will be waiting on misc_wait. Wake it up once this
+ * request has completed locally.
+ */
+ D_ASSERT(device, req->rq_state & RQ_NET_PENDING);
+ req->rq_state |= RQ_POSTPONED;
+ if (req->i.waiting)
+ wake_up(&device->misc_wait);
+ /* Do not clear RQ_NET_PENDING. This request will make further
+ * progress via restart_conflicting_writes() or
+ * fail_postponed_requests(). Hopefully. */
+ break;
+
+ case NEG_ACKED:
+ mod_rq_state(req, m, RQ_NET_OK|RQ_NET_PENDING, 0);
+ break;
+
+ case FAIL_FROZEN_DISK_IO:
+ if (!(req->rq_state & RQ_LOCAL_COMPLETED))
+ break;
+ mod_rq_state(req, m, RQ_COMPLETION_SUSP, 0);
+ break;
+
+ case RESTART_FROZEN_DISK_IO:
+ if (!(req->rq_state & RQ_LOCAL_COMPLETED))
+ break;
+
+ mod_rq_state(req, m,
+ RQ_COMPLETION_SUSP|RQ_LOCAL_COMPLETED,
+ RQ_LOCAL_PENDING);
+
+ rv = MR_READ;
+ if (bio_data_dir(req->master_bio) == WRITE)
+ rv = MR_WRITE;
+
+ get_ldev(device); /* always succeeds in this call path */
+ req->w.cb = w_restart_disk_io;
+ drbd_queue_work(&connection->sender_work,
+ &req->w);
+ break;
+
+ case RESEND:
+ /* Simply complete (local only) READs. */
+ if (!(req->rq_state & RQ_WRITE) && !req->w.cb) {
+ mod_rq_state(req, m, RQ_COMPLETION_SUSP, 0);
+ break;
+ }
+
+ /* If RQ_NET_OK is already set, we got a P_WRITE_ACK or P_RECV_ACK
+ before the connection loss (B&C only); only P_BARRIER_ACK
+ (or the local completion?) was missing when we suspended.
+ Throwing them out of the TL here by pretending we got a BARRIER_ACK.
+ During connection handshake, we ensure that the peer was not rebooted. */
+ if (!(req->rq_state & RQ_NET_OK)) {
+ /* FIXME could this possibly be a req->dw.cb == w_send_out_of_sync?
+ * in that case we must not set RQ_NET_PENDING. */
+
+ mod_rq_state(req, m, RQ_COMPLETION_SUSP, RQ_NET_QUEUED|RQ_NET_PENDING);
+ if (req->w.cb) {
+ /* w.cb expected to be w_send_dblock, or w_send_read_req */
+ drbd_queue_work(&connection->sender_work,
+ &req->w);
+ rv = req->rq_state & RQ_WRITE ? MR_WRITE : MR_READ;
+ } /* else: FIXME can this happen? */
+ break;
+ }
+ /* else, fall through to BARRIER_ACKED */
+
+ case BARRIER_ACKED:
+ /* barrier ack for READ requests does not make sense */
+ if (!(req->rq_state & RQ_WRITE))
+ break;
+
+ if (req->rq_state & RQ_NET_PENDING) {
+ /* barrier came in before all requests were acked.
+ * this is bad, because if the connection is lost now,
+ * we won't be able to clean them up... */
+ drbd_err(device, "FIXME (BARRIER_ACKED but pending)\n");
+ }
+ /* Allowed to complete requests, even while suspended.
+ * As this is called for all requests within a matching epoch,
+ * we need to filter, and only set RQ_NET_DONE for those that
+ * have actually been on the wire. */
+ mod_rq_state(req, m, RQ_COMPLETION_SUSP,
+ (req->rq_state & RQ_NET_MASK) ? RQ_NET_DONE : 0);
+ break;
+
+ case DATA_RECEIVED:
+ D_ASSERT(device, req->rq_state & RQ_NET_PENDING);
+ mod_rq_state(req, m, RQ_NET_PENDING, RQ_NET_OK|RQ_NET_DONE);
+ break;
+
+ case QUEUE_AS_DRBD_BARRIER:
+ start_new_tl_epoch(connection);
+ mod_rq_state(req, m, 0, RQ_NET_OK|RQ_NET_DONE);
+ break;
+ };
+
+ return rv;
+}
+
+/* we may do a local read if:
+ * - we are consistent (of course),
+ * - or we are generally inconsistent,
+ * BUT we are still/already IN SYNC for this area.
+ * since size may be bigger than BM_BLOCK_SIZE,
+ * we may need to check several bits.
+ */
+static bool drbd_may_do_local_read(struct drbd_device *device, sector_t sector, int size)
+{
+ unsigned long sbnr, ebnr;
+ sector_t esector, nr_sectors;
+
+ if (device->state.disk == D_UP_TO_DATE)
+ return true;
+ if (device->state.disk != D_INCONSISTENT)
+ return false;
+ esector = sector + (size >> 9) - 1;
+ nr_sectors = drbd_get_capacity(device->this_bdev);
+ D_ASSERT(device, sector < nr_sectors);
+ D_ASSERT(device, esector < nr_sectors);
+
+ sbnr = BM_SECT_TO_BIT(sector);
+ ebnr = BM_SECT_TO_BIT(esector);
+
+ return drbd_bm_count_bits(device, sbnr, ebnr) == 0;
+}
+
+static bool remote_due_to_read_balancing(struct drbd_device *device, sector_t sector,
+ enum drbd_read_balancing rbm)
+{
+ struct backing_dev_info *bdi;
+ int stripe_shift;
+
+ switch (rbm) {
+ case RB_CONGESTED_REMOTE:
+ bdi = &device->ldev->backing_bdev->bd_disk->queue->backing_dev_info;
+ return bdi_read_congested(bdi);
+ case RB_LEAST_PENDING:
+ return atomic_read(&device->local_cnt) >
+ atomic_read(&device->ap_pending_cnt) + atomic_read(&device->rs_pending_cnt);
+ case RB_32K_STRIPING: /* stripe_shift = 15 */
+ case RB_64K_STRIPING:
+ case RB_128K_STRIPING:
+ case RB_256K_STRIPING:
+ case RB_512K_STRIPING:
+ case RB_1M_STRIPING: /* stripe_shift = 20 */
+ stripe_shift = (rbm - RB_32K_STRIPING + 15);
+ return (sector >> (stripe_shift - 9)) & 1;
+ case RB_ROUND_ROBIN:
+ return test_and_change_bit(READ_BALANCE_RR, &device->flags);
+ case RB_PREFER_REMOTE:
+ return true;
+ case RB_PREFER_LOCAL:
+ default:
+ return false;
+ }
+}
+
+/*
+ * complete_conflicting_writes - wait for any conflicting write requests
+ *
+ * The write_requests tree contains all active write requests which we
+ * currently know about. Wait for any requests to complete which conflict with
+ * the new one.
+ *
+ * Only way out: remove the conflicting intervals from the tree.
+ */
+static void complete_conflicting_writes(struct drbd_request *req)
+{
+ DEFINE_WAIT(wait);
+ struct drbd_device *device = req->device;
+ struct drbd_interval *i;
+ sector_t sector = req->i.sector;
+ int size = req->i.size;
+
+ i = drbd_find_overlap(&device->write_requests, sector, size);
+ if (!i)
+ return;
+
+ for (;;) {
+ prepare_to_wait(&device->misc_wait, &wait, TASK_UNINTERRUPTIBLE);
+ i = drbd_find_overlap(&device->write_requests, sector, size);
+ if (!i)
+ break;
+ /* Indicate to wake up device->misc_wait on progress. */
+ i->waiting = true;
+ spin_unlock_irq(&device->resource->req_lock);
+ schedule();
+ spin_lock_irq(&device->resource->req_lock);
+ }
+ finish_wait(&device->misc_wait, &wait);
+}
+
+/* called within req_lock and rcu_read_lock() */
+static void maybe_pull_ahead(struct drbd_device *device)
+{
+ struct drbd_connection *connection = first_peer_device(device)->connection;
+ struct net_conf *nc;
+ bool congested = false;
+ enum drbd_on_congestion on_congestion;
+
+ rcu_read_lock();
+ nc = rcu_dereference(connection->net_conf);
+ on_congestion = nc ? nc->on_congestion : OC_BLOCK;
+ rcu_read_unlock();
+ if (on_congestion == OC_BLOCK ||
+ connection->agreed_pro_version < 96)
+ return;
+
+ if (on_congestion == OC_PULL_AHEAD && device->state.conn == C_AHEAD)
+ return; /* nothing to do ... */
+
+ /* If I don't even have good local storage, we can not reasonably try
+ * to pull ahead of the peer. We also need the local reference to make
+ * sure device->act_log is there.
+ */
+ if (!get_ldev_if_state(device, D_UP_TO_DATE))
+ return;
+
+ if (nc->cong_fill &&
+ atomic_read(&device->ap_in_flight) >= nc->cong_fill) {
+ drbd_info(device, "Congestion-fill threshold reached\n");
+ congested = true;
+ }
+
+ if (device->act_log->used >= nc->cong_extents) {
+ drbd_info(device, "Congestion-extents threshold reached\n");
+ congested = true;
+ }
+
+ if (congested) {
+ /* start a new epoch for non-mirrored writes */
+ start_new_tl_epoch(first_peer_device(device)->connection);
+
+ if (on_congestion == OC_PULL_AHEAD)
+ _drbd_set_state(_NS(device, conn, C_AHEAD), 0, NULL);
+ else /*nc->on_congestion == OC_DISCONNECT */
+ _drbd_set_state(_NS(device, conn, C_DISCONNECTING), 0, NULL);
+ }
+ put_ldev(device);
+}
+
+/* If this returns false, and req->private_bio is still set,
+ * this should be submitted locally.
+ *
+ * If it returns false, but req->private_bio is not set,
+ * we do not have access to good data :(
+ *
+ * Otherwise, this destroys req->private_bio, if any,
+ * and returns true.
+ */
+static bool do_remote_read(struct drbd_request *req)
+{
+ struct drbd_device *device = req->device;
+ enum drbd_read_balancing rbm;
+
+ if (req->private_bio) {
+ if (!drbd_may_do_local_read(device,
+ req->i.sector, req->i.size)) {
+ bio_put(req->private_bio);
+ req->private_bio = NULL;
+ put_ldev(device);
+ }
+ }
+
+ if (device->state.pdsk != D_UP_TO_DATE)
+ return false;
+
+ if (req->private_bio == NULL)
+ return true;
+
+ /* TODO: improve read balancing decisions, take into account drbd
+ * protocol, pending requests etc. */
+
+ rcu_read_lock();
+ rbm = rcu_dereference(device->ldev->disk_conf)->read_balancing;
+ rcu_read_unlock();
+
+ if (rbm == RB_PREFER_LOCAL && req->private_bio)
+ return false; /* submit locally */
+
+ if (remote_due_to_read_balancing(device, req->i.sector, rbm)) {
+ if (req->private_bio) {
+ bio_put(req->private_bio);
+ req->private_bio = NULL;
+ put_ldev(device);
+ }
+ return true;
+ }
+
+ return false;
+}
+
+/* returns number of connections (== 1, for drbd 8.4)
+ * expected to actually write this data,
+ * which does NOT include those that we are L_AHEAD for. */
+static int drbd_process_write_request(struct drbd_request *req)
+{
+ struct drbd_device *device = req->device;
+ int remote, send_oos;
+
+ remote = drbd_should_do_remote(device->state);
+ send_oos = drbd_should_send_out_of_sync(device->state);
+
+ /* Need to replicate writes. Unless it is an empty flush,
+ * which is better mapped to a DRBD P_BARRIER packet,
+ * also for drbd wire protocol compatibility reasons.
+ * If this was a flush, just start a new epoch.
+ * Unless the current epoch was empty anyways, or we are not currently
+ * replicating, in which case there is no point. */
+ if (unlikely(req->i.size == 0)) {
+ /* The only size==0 bios we expect are empty flushes. */
+ D_ASSERT(device, req->master_bio->bi_rw & REQ_FLUSH);
+ if (remote)
+ _req_mod(req, QUEUE_AS_DRBD_BARRIER);
+ return remote;
+ }
+
+ if (!remote && !send_oos)
+ return 0;
+
+ D_ASSERT(device, !(remote && send_oos));
+
+ if (remote) {
+ _req_mod(req, TO_BE_SENT);
+ _req_mod(req, QUEUE_FOR_NET_WRITE);
+ } else if (drbd_set_out_of_sync(device, req->i.sector, req->i.size))
+ _req_mod(req, QUEUE_FOR_SEND_OOS);
+
+ return remote;
+}
+
+static void
+drbd_submit_req_private_bio(struct drbd_request *req)
+{
+ struct drbd_device *device = req->device;
+ struct bio *bio = req->private_bio;
+ const int rw = bio_rw(bio);
+
+ bio->bi_bdev = device->ldev->backing_bdev;
+
+ /* State may have changed since we grabbed our reference on the
+ * ->ldev member. Double check, and short-circuit to endio.
+ * In case the last activity log transaction failed to get on
+ * stable storage, and this is a WRITE, we may not even submit
+ * this bio. */
+ if (get_ldev(device)) {
+ req->pre_submit_jif = jiffies;
+ if (drbd_insert_fault(device,
+ rw == WRITE ? DRBD_FAULT_DT_WR
+ : rw == READ ? DRBD_FAULT_DT_RD
+ : DRBD_FAULT_DT_RA))
+ bio_endio(bio, -EIO);
+ else
+ generic_make_request(bio);
+ put_ldev(device);
+ } else
+ bio_endio(bio, -EIO);
+}
+
+static void drbd_queue_write(struct drbd_device *device, struct drbd_request *req)
+{
+ spin_lock_irq(&device->resource->req_lock);
+ list_add_tail(&req->tl_requests, &device->submit.writes);
+ list_add_tail(&req->req_pending_master_completion,
+ &device->pending_master_completion[1 /* WRITE */]);
+ spin_unlock_irq(&device->resource->req_lock);
+ queue_work(device->submit.wq, &device->submit.worker);
+ /* do_submit() may sleep internally on al_wait, too */
+ wake_up(&device->al_wait);
+}
+
+/* returns the new drbd_request pointer, if the caller is expected to
+ * drbd_send_and_submit() it (to save latency), or NULL if we queued the
+ * request on the submitter thread.
+ * Returns ERR_PTR(-ENOMEM) if we cannot allocate a drbd_request.
+ */
+static struct drbd_request *
+drbd_request_prepare(struct drbd_device *device, struct bio *bio, unsigned long start_jif)
+{
+ const int rw = bio_data_dir(bio);
+ struct drbd_request *req;
+
+ /* allocate outside of all locks; */
+ req = drbd_req_new(device, bio);
+ if (!req) {
+ dec_ap_bio(device);
+ /* only pass the error to the upper layers.
+ * if user cannot handle io errors, that's not our business. */
+ drbd_err(device, "could not kmalloc() req\n");
+ bio_endio(bio, -ENOMEM);
+ return ERR_PTR(-ENOMEM);
+ }
+ req->start_jif = start_jif;
+
+ if (!get_ldev(device)) {
+ bio_put(req->private_bio);
+ req->private_bio = NULL;
+ }
+
+ /* Update disk stats */
+ _drbd_start_io_acct(device, req);
+
+ if (rw == WRITE && req->private_bio && req->i.size
+ && !test_bit(AL_SUSPENDED, &device->flags)) {
+ if (!drbd_al_begin_io_fastpath(device, &req->i)) {
+ atomic_inc(&device->ap_actlog_cnt);
+ drbd_queue_write(device, req);
+ return NULL;
+ }
+ req->rq_state |= RQ_IN_ACT_LOG;
+ req->in_actlog_jif = jiffies;
+ }
+
+ return req;
+}
+
+static void drbd_send_and_submit(struct drbd_device *device, struct drbd_request *req)
+{
+ struct drbd_resource *resource = device->resource;
+ const int rw = bio_rw(req->master_bio);
+ struct bio_and_error m = { NULL, };
+ bool no_remote = false;
+ bool submit_private_bio = false;
+
+ spin_lock_irq(&resource->req_lock);
+ if (rw == WRITE) {
+ /* This may temporarily give up the req_lock,
+ * but will re-aquire it before it returns here.
+ * Needs to be before the check on drbd_suspended() */
+ complete_conflicting_writes(req);
+ /* no more giving up req_lock from now on! */
+
+ /* check for congestion, and potentially stop sending
+ * full data updates, but start sending "dirty bits" only. */
+ maybe_pull_ahead(device);
+ }
+
+
+ if (drbd_suspended(device)) {
+ /* push back and retry: */
+ req->rq_state |= RQ_POSTPONED;
+ if (req->private_bio) {
+ bio_put(req->private_bio);
+ req->private_bio = NULL;
+ put_ldev(device);
+ }
+ goto out;
+ }
+
+ /* We fail READ/READA early, if we can not serve it.
+ * We must do this before req is registered on any lists.
+ * Otherwise, drbd_req_complete() will queue failed READ for retry. */
+ if (rw != WRITE) {
+ if (!do_remote_read(req) && !req->private_bio)
+ goto nodata;
+ }
+
+ /* which transfer log epoch does this belong to? */
+ req->epoch = atomic_read(&first_peer_device(device)->connection->current_tle_nr);
+
+ /* no point in adding empty flushes to the transfer log,
+ * they are mapped to drbd barriers already. */
+ if (likely(req->i.size!=0)) {
+ if (rw == WRITE)
+ first_peer_device(device)->connection->current_tle_writes++;
+
+ list_add_tail(&req->tl_requests, &first_peer_device(device)->connection->transfer_log);
+ }
+
+ if (rw == WRITE) {
+ if (!drbd_process_write_request(req))
+ no_remote = true;
+ } else {
+ /* We either have a private_bio, or we can read from remote.
+ * Otherwise we had done the goto nodata above. */
+ if (req->private_bio == NULL) {
+ _req_mod(req, TO_BE_SENT);
+ _req_mod(req, QUEUE_FOR_NET_READ);
+ } else
+ no_remote = true;
+ }
+
+ /* If it took the fast path in drbd_request_prepare, add it here.
+ * The slow path has added it already. */
+ if (list_empty(&req->req_pending_master_completion))
+ list_add_tail(&req->req_pending_master_completion,
+ &device->pending_master_completion[rw == WRITE]);
+ if (req->private_bio) {
+ /* needs to be marked within the same spinlock */
+ list_add_tail(&req->req_pending_local,
+ &device->pending_completion[rw == WRITE]);
+ _req_mod(req, TO_BE_SUBMITTED);
+ /* but we need to give up the spinlock to submit */
+ submit_private_bio = true;
+ } else if (no_remote) {
+nodata:
+ if (__ratelimit(&drbd_ratelimit_state))
+ drbd_err(device, "IO ERROR: neither local nor remote data, sector %llu+%u\n",
+ (unsigned long long)req->i.sector, req->i.size >> 9);
+ /* A write may have been queued for send_oos, however.
+ * So we can not simply free it, we must go through drbd_req_put_completion_ref() */
+ }
+
+out:
+ if (drbd_req_put_completion_ref(req, &m, 1))
+ kref_put(&req->kref, drbd_req_destroy);
+ spin_unlock_irq(&resource->req_lock);
+
+ /* Even though above is a kref_put(), this is safe.
+ * As long as we still need to submit our private bio,
+ * we hold a completion ref, and the request cannot disappear.
+ * If however this request did not even have a private bio to submit
+ * (e.g. remote read), req may already be invalid now.
+ * That's why we cannot check on req->private_bio. */
+ if (submit_private_bio)
+ drbd_submit_req_private_bio(req);
+ if (m.bio)
+ complete_master_bio(device, &m);
+}
+
+void __drbd_make_request(struct drbd_device *device, struct bio *bio, unsigned long start_jif)
+{
+ struct drbd_request *req = drbd_request_prepare(device, bio, start_jif);
+ if (IS_ERR_OR_NULL(req))
+ return;
+ drbd_send_and_submit(device, req);
+}
+
+static void submit_fast_path(struct drbd_device *device, struct list_head *incoming)
+{
+ struct drbd_request *req, *tmp;
+ list_for_each_entry_safe(req, tmp, incoming, tl_requests) {
+ const int rw = bio_data_dir(req->master_bio);
+
+ if (rw == WRITE /* rw != WRITE should not even end up here! */
+ && req->private_bio && req->i.size
+ && !test_bit(AL_SUSPENDED, &device->flags)) {
+ if (!drbd_al_begin_io_fastpath(device, &req->i))
+ continue;
+
+ req->rq_state |= RQ_IN_ACT_LOG;
+ req->in_actlog_jif = jiffies;
+ atomic_dec(&device->ap_actlog_cnt);
+ }
+
+ list_del_init(&req->tl_requests);
+ drbd_send_and_submit(device, req);
+ }
+}
+
+static bool prepare_al_transaction_nonblock(struct drbd_device *device,
+ struct list_head *incoming,
+ struct list_head *pending,
+ struct list_head *later)
+{
+ struct drbd_request *req, *tmp;
+ int wake = 0;
+ int err;
+
+ spin_lock_irq(&device->al_lock);
+ list_for_each_entry_safe(req, tmp, incoming, tl_requests) {
+ err = drbd_al_begin_io_nonblock(device, &req->i);
+ if (err == -ENOBUFS)
+ break;
+ if (err == -EBUSY)
+ wake = 1;
+ if (err)
+ list_move_tail(&req->tl_requests, later);
+ else
+ list_move_tail(&req->tl_requests, pending);
+ }
+ spin_unlock_irq(&device->al_lock);
+ if (wake)
+ wake_up(&device->al_wait);
+ return !list_empty(pending);
+}
+
+void send_and_submit_pending(struct drbd_device *device, struct list_head *pending)
+{
+ struct drbd_request *req, *tmp;
+
+ list_for_each_entry_safe(req, tmp, pending, tl_requests) {
+ req->rq_state |= RQ_IN_ACT_LOG;
+ req->in_actlog_jif = jiffies;
+ atomic_dec(&device->ap_actlog_cnt);
+ list_del_init(&req->tl_requests);
+ drbd_send_and_submit(device, req);
+ }
+}
+
+void do_submit(struct work_struct *ws)
+{
+ struct drbd_device *device = container_of(ws, struct drbd_device, submit.worker);
+ LIST_HEAD(incoming); /* from drbd_make_request() */
+ LIST_HEAD(pending); /* to be submitted after next AL-transaction commit */
+ LIST_HEAD(busy); /* blocked by resync requests */
+
+ /* grab new incoming requests */
+ spin_lock_irq(&device->resource->req_lock);
+ list_splice_tail_init(&device->submit.writes, &incoming);
+ spin_unlock_irq(&device->resource->req_lock);
+
+ for (;;) {
+ DEFINE_WAIT(wait);
+
+ /* move used-to-be-busy back to front of incoming */
+ list_splice_init(&busy, &incoming);
+ submit_fast_path(device, &incoming);
+ if (list_empty(&incoming))
+ break;
+
+ for (;;) {
+ prepare_to_wait(&device->al_wait, &wait, TASK_UNINTERRUPTIBLE);
+
+ list_splice_init(&busy, &incoming);
+ prepare_al_transaction_nonblock(device, &incoming, &pending, &busy);
+ if (!list_empty(&pending))
+ break;
+
+ schedule();
+
+ /* If all currently "hot" activity log extents are kept busy by
+ * incoming requests, we still must not totally starve new
+ * requests to "cold" extents.
+ * Something left on &incoming means there had not been
+ * enough update slots available, and the activity log
+ * has been marked as "starving".
+ *
+ * Try again now, without looking for new requests,
+ * effectively blocking all new requests until we made
+ * at least _some_ progress with what we currently have.
+ */
+ if (!list_empty(&incoming))
+ continue;
+
+ /* Nothing moved to pending, but nothing left
+ * on incoming: all moved to busy!
+ * Grab new and iterate. */
+ spin_lock_irq(&device->resource->req_lock);
+ list_splice_tail_init(&device->submit.writes, &incoming);
+ spin_unlock_irq(&device->resource->req_lock);
+ }
+ finish_wait(&device->al_wait, &wait);
+
+ /* If the transaction was full, before all incoming requests
+ * had been processed, skip ahead to commit, and iterate
+ * without splicing in more incoming requests from upper layers.
+ *
+ * Else, if all incoming have been processed,
+ * they have become either "pending" (to be submitted after
+ * next transaction commit) or "busy" (blocked by resync).
+ *
+ * Maybe more was queued, while we prepared the transaction?
+ * Try to stuff those into this transaction as well.
+ * Be strictly non-blocking here,
+ * we already have something to commit.
+ *
+ * Commit if we don't make any more progres.
+ */
+
+ while (list_empty(&incoming)) {
+ LIST_HEAD(more_pending);
+ LIST_HEAD(more_incoming);
+ bool made_progress;
+
+ /* It is ok to look outside the lock,
+ * it's only an optimization anyways */
+ if (list_empty(&device->submit.writes))
+ break;
+
+ spin_lock_irq(&device->resource->req_lock);
+ list_splice_tail_init(&device->submit.writes, &more_incoming);
+ spin_unlock_irq(&device->resource->req_lock);
+
+ if (list_empty(&more_incoming))
+ break;
+
+ made_progress = prepare_al_transaction_nonblock(device, &more_incoming, &more_pending, &busy);
+
+ list_splice_tail_init(&more_pending, &pending);
+ list_splice_tail_init(&more_incoming, &incoming);
+ if (!made_progress)
+ break;
+ }
+
+ drbd_al_begin_io_commit(device);
+ send_and_submit_pending(device, &pending);
+ }
+}
+
+void drbd_make_request(struct request_queue *q, struct bio *bio)
+{
+ struct drbd_device *device = (struct drbd_device *) q->queuedata;
+ unsigned long start_jif;
+
+ start_jif = jiffies;
+
+ /*
+ * what we "blindly" assume:
+ */
+ D_ASSERT(device, IS_ALIGNED(bio->bi_iter.bi_size, 512));
+
+ inc_ap_bio(device);
+ __drbd_make_request(device, bio, start_jif);
+}
+
+/* This is called by bio_add_page().
+ *
+ * q->max_hw_sectors and other global limits are already enforced there.
+ *
+ * We need to call down to our lower level device,
+ * in case it has special restrictions.
+ *
+ * We also may need to enforce configured max-bio-bvecs limits.
+ *
+ * As long as the BIO is empty we have to allow at least one bvec,
+ * regardless of size and offset, so no need to ask lower levels.
+ */
+int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct bio_vec *bvec)
+{
+ struct drbd_device *device = (struct drbd_device *) q->queuedata;
+ unsigned int bio_size = bvm->bi_size;
+ int limit = DRBD_MAX_BIO_SIZE;
+ int backing_limit;
+
+ if (bio_size && get_ldev(device)) {
+ unsigned int max_hw_sectors = queue_max_hw_sectors(q);
+ struct request_queue * const b =
+ device->ldev->backing_bdev->bd_disk->queue;
+ if (b->merge_bvec_fn) {
+ bvm->bi_bdev = device->ldev->backing_bdev;
+ backing_limit = b->merge_bvec_fn(b, bvm, bvec);
+ limit = min(limit, backing_limit);
+ }
+ put_ldev(device);
+ if ((limit >> 9) > max_hw_sectors)
+ limit = max_hw_sectors << 9;
+ }
+ return limit;
+}
+
+void request_timer_fn(unsigned long data)
+{
+ struct drbd_device *device = (struct drbd_device *) data;
+ struct drbd_connection *connection = first_peer_device(device)->connection;
+ struct drbd_request *req_read, *req_write, *req_peer; /* oldest request */
+ struct net_conf *nc;
+ unsigned long oldest_submit_jif;
+ unsigned long ent = 0, dt = 0, et, nt; /* effective timeout = ko_count * timeout */
+ unsigned long now;
+
+ rcu_read_lock();
+ nc = rcu_dereference(connection->net_conf);
+ if (nc && device->state.conn >= C_WF_REPORT_PARAMS)
+ ent = nc->timeout * HZ/10 * nc->ko_count;
+
+ if (get_ldev(device)) { /* implicit state.disk >= D_INCONSISTENT */
+ dt = rcu_dereference(device->ldev->disk_conf)->disk_timeout * HZ / 10;
+ put_ldev(device);
+ }
+ rcu_read_unlock();
+
+ et = min_not_zero(dt, ent);
+
+ if (!et)
+ return; /* Recurring timer stopped */
+
+ now = jiffies;
+ nt = now + et;
+
+ spin_lock_irq(&device->resource->req_lock);
+ req_read = list_first_entry_or_null(&device->pending_completion[0], struct drbd_request, req_pending_local);
+ req_write = list_first_entry_or_null(&device->pending_completion[1], struct drbd_request, req_pending_local);
+ req_peer = connection->req_not_net_done;
+ /* maybe the oldest request waiting for the peer is in fact still
+ * blocking in tcp sendmsg */
+ if (!req_peer && connection->req_next && connection->req_next->pre_send_jif)
+ req_peer = connection->req_next;
+
+ /* evaluate the oldest peer request only in one timer! */
+ if (req_peer && req_peer->device != device)
+ req_peer = NULL;
+
+ /* do we have something to evaluate? */
+ if (req_peer == NULL && req_write == NULL && req_read == NULL)
+ goto out;
+
+ oldest_submit_jif =
+ (req_write && req_read)
+ ? ( time_before(req_write->pre_submit_jif, req_read->pre_submit_jif)
+ ? req_write->pre_submit_jif : req_read->pre_submit_jif )
+ : req_write ? req_write->pre_submit_jif
+ : req_read ? req_read->pre_submit_jif : now;
+
+ /* The request is considered timed out, if
+ * - we have some effective timeout from the configuration,
+ * with above state restrictions applied,
+ * - the oldest request is waiting for a response from the network
+ * resp. the local disk,
+ * - the oldest request is in fact older than the effective timeout,
+ * - the connection was established (resp. disk was attached)
+ * for longer than the timeout already.
+ * Note that for 32bit jiffies and very stable connections/disks,
+ * we may have a wrap around, which is catched by
+ * !time_in_range(now, last_..._jif, last_..._jif + timeout).
+ *
+ * Side effect: once per 32bit wrap-around interval, which means every
+ * ~198 days with 250 HZ, we have a window where the timeout would need
+ * to expire twice (worst case) to become effective. Good enough.
+ */
+ if (ent && req_peer &&
+ time_after(now, req_peer->pre_send_jif + ent) &&
+ !time_in_range(now, connection->last_reconnect_jif, connection->last_reconnect_jif + ent)) {
+ drbd_warn(device, "Remote failed to finish a request within ko-count * timeout\n");
+ _conn_request_state(connection, NS(conn, C_TIMEOUT), CS_VERBOSE | CS_HARD);
+ }
+ if (dt && oldest_submit_jif != now &&
+ time_after(now, oldest_submit_jif + dt) &&
+ !time_in_range(now, device->last_reattach_jif, device->last_reattach_jif + dt)) {
+ drbd_warn(device, "Local backing device failed to meet the disk-timeout\n");
+ __drbd_chk_io_error(device, DRBD_FORCE_DETACH);
+ }
+
+ /* Reschedule timer for the nearest not already expired timeout.
+ * Fallback to now + min(effective network timeout, disk timeout). */
+ ent = (ent && req_peer && time_before(now, req_peer->pre_send_jif + ent))
+ ? req_peer->pre_send_jif + ent : now + et;
+ dt = (dt && oldest_submit_jif != now && time_before(now, oldest_submit_jif + dt))
+ ? oldest_submit_jif + dt : now + et;
+ nt = time_before(ent, dt) ? ent : dt;
+out:
+ spin_unlock_irq(&device->resource->req_lock);
+ mod_timer(&device->request_timer, nt);
+}
diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h
new file mode 100644
index 000000000..9f6a04080
--- /dev/null
+++ b/drivers/block/drbd/drbd_req.h
@@ -0,0 +1,351 @@
+/*
+ drbd_req.h
+
+ This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+ Copyright (C) 2006-2008, LINBIT Information Technologies GmbH.
+ Copyright (C) 2006-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+ Copyright (C) 2006-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+
+ DRBD is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2, or (at your option)
+ any later version.
+
+ DRBD is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with drbd; see the file COPYING. If not, write to
+ the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#ifndef _DRBD_REQ_H
+#define _DRBD_REQ_H
+
+#include <linux/module.h>
+
+#include <linux/slab.h>
+#include <linux/drbd.h>
+#include "drbd_int.h"
+
+/* The request callbacks will be called in irq context by the IDE drivers,
+ and in Softirqs/Tasklets/BH context by the SCSI drivers,
+ and by the receiver and worker in kernel-thread context.
+ Try to get the locking right :) */
+
+/*
+ * Objects of type struct drbd_request do only exist on a R_PRIMARY node, and are
+ * associated with IO requests originating from the block layer above us.
+ *
+ * There are quite a few things that may happen to a drbd request
+ * during its lifetime.
+ *
+ * It will be created.
+ * It will be marked with the intention to be
+ * submitted to local disk and/or
+ * send via the network.
+ *
+ * It has to be placed on the transfer log and other housekeeping lists,
+ * In case we have a network connection.
+ *
+ * It may be identified as a concurrent (write) request
+ * and be handled accordingly.
+ *
+ * It may me handed over to the local disk subsystem.
+ * It may be completed by the local disk subsystem,
+ * either successfully or with io-error.
+ * In case it is a READ request, and it failed locally,
+ * it may be retried remotely.
+ *
+ * It may be queued for sending.
+ * It may be handed over to the network stack,
+ * which may fail.
+ * It may be acknowledged by the "peer" according to the wire_protocol in use.
+ * this may be a negative ack.
+ * It may receive a faked ack when the network connection is lost and the
+ * transfer log is cleaned up.
+ * Sending may be canceled due to network connection loss.
+ * When it finally has outlived its time,
+ * corresponding dirty bits in the resync-bitmap may be cleared or set,
+ * it will be destroyed,
+ * and completion will be signalled to the originator,
+ * with or without "success".
+ */
+
+enum drbd_req_event {
+ CREATED,
+ TO_BE_SENT,
+ TO_BE_SUBMITTED,
+
+ /* XXX yes, now I am inconsistent...
+ * these are not "events" but "actions"
+ * oh, well... */
+ QUEUE_FOR_NET_WRITE,
+ QUEUE_FOR_NET_READ,
+ QUEUE_FOR_SEND_OOS,
+
+ /* An empty flush is queued as P_BARRIER,
+ * which will cause it to complete "successfully",
+ * even if the local disk flush failed.
+ *
+ * Just like "real" requests, empty flushes (blkdev_issue_flush()) will
+ * only see an error if neither local nor remote data is reachable. */
+ QUEUE_AS_DRBD_BARRIER,
+
+ SEND_CANCELED,
+ SEND_FAILED,
+ HANDED_OVER_TO_NETWORK,
+ OOS_HANDED_TO_NETWORK,
+ CONNECTION_LOST_WHILE_PENDING,
+ READ_RETRY_REMOTE_CANCELED,
+ RECV_ACKED_BY_PEER,
+ WRITE_ACKED_BY_PEER,
+ WRITE_ACKED_BY_PEER_AND_SIS, /* and set_in_sync */
+ CONFLICT_RESOLVED,
+ POSTPONE_WRITE,
+ NEG_ACKED,
+ BARRIER_ACKED, /* in protocol A and B */
+ DATA_RECEIVED, /* (remote read) */
+
+ COMPLETED_OK,
+ READ_COMPLETED_WITH_ERROR,
+ READ_AHEAD_COMPLETED_WITH_ERROR,
+ WRITE_COMPLETED_WITH_ERROR,
+ DISCARD_COMPLETED_NOTSUPP,
+ DISCARD_COMPLETED_WITH_ERROR,
+
+ ABORT_DISK_IO,
+ RESEND,
+ FAIL_FROZEN_DISK_IO,
+ RESTART_FROZEN_DISK_IO,
+ NOTHING,
+};
+
+/* encoding of request states for now. we don't actually need that many bits.
+ * we don't need to do atomic bit operations either, since most of the time we
+ * need to look at the connection state and/or manipulate some lists at the
+ * same time, so we should hold the request lock anyways.
+ */
+enum drbd_req_state_bits {
+ /* 3210
+ * 0000: no local possible
+ * 0001: to be submitted
+ * UNUSED, we could map: 011: submitted, completion still pending
+ * 0110: completed ok
+ * 0010: completed with error
+ * 1001: Aborted (before completion)
+ * 1x10: Aborted and completed -> free
+ */
+ __RQ_LOCAL_PENDING,
+ __RQ_LOCAL_COMPLETED,
+ __RQ_LOCAL_OK,
+ __RQ_LOCAL_ABORTED,
+
+ /* 87654
+ * 00000: no network possible
+ * 00001: to be send
+ * 00011: to be send, on worker queue
+ * 00101: sent, expecting recv_ack (B) or write_ack (C)
+ * 11101: sent,
+ * recv_ack (B) or implicit "ack" (A),
+ * still waiting for the barrier ack.
+ * master_bio may already be completed and invalidated.
+ * 11100: write acked (C),
+ * data received (for remote read, any protocol)
+ * or finally the barrier ack has arrived (B,A)...
+ * request can be freed
+ * 01100: neg-acked (write, protocol C)
+ * or neg-d-acked (read, any protocol)
+ * or killed from the transfer log
+ * during cleanup after connection loss
+ * request can be freed
+ * 01000: canceled or send failed...
+ * request can be freed
+ */
+
+ /* if "SENT" is not set, yet, this can still fail or be canceled.
+ * if "SENT" is set already, we still wait for an Ack packet.
+ * when cleared, the master_bio may be completed.
+ * in (B,A) the request object may still linger on the transaction log
+ * until the corresponding barrier ack comes in */
+ __RQ_NET_PENDING,
+
+ /* If it is QUEUED, and it is a WRITE, it is also registered in the
+ * transfer log. Currently we need this flag to avoid conflicts between
+ * worker canceling the request and tl_clear_barrier killing it from
+ * transfer log. We should restructure the code so this conflict does
+ * no longer occur. */
+ __RQ_NET_QUEUED,
+
+ /* well, actually only "handed over to the network stack".
+ *
+ * TODO can potentially be dropped because of the similar meaning
+ * of RQ_NET_SENT and ~RQ_NET_QUEUED.
+ * however it is not exactly the same. before we drop it
+ * we must ensure that we can tell a request with network part
+ * from a request without, regardless of what happens to it. */
+ __RQ_NET_SENT,
+
+ /* when set, the request may be freed (if RQ_NET_QUEUED is clear).
+ * basically this means the corresponding P_BARRIER_ACK was received */
+ __RQ_NET_DONE,
+
+ /* whether or not we know (C) or pretend (B,A) that the write
+ * was successfully written on the peer.
+ */
+ __RQ_NET_OK,
+
+ /* peer called drbd_set_in_sync() for this write */
+ __RQ_NET_SIS,
+
+ /* keep this last, its for the RQ_NET_MASK */
+ __RQ_NET_MAX,
+
+ /* Set when this is a write, clear for a read */
+ __RQ_WRITE,
+
+ /* Should call drbd_al_complete_io() for this request... */
+ __RQ_IN_ACT_LOG,
+
+ /* The peer has sent a retry ACK */
+ __RQ_POSTPONED,
+
+ /* would have been completed,
+ * but was not, because of drbd_suspended() */
+ __RQ_COMPLETION_SUSP,
+
+ /* We expect a receive ACK (wire proto B) */
+ __RQ_EXP_RECEIVE_ACK,
+
+ /* We expect a write ACK (wite proto C) */
+ __RQ_EXP_WRITE_ACK,
+
+ /* waiting for a barrier ack, did an extra kref_get */
+ __RQ_EXP_BARR_ACK,
+};
+
+#define RQ_LOCAL_PENDING (1UL << __RQ_LOCAL_PENDING)
+#define RQ_LOCAL_COMPLETED (1UL << __RQ_LOCAL_COMPLETED)
+#define RQ_LOCAL_OK (1UL << __RQ_LOCAL_OK)
+#define RQ_LOCAL_ABORTED (1UL << __RQ_LOCAL_ABORTED)
+
+#define RQ_LOCAL_MASK ((RQ_LOCAL_ABORTED << 1)-1)
+
+#define RQ_NET_PENDING (1UL << __RQ_NET_PENDING)
+#define RQ_NET_QUEUED (1UL << __RQ_NET_QUEUED)
+#define RQ_NET_SENT (1UL << __RQ_NET_SENT)
+#define RQ_NET_DONE (1UL << __RQ_NET_DONE)
+#define RQ_NET_OK (1UL << __RQ_NET_OK)
+#define RQ_NET_SIS (1UL << __RQ_NET_SIS)
+
+/* 0x1f8 */
+#define RQ_NET_MASK (((1UL << __RQ_NET_MAX)-1) & ~RQ_LOCAL_MASK)
+
+#define RQ_WRITE (1UL << __RQ_WRITE)
+#define RQ_IN_ACT_LOG (1UL << __RQ_IN_ACT_LOG)
+#define RQ_POSTPONED (1UL << __RQ_POSTPONED)
+#define RQ_COMPLETION_SUSP (1UL << __RQ_COMPLETION_SUSP)
+#define RQ_EXP_RECEIVE_ACK (1UL << __RQ_EXP_RECEIVE_ACK)
+#define RQ_EXP_WRITE_ACK (1UL << __RQ_EXP_WRITE_ACK)
+#define RQ_EXP_BARR_ACK (1UL << __RQ_EXP_BARR_ACK)
+
+/* For waking up the frozen transfer log mod_req() has to return if the request
+ should be counted in the epoch object*/
+#define MR_WRITE 1
+#define MR_READ 2
+
+static inline void drbd_req_make_private_bio(struct drbd_request *req, struct bio *bio_src)
+{
+ struct bio *bio;
+ bio = bio_clone(bio_src, GFP_NOIO); /* XXX cannot fail?? */
+
+ req->private_bio = bio;
+
+ bio->bi_private = req;
+ bio->bi_end_io = drbd_request_endio;
+ bio->bi_next = NULL;
+}
+
+/* Short lived temporary struct on the stack.
+ * We could squirrel the error to be returned into
+ * bio->bi_iter.bi_size, or similar. But that would be too ugly. */
+struct bio_and_error {
+ struct bio *bio;
+ int error;
+};
+
+extern void start_new_tl_epoch(struct drbd_connection *connection);
+extern void drbd_req_destroy(struct kref *kref);
+extern void _req_may_be_done(struct drbd_request *req,
+ struct bio_and_error *m);
+extern int __req_mod(struct drbd_request *req, enum drbd_req_event what,
+ struct bio_and_error *m);
+extern void complete_master_bio(struct drbd_device *device,
+ struct bio_and_error *m);
+extern void request_timer_fn(unsigned long data);
+extern void tl_restart(struct drbd_connection *connection, enum drbd_req_event what);
+extern void _tl_restart(struct drbd_connection *connection, enum drbd_req_event what);
+extern void tl_abort_disk_io(struct drbd_device *device);
+
+/* this is in drbd_main.c */
+extern void drbd_restart_request(struct drbd_request *req);
+
+/* use this if you don't want to deal with calling complete_master_bio()
+ * outside the spinlock, e.g. when walking some list on cleanup. */
+static inline int _req_mod(struct drbd_request *req, enum drbd_req_event what)
+{
+ struct drbd_device *device = req->device;
+ struct bio_and_error m;
+ int rv;
+
+ /* __req_mod possibly frees req, do not touch req after that! */
+ rv = __req_mod(req, what, &m);
+ if (m.bio)
+ complete_master_bio(device, &m);
+
+ return rv;
+}
+
+/* completion of master bio is outside of our spinlock.
+ * We still may or may not be inside some irqs disabled section
+ * of the lower level driver completion callback, so we need to
+ * spin_lock_irqsave here. */
+static inline int req_mod(struct drbd_request *req,
+ enum drbd_req_event what)
+{
+ unsigned long flags;
+ struct drbd_device *device = req->device;
+ struct bio_and_error m;
+ int rv;
+
+ spin_lock_irqsave(&device->resource->req_lock, flags);
+ rv = __req_mod(req, what, &m);
+ spin_unlock_irqrestore(&device->resource->req_lock, flags);
+
+ if (m.bio)
+ complete_master_bio(device, &m);
+
+ return rv;
+}
+
+static inline bool drbd_should_do_remote(union drbd_dev_state s)
+{
+ return s.pdsk == D_UP_TO_DATE ||
+ (s.pdsk >= D_INCONSISTENT &&
+ s.conn >= C_WF_BITMAP_T &&
+ s.conn < C_AHEAD);
+ /* Before proto 96 that was >= CONNECTED instead of >= C_WF_BITMAP_T.
+ That is equivalent since before 96 IO was frozen in the C_WF_BITMAP*
+ states. */
+}
+static inline bool drbd_should_send_out_of_sync(union drbd_dev_state s)
+{
+ return s.conn == C_AHEAD || s.conn == C_WF_BITMAP_S;
+ /* pdsk = D_INCONSISTENT as a consequence. Protocol 96 check not necessary
+ since we enter state C_AHEAD only if proto >= 96 */
+}
+
+#endif
diff --git a/drivers/block/drbd/drbd_state.c b/drivers/block/drbd/drbd_state.c
new file mode 100644
index 000000000..2d7dd269b
--- /dev/null
+++ b/drivers/block/drbd/drbd_state.c
@@ -0,0 +1,1918 @@
+/*
+ drbd_state.c
+
+ This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+ Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
+ Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+ Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+ Thanks to Carter Burden, Bart Grantham and Gennadiy Nerubayev
+ from Logicworks, Inc. for making SDP replication support possible.
+
+ drbd is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2, or (at your option)
+ any later version.
+
+ drbd is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with drbd; see the file COPYING. If not, write to
+ the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/drbd_limits.h>
+#include "drbd_int.h"
+#include "drbd_protocol.h"
+#include "drbd_req.h"
+
+struct after_state_chg_work {
+ struct drbd_work w;
+ struct drbd_device *device;
+ union drbd_state os;
+ union drbd_state ns;
+ enum chg_state_flags flags;
+ struct completion *done;
+};
+
+enum sanitize_state_warnings {
+ NO_WARNING,
+ ABORTED_ONLINE_VERIFY,
+ ABORTED_RESYNC,
+ CONNECTION_LOST_NEGOTIATING,
+ IMPLICITLY_UPGRADED_DISK,
+ IMPLICITLY_UPGRADED_PDSK,
+};
+
+static int w_after_state_ch(struct drbd_work *w, int unused);
+static void after_state_ch(struct drbd_device *device, union drbd_state os,
+ union drbd_state ns, enum chg_state_flags flags);
+static enum drbd_state_rv is_valid_state(struct drbd_device *, union drbd_state);
+static enum drbd_state_rv is_valid_soft_transition(union drbd_state, union drbd_state, struct drbd_connection *);
+static enum drbd_state_rv is_valid_transition(union drbd_state os, union drbd_state ns);
+static union drbd_state sanitize_state(struct drbd_device *device, union drbd_state os,
+ union drbd_state ns, enum sanitize_state_warnings *warn);
+
+static inline bool is_susp(union drbd_state s)
+{
+ return s.susp || s.susp_nod || s.susp_fen;
+}
+
+bool conn_all_vols_unconf(struct drbd_connection *connection)
+{
+ struct drbd_peer_device *peer_device;
+ bool rv = true;
+ int vnr;
+
+ rcu_read_lock();
+ idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+ struct drbd_device *device = peer_device->device;
+ if (device->state.disk != D_DISKLESS ||
+ device->state.conn != C_STANDALONE ||
+ device->state.role != R_SECONDARY) {
+ rv = false;
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ return rv;
+}
+
+/* Unfortunately the states where not correctly ordered, when
+ they where defined. therefore can not use max_t() here. */
+static enum drbd_role max_role(enum drbd_role role1, enum drbd_role role2)
+{
+ if (role1 == R_PRIMARY || role2 == R_PRIMARY)
+ return R_PRIMARY;
+ if (role1 == R_SECONDARY || role2 == R_SECONDARY)
+ return R_SECONDARY;
+ return R_UNKNOWN;
+}
+static enum drbd_role min_role(enum drbd_role role1, enum drbd_role role2)
+{
+ if (role1 == R_UNKNOWN || role2 == R_UNKNOWN)
+ return R_UNKNOWN;
+ if (role1 == R_SECONDARY || role2 == R_SECONDARY)
+ return R_SECONDARY;
+ return R_PRIMARY;
+}
+
+enum drbd_role conn_highest_role(struct drbd_connection *connection)
+{
+ enum drbd_role role = R_UNKNOWN;
+ struct drbd_peer_device *peer_device;
+ int vnr;
+
+ rcu_read_lock();
+ idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+ struct drbd_device *device = peer_device->device;
+ role = max_role(role, device->state.role);
+ }
+ rcu_read_unlock();
+
+ return role;
+}
+
+enum drbd_role conn_highest_peer(struct drbd_connection *connection)
+{
+ enum drbd_role peer = R_UNKNOWN;
+ struct drbd_peer_device *peer_device;
+ int vnr;
+
+ rcu_read_lock();
+ idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+ struct drbd_device *device = peer_device->device;
+ peer = max_role(peer, device->state.peer);
+ }
+ rcu_read_unlock();
+
+ return peer;
+}
+
+enum drbd_disk_state conn_highest_disk(struct drbd_connection *connection)
+{
+ enum drbd_disk_state disk_state = D_DISKLESS;
+ struct drbd_peer_device *peer_device;
+ int vnr;
+
+ rcu_read_lock();
+ idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+ struct drbd_device *device = peer_device->device;
+ disk_state = max_t(enum drbd_disk_state, disk_state, device->state.disk);
+ }
+ rcu_read_unlock();
+
+ return disk_state;
+}
+
+enum drbd_disk_state conn_lowest_disk(struct drbd_connection *connection)
+{
+ enum drbd_disk_state disk_state = D_MASK;
+ struct drbd_peer_device *peer_device;
+ int vnr;
+
+ rcu_read_lock();
+ idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+ struct drbd_device *device = peer_device->device;
+ disk_state = min_t(enum drbd_disk_state, disk_state, device->state.disk);
+ }
+ rcu_read_unlock();
+
+ return disk_state;
+}
+
+enum drbd_disk_state conn_highest_pdsk(struct drbd_connection *connection)
+{
+ enum drbd_disk_state disk_state = D_DISKLESS;
+ struct drbd_peer_device *peer_device;
+ int vnr;
+
+ rcu_read_lock();
+ idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+ struct drbd_device *device = peer_device->device;
+ disk_state = max_t(enum drbd_disk_state, disk_state, device->state.pdsk);
+ }
+ rcu_read_unlock();
+
+ return disk_state;
+}
+
+enum drbd_conns conn_lowest_conn(struct drbd_connection *connection)
+{
+ enum drbd_conns conn = C_MASK;
+ struct drbd_peer_device *peer_device;
+ int vnr;
+
+ rcu_read_lock();
+ idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+ struct drbd_device *device = peer_device->device;
+ conn = min_t(enum drbd_conns, conn, device->state.conn);
+ }
+ rcu_read_unlock();
+
+ return conn;
+}
+
+static bool no_peer_wf_report_params(struct drbd_connection *connection)
+{
+ struct drbd_peer_device *peer_device;
+ int vnr;
+ bool rv = true;
+
+ rcu_read_lock();
+ idr_for_each_entry(&connection->peer_devices, peer_device, vnr)
+ if (peer_device->device->state.conn == C_WF_REPORT_PARAMS) {
+ rv = false;
+ break;
+ }
+ rcu_read_unlock();
+
+ return rv;
+}
+
+static void wake_up_all_devices(struct drbd_connection *connection)
+{
+ struct drbd_peer_device *peer_device;
+ int vnr;
+
+ rcu_read_lock();
+ idr_for_each_entry(&connection->peer_devices, peer_device, vnr)
+ wake_up(&peer_device->device->state_wait);
+ rcu_read_unlock();
+
+}
+
+
+/**
+ * cl_wide_st_chg() - true if the state change is a cluster wide one
+ * @device: DRBD device.
+ * @os: old (current) state.
+ * @ns: new (wanted) state.
+ */
+static int cl_wide_st_chg(struct drbd_device *device,
+ union drbd_state os, union drbd_state ns)
+{
+ return (os.conn >= C_CONNECTED && ns.conn >= C_CONNECTED &&
+ ((os.role != R_PRIMARY && ns.role == R_PRIMARY) ||
+ (os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
+ (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S) ||
+ (os.disk != D_FAILED && ns.disk == D_FAILED))) ||
+ (os.conn >= C_CONNECTED && ns.conn == C_DISCONNECTING) ||
+ (os.conn == C_CONNECTED && ns.conn == C_VERIFY_S) ||
+ (os.conn == C_CONNECTED && ns.conn == C_WF_REPORT_PARAMS);
+}
+
+static union drbd_state
+apply_mask_val(union drbd_state os, union drbd_state mask, union drbd_state val)
+{
+ union drbd_state ns;
+ ns.i = (os.i & ~mask.i) | val.i;
+ return ns;
+}
+
+enum drbd_state_rv
+drbd_change_state(struct drbd_device *device, enum chg_state_flags f,
+ union drbd_state mask, union drbd_state val)
+{
+ unsigned long flags;
+ union drbd_state ns;
+ enum drbd_state_rv rv;
+
+ spin_lock_irqsave(&device->resource->req_lock, flags);
+ ns = apply_mask_val(drbd_read_state(device), mask, val);
+ rv = _drbd_set_state(device, ns, f, NULL);
+ spin_unlock_irqrestore(&device->resource->req_lock, flags);
+
+ return rv;
+}
+
+/**
+ * drbd_force_state() - Impose a change which happens outside our control on our state
+ * @device: DRBD device.
+ * @mask: mask of state bits to change.
+ * @val: value of new state bits.
+ */
+void drbd_force_state(struct drbd_device *device,
+ union drbd_state mask, union drbd_state val)
+{
+ drbd_change_state(device, CS_HARD, mask, val);
+}
+
+static enum drbd_state_rv
+_req_st_cond(struct drbd_device *device, union drbd_state mask,
+ union drbd_state val)
+{
+ union drbd_state os, ns;
+ unsigned long flags;
+ enum drbd_state_rv rv;
+
+ if (test_and_clear_bit(CL_ST_CHG_SUCCESS, &device->flags))
+ return SS_CW_SUCCESS;
+
+ if (test_and_clear_bit(CL_ST_CHG_FAIL, &device->flags))
+ return SS_CW_FAILED_BY_PEER;
+
+ spin_lock_irqsave(&device->resource->req_lock, flags);
+ os = drbd_read_state(device);
+ ns = sanitize_state(device, os, apply_mask_val(os, mask, val), NULL);
+ rv = is_valid_transition(os, ns);
+ if (rv >= SS_SUCCESS)
+ rv = SS_UNKNOWN_ERROR; /* cont waiting, otherwise fail. */
+
+ if (!cl_wide_st_chg(device, os, ns))
+ rv = SS_CW_NO_NEED;
+ if (rv == SS_UNKNOWN_ERROR) {
+ rv = is_valid_state(device, ns);
+ if (rv >= SS_SUCCESS) {
+ rv = is_valid_soft_transition(os, ns, first_peer_device(device)->connection);
+ if (rv >= SS_SUCCESS)
+ rv = SS_UNKNOWN_ERROR; /* cont waiting, otherwise fail. */
+ }
+ }
+ spin_unlock_irqrestore(&device->resource->req_lock, flags);
+
+ return rv;
+}
+
+/**
+ * drbd_req_state() - Perform an eventually cluster wide state change
+ * @device: DRBD device.
+ * @mask: mask of state bits to change.
+ * @val: value of new state bits.
+ * @f: flags
+ *
+ * Should not be called directly, use drbd_request_state() or
+ * _drbd_request_state().
+ */
+static enum drbd_state_rv
+drbd_req_state(struct drbd_device *device, union drbd_state mask,
+ union drbd_state val, enum chg_state_flags f)
+{
+ struct completion done;
+ unsigned long flags;
+ union drbd_state os, ns;
+ enum drbd_state_rv rv;
+
+ init_completion(&done);
+
+ if (f & CS_SERIALIZE)
+ mutex_lock(device->state_mutex);
+
+ spin_lock_irqsave(&device->resource->req_lock, flags);
+ os = drbd_read_state(device);
+ ns = sanitize_state(device, os, apply_mask_val(os, mask, val), NULL);
+ rv = is_valid_transition(os, ns);
+ if (rv < SS_SUCCESS) {
+ spin_unlock_irqrestore(&device->resource->req_lock, flags);
+ goto abort;
+ }
+
+ if (cl_wide_st_chg(device, os, ns)) {
+ rv = is_valid_state(device, ns);
+ if (rv == SS_SUCCESS)
+ rv = is_valid_soft_transition(os, ns, first_peer_device(device)->connection);
+ spin_unlock_irqrestore(&device->resource->req_lock, flags);
+
+ if (rv < SS_SUCCESS) {
+ if (f & CS_VERBOSE)
+ print_st_err(device, os, ns, rv);
+ goto abort;
+ }
+
+ if (drbd_send_state_req(first_peer_device(device), mask, val)) {
+ rv = SS_CW_FAILED_BY_PEER;
+ if (f & CS_VERBOSE)
+ print_st_err(device, os, ns, rv);
+ goto abort;
+ }
+
+ wait_event(device->state_wait,
+ (rv = _req_st_cond(device, mask, val)));
+
+ if (rv < SS_SUCCESS) {
+ if (f & CS_VERBOSE)
+ print_st_err(device, os, ns, rv);
+ goto abort;
+ }
+ spin_lock_irqsave(&device->resource->req_lock, flags);
+ ns = apply_mask_val(drbd_read_state(device), mask, val);
+ rv = _drbd_set_state(device, ns, f, &done);
+ } else {
+ rv = _drbd_set_state(device, ns, f, &done);
+ }
+
+ spin_unlock_irqrestore(&device->resource->req_lock, flags);
+
+ if (f & CS_WAIT_COMPLETE && rv == SS_SUCCESS) {
+ D_ASSERT(device, current != first_peer_device(device)->connection->worker.task);
+ wait_for_completion(&done);
+ }
+
+abort:
+ if (f & CS_SERIALIZE)
+ mutex_unlock(device->state_mutex);
+
+ return rv;
+}
+
+/**
+ * _drbd_request_state() - Request a state change (with flags)
+ * @device: DRBD device.
+ * @mask: mask of state bits to change.
+ * @val: value of new state bits.
+ * @f: flags
+ *
+ * Cousin of drbd_request_state(), useful with the CS_WAIT_COMPLETE
+ * flag, or when logging of failed state change requests is not desired.
+ */
+enum drbd_state_rv
+_drbd_request_state(struct drbd_device *device, union drbd_state mask,
+ union drbd_state val, enum chg_state_flags f)
+{
+ enum drbd_state_rv rv;
+
+ wait_event(device->state_wait,
+ (rv = drbd_req_state(device, mask, val, f)) != SS_IN_TRANSIENT_STATE);
+
+ return rv;
+}
+
+enum drbd_state_rv
+_drbd_request_state_holding_state_mutex(struct drbd_device *device, union drbd_state mask,
+ union drbd_state val, enum chg_state_flags f)
+{
+ enum drbd_state_rv rv;
+
+ BUG_ON(f & CS_SERIALIZE);
+
+ wait_event_cmd(device->state_wait,
+ (rv = drbd_req_state(device, mask, val, f)) != SS_IN_TRANSIENT_STATE,
+ mutex_unlock(device->state_mutex),
+ mutex_lock(device->state_mutex));
+
+ return rv;
+}
+
+static void print_st(struct drbd_device *device, const char *name, union drbd_state ns)
+{
+ drbd_err(device, " %s = { cs:%s ro:%s/%s ds:%s/%s %c%c%c%c%c%c }\n",
+ name,
+ drbd_conn_str(ns.conn),
+ drbd_role_str(ns.role),
+ drbd_role_str(ns.peer),
+ drbd_disk_str(ns.disk),
+ drbd_disk_str(ns.pdsk),
+ is_susp(ns) ? 's' : 'r',
+ ns.aftr_isp ? 'a' : '-',
+ ns.peer_isp ? 'p' : '-',
+ ns.user_isp ? 'u' : '-',
+ ns.susp_fen ? 'F' : '-',
+ ns.susp_nod ? 'N' : '-'
+ );
+}
+
+void print_st_err(struct drbd_device *device, union drbd_state os,
+ union drbd_state ns, enum drbd_state_rv err)
+{
+ if (err == SS_IN_TRANSIENT_STATE)
+ return;
+ drbd_err(device, "State change failed: %s\n", drbd_set_st_err_str(err));
+ print_st(device, " state", os);
+ print_st(device, "wanted", ns);
+}
+
+static long print_state_change(char *pb, union drbd_state os, union drbd_state ns,
+ enum chg_state_flags flags)
+{
+ char *pbp;
+ pbp = pb;
+ *pbp = 0;
+
+ if (ns.role != os.role && flags & CS_DC_ROLE)
+ pbp += sprintf(pbp, "role( %s -> %s ) ",
+ drbd_role_str(os.role),
+ drbd_role_str(ns.role));
+ if (ns.peer != os.peer && flags & CS_DC_PEER)
+ pbp += sprintf(pbp, "peer( %s -> %s ) ",
+ drbd_role_str(os.peer),
+ drbd_role_str(ns.peer));
+ if (ns.conn != os.conn && flags & CS_DC_CONN)
+ pbp += sprintf(pbp, "conn( %s -> %s ) ",
+ drbd_conn_str(os.conn),
+ drbd_conn_str(ns.conn));
+ if (ns.disk != os.disk && flags & CS_DC_DISK)
+ pbp += sprintf(pbp, "disk( %s -> %s ) ",
+ drbd_disk_str(os.disk),
+ drbd_disk_str(ns.disk));
+ if (ns.pdsk != os.pdsk && flags & CS_DC_PDSK)
+ pbp += sprintf(pbp, "pdsk( %s -> %s ) ",
+ drbd_disk_str(os.pdsk),
+ drbd_disk_str(ns.pdsk));
+
+ return pbp - pb;
+}
+
+static void drbd_pr_state_change(struct drbd_device *device, union drbd_state os, union drbd_state ns,
+ enum chg_state_flags flags)
+{
+ char pb[300];
+ char *pbp = pb;
+
+ pbp += print_state_change(pbp, os, ns, flags ^ CS_DC_MASK);
+
+ if (ns.aftr_isp != os.aftr_isp)
+ pbp += sprintf(pbp, "aftr_isp( %d -> %d ) ",
+ os.aftr_isp,
+ ns.aftr_isp);
+ if (ns.peer_isp != os.peer_isp)
+ pbp += sprintf(pbp, "peer_isp( %d -> %d ) ",
+ os.peer_isp,
+ ns.peer_isp);
+ if (ns.user_isp != os.user_isp)
+ pbp += sprintf(pbp, "user_isp( %d -> %d ) ",
+ os.user_isp,
+ ns.user_isp);
+
+ if (pbp != pb)
+ drbd_info(device, "%s\n", pb);
+}
+
+static void conn_pr_state_change(struct drbd_connection *connection, union drbd_state os, union drbd_state ns,
+ enum chg_state_flags flags)
+{
+ char pb[300];
+ char *pbp = pb;
+
+ pbp += print_state_change(pbp, os, ns, flags);
+
+ if (is_susp(ns) != is_susp(os) && flags & CS_DC_SUSP)
+ pbp += sprintf(pbp, "susp( %d -> %d ) ",
+ is_susp(os),
+ is_susp(ns));
+
+ if (pbp != pb)
+ drbd_info(connection, "%s\n", pb);
+}
+
+
+/**
+ * is_valid_state() - Returns an SS_ error code if ns is not valid
+ * @device: DRBD device.
+ * @ns: State to consider.
+ */
+static enum drbd_state_rv
+is_valid_state(struct drbd_device *device, union drbd_state ns)
+{
+ /* See drbd_state_sw_errors in drbd_strings.c */
+
+ enum drbd_fencing_p fp;
+ enum drbd_state_rv rv = SS_SUCCESS;
+ struct net_conf *nc;
+
+ rcu_read_lock();
+ fp = FP_DONT_CARE;
+ if (get_ldev(device)) {
+ fp = rcu_dereference(device->ldev->disk_conf)->fencing;
+ put_ldev(device);
+ }
+
+ nc = rcu_dereference(first_peer_device(device)->connection->net_conf);
+ if (nc) {
+ if (!nc->two_primaries && ns.role == R_PRIMARY) {
+ if (ns.peer == R_PRIMARY)
+ rv = SS_TWO_PRIMARIES;
+ else if (conn_highest_peer(first_peer_device(device)->connection) == R_PRIMARY)
+ rv = SS_O_VOL_PEER_PRI;
+ }
+ }
+
+ if (rv <= 0)
+ /* already found a reason to abort */;
+ else if (ns.role == R_SECONDARY && device->open_cnt)
+ rv = SS_DEVICE_IN_USE;
+
+ else if (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.disk < D_UP_TO_DATE)
+ rv = SS_NO_UP_TO_DATE_DISK;
+
+ else if (fp >= FP_RESOURCE &&
+ ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk >= D_UNKNOWN)
+ rv = SS_PRIMARY_NOP;
+
+ else if (ns.role == R_PRIMARY && ns.disk <= D_INCONSISTENT && ns.pdsk <= D_INCONSISTENT)
+ rv = SS_NO_UP_TO_DATE_DISK;
+
+ else if (ns.conn > C_CONNECTED && ns.disk < D_INCONSISTENT)
+ rv = SS_NO_LOCAL_DISK;
+
+ else if (ns.conn > C_CONNECTED && ns.pdsk < D_INCONSISTENT)
+ rv = SS_NO_REMOTE_DISK;
+
+ else if (ns.conn > C_CONNECTED && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE)
+ rv = SS_NO_UP_TO_DATE_DISK;
+
+ else if ((ns.conn == C_CONNECTED ||
+ ns.conn == C_WF_BITMAP_S ||
+ ns.conn == C_SYNC_SOURCE ||
+ ns.conn == C_PAUSED_SYNC_S) &&
+ ns.disk == D_OUTDATED)
+ rv = SS_CONNECTED_OUTDATES;
+
+ else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
+ (nc->verify_alg[0] == 0))
+ rv = SS_NO_VERIFY_ALG;
+
+ else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
+ first_peer_device(device)->connection->agreed_pro_version < 88)
+ rv = SS_NOT_SUPPORTED;
+
+ else if (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE)
+ rv = SS_NO_UP_TO_DATE_DISK;
+
+ else if ((ns.conn == C_STARTING_SYNC_S || ns.conn == C_STARTING_SYNC_T) &&
+ ns.pdsk == D_UNKNOWN)
+ rv = SS_NEED_CONNECTION;
+
+ else if (ns.conn >= C_CONNECTED && ns.pdsk == D_UNKNOWN)
+ rv = SS_CONNECTED_OUTDATES;
+
+ rcu_read_unlock();
+
+ return rv;
+}
+
+/**
+ * is_valid_soft_transition() - Returns an SS_ error code if the state transition is not possible
+ * This function limits state transitions that may be declined by DRBD. I.e.
+ * user requests (aka soft transitions).
+ * @device: DRBD device.
+ * @ns: new state.
+ * @os: old state.
+ */
+static enum drbd_state_rv
+is_valid_soft_transition(union drbd_state os, union drbd_state ns, struct drbd_connection *connection)
+{
+ enum drbd_state_rv rv = SS_SUCCESS;
+
+ if ((ns.conn == C_STARTING_SYNC_T || ns.conn == C_STARTING_SYNC_S) &&
+ os.conn > C_CONNECTED)
+ rv = SS_RESYNC_RUNNING;
+
+ if (ns.conn == C_DISCONNECTING && os.conn == C_STANDALONE)
+ rv = SS_ALREADY_STANDALONE;
+
+ if (ns.disk > D_ATTACHING && os.disk == D_DISKLESS)
+ rv = SS_IS_DISKLESS;
+
+ if (ns.conn == C_WF_CONNECTION && os.conn < C_UNCONNECTED)
+ rv = SS_NO_NET_CONFIG;
+
+ if (ns.disk == D_OUTDATED && os.disk < D_OUTDATED && os.disk != D_ATTACHING)
+ rv = SS_LOWER_THAN_OUTDATED;
+
+ if (ns.conn == C_DISCONNECTING && os.conn == C_UNCONNECTED)
+ rv = SS_IN_TRANSIENT_STATE;
+
+ /* While establishing a connection only allow cstate to change.
+ Delay/refuse role changes, detach attach etc... (they do not touch cstate) */
+ if (test_bit(STATE_SENT, &connection->flags) &&
+ !((ns.conn == C_WF_REPORT_PARAMS && os.conn == C_WF_CONNECTION) ||
+ (ns.conn >= C_CONNECTED && os.conn == C_WF_REPORT_PARAMS)))
+ rv = SS_IN_TRANSIENT_STATE;
+
+ if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED)
+ rv = SS_NEED_CONNECTION;
+
+ if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
+ ns.conn != os.conn && os.conn > C_CONNECTED)
+ rv = SS_RESYNC_RUNNING;
+
+ if ((ns.conn == C_STARTING_SYNC_S || ns.conn == C_STARTING_SYNC_T) &&
+ os.conn < C_CONNECTED)
+ rv = SS_NEED_CONNECTION;
+
+ if ((ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE)
+ && os.conn < C_WF_REPORT_PARAMS)
+ rv = SS_NEED_CONNECTION; /* No NetworkFailure -> SyncTarget etc... */
+
+ if (ns.conn == C_DISCONNECTING && ns.pdsk == D_OUTDATED &&
+ os.conn < C_CONNECTED && os.pdsk > D_OUTDATED)
+ rv = SS_OUTDATE_WO_CONN;
+
+ return rv;
+}
+
+static enum drbd_state_rv
+is_valid_conn_transition(enum drbd_conns oc, enum drbd_conns nc)
+{
+ /* no change -> nothing to do, at least for the connection part */
+ if (oc == nc)
+ return SS_NOTHING_TO_DO;
+
+ /* disconnect of an unconfigured connection does not make sense */
+ if (oc == C_STANDALONE && nc == C_DISCONNECTING)
+ return SS_ALREADY_STANDALONE;
+
+ /* from C_STANDALONE, we start with C_UNCONNECTED */
+ if (oc == C_STANDALONE && nc != C_UNCONNECTED)
+ return SS_NEED_CONNECTION;
+
+ /* When establishing a connection we need to go through WF_REPORT_PARAMS!
+ Necessary to do the right thing upon invalidate-remote on a disconnected resource */
+ if (oc < C_WF_REPORT_PARAMS && nc >= C_CONNECTED)
+ return SS_NEED_CONNECTION;
+
+ /* After a network error only C_UNCONNECTED or C_DISCONNECTING may follow. */
+ if (oc >= C_TIMEOUT && oc <= C_TEAR_DOWN && nc != C_UNCONNECTED && nc != C_DISCONNECTING)
+ return SS_IN_TRANSIENT_STATE;
+
+ /* After C_DISCONNECTING only C_STANDALONE may follow */
+ if (oc == C_DISCONNECTING && nc != C_STANDALONE)
+ return SS_IN_TRANSIENT_STATE;
+
+ return SS_SUCCESS;
+}
+
+
+/**
+ * is_valid_transition() - Returns an SS_ error code if the state transition is not possible
+ * This limits hard state transitions. Hard state transitions are facts there are
+ * imposed on DRBD by the environment. E.g. disk broke or network broke down.
+ * But those hard state transitions are still not allowed to do everything.
+ * @ns: new state.
+ * @os: old state.
+ */
+static enum drbd_state_rv
+is_valid_transition(union drbd_state os, union drbd_state ns)
+{
+ enum drbd_state_rv rv;
+
+ rv = is_valid_conn_transition(os.conn, ns.conn);
+
+ /* we cannot fail (again) if we already detached */
+ if (ns.disk == D_FAILED && os.disk == D_DISKLESS)
+ rv = SS_IS_DISKLESS;
+
+ return rv;
+}
+
+static void print_sanitize_warnings(struct drbd_device *device, enum sanitize_state_warnings warn)
+{
+ static const char *msg_table[] = {
+ [NO_WARNING] = "",
+ [ABORTED_ONLINE_VERIFY] = "Online-verify aborted.",
+ [ABORTED_RESYNC] = "Resync aborted.",
+ [CONNECTION_LOST_NEGOTIATING] = "Connection lost while negotiating, no data!",
+ [IMPLICITLY_UPGRADED_DISK] = "Implicitly upgraded disk",
+ [IMPLICITLY_UPGRADED_PDSK] = "Implicitly upgraded pdsk",
+ };
+
+ if (warn != NO_WARNING)
+ drbd_warn(device, "%s\n", msg_table[warn]);
+}
+
+/**
+ * sanitize_state() - Resolves implicitly necessary additional changes to a state transition
+ * @device: DRBD device.
+ * @os: old state.
+ * @ns: new state.
+ * @warn_sync_abort:
+ *
+ * When we loose connection, we have to set the state of the peers disk (pdsk)
+ * to D_UNKNOWN. This rule and many more along those lines are in this function.
+ */
+static union drbd_state sanitize_state(struct drbd_device *device, union drbd_state os,
+ union drbd_state ns, enum sanitize_state_warnings *warn)
+{
+ enum drbd_fencing_p fp;
+ enum drbd_disk_state disk_min, disk_max, pdsk_min, pdsk_max;
+
+ if (warn)
+ *warn = NO_WARNING;
+
+ fp = FP_DONT_CARE;
+ if (get_ldev(device)) {
+ rcu_read_lock();
+ fp = rcu_dereference(device->ldev->disk_conf)->fencing;
+ rcu_read_unlock();
+ put_ldev(device);
+ }
+
+ /* Implications from connection to peer and peer_isp */
+ if (ns.conn < C_CONNECTED) {
+ ns.peer_isp = 0;
+ ns.peer = R_UNKNOWN;
+ if (ns.pdsk > D_UNKNOWN || ns.pdsk < D_INCONSISTENT)
+ ns.pdsk = D_UNKNOWN;
+ }
+
+ /* Clear the aftr_isp when becoming unconfigured */
+ if (ns.conn == C_STANDALONE && ns.disk == D_DISKLESS && ns.role == R_SECONDARY)
+ ns.aftr_isp = 0;
+
+ /* An implication of the disk states onto the connection state */
+ /* Abort resync if a disk fails/detaches */
+ if (ns.conn > C_CONNECTED && (ns.disk <= D_FAILED || ns.pdsk <= D_FAILED)) {
+ if (warn)
+ *warn = ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T ?
+ ABORTED_ONLINE_VERIFY : ABORTED_RESYNC;
+ ns.conn = C_CONNECTED;
+ }
+
+ /* Connection breaks down before we finished "Negotiating" */
+ if (ns.conn < C_CONNECTED && ns.disk == D_NEGOTIATING &&
+ get_ldev_if_state(device, D_NEGOTIATING)) {
+ if (device->ed_uuid == device->ldev->md.uuid[UI_CURRENT]) {
+ ns.disk = device->new_state_tmp.disk;
+ ns.pdsk = device->new_state_tmp.pdsk;
+ } else {
+ if (warn)
+ *warn = CONNECTION_LOST_NEGOTIATING;
+ ns.disk = D_DISKLESS;
+ ns.pdsk = D_UNKNOWN;
+ }
+ put_ldev(device);
+ }
+
+ /* D_CONSISTENT and D_OUTDATED vanish when we get connected */
+ if (ns.conn >= C_CONNECTED && ns.conn < C_AHEAD) {
+ if (ns.disk == D_CONSISTENT || ns.disk == D_OUTDATED)
+ ns.disk = D_UP_TO_DATE;
+ if (ns.pdsk == D_CONSISTENT || ns.pdsk == D_OUTDATED)
+ ns.pdsk = D_UP_TO_DATE;
+ }
+
+ /* Implications of the connection stat on the disk states */
+ disk_min = D_DISKLESS;
+ disk_max = D_UP_TO_DATE;
+ pdsk_min = D_INCONSISTENT;
+ pdsk_max = D_UNKNOWN;
+ switch ((enum drbd_conns)ns.conn) {
+ case C_WF_BITMAP_T:
+ case C_PAUSED_SYNC_T:
+ case C_STARTING_SYNC_T:
+ case C_WF_SYNC_UUID:
+ case C_BEHIND:
+ disk_min = D_INCONSISTENT;
+ disk_max = D_OUTDATED;
+ pdsk_min = D_UP_TO_DATE;
+ pdsk_max = D_UP_TO_DATE;
+ break;
+ case C_VERIFY_S:
+ case C_VERIFY_T:
+ disk_min = D_UP_TO_DATE;
+ disk_max = D_UP_TO_DATE;
+ pdsk_min = D_UP_TO_DATE;
+ pdsk_max = D_UP_TO_DATE;
+ break;
+ case C_CONNECTED:
+ disk_min = D_DISKLESS;
+ disk_max = D_UP_TO_DATE;
+ pdsk_min = D_DISKLESS;
+ pdsk_max = D_UP_TO_DATE;
+ break;
+ case C_WF_BITMAP_S:
+ case C_PAUSED_SYNC_S:
+ case C_STARTING_SYNC_S:
+ case C_AHEAD:
+ disk_min = D_UP_TO_DATE;
+ disk_max = D_UP_TO_DATE;
+ pdsk_min = D_INCONSISTENT;
+ pdsk_max = D_CONSISTENT; /* D_OUTDATED would be nice. But explicit outdate necessary*/
+ break;
+ case C_SYNC_TARGET:
+ disk_min = D_INCONSISTENT;
+ disk_max = D_INCONSISTENT;
+ pdsk_min = D_UP_TO_DATE;
+ pdsk_max = D_UP_TO_DATE;
+ break;
+ case C_SYNC_SOURCE:
+ disk_min = D_UP_TO_DATE;
+ disk_max = D_UP_TO_DATE;
+ pdsk_min = D_INCONSISTENT;
+ pdsk_max = D_INCONSISTENT;
+ break;
+ case C_STANDALONE:
+ case C_DISCONNECTING:
+ case C_UNCONNECTED:
+ case C_TIMEOUT:
+ case C_BROKEN_PIPE:
+ case C_NETWORK_FAILURE:
+ case C_PROTOCOL_ERROR:
+ case C_TEAR_DOWN:
+ case C_WF_CONNECTION:
+ case C_WF_REPORT_PARAMS:
+ case C_MASK:
+ break;
+ }
+ if (ns.disk > disk_max)
+ ns.disk = disk_max;
+
+ if (ns.disk < disk_min) {
+ if (warn)
+ *warn = IMPLICITLY_UPGRADED_DISK;
+ ns.disk = disk_min;
+ }
+ if (ns.pdsk > pdsk_max)
+ ns.pdsk = pdsk_max;
+
+ if (ns.pdsk < pdsk_min) {
+ if (warn)
+ *warn = IMPLICITLY_UPGRADED_PDSK;
+ ns.pdsk = pdsk_min;
+ }
+
+ if (fp == FP_STONITH &&
+ (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk > D_OUTDATED) &&
+ !(os.role == R_PRIMARY && os.conn < C_CONNECTED && os.pdsk > D_OUTDATED))
+ ns.susp_fen = 1; /* Suspend IO while fence-peer handler runs (peer lost) */
+
+ if (device->resource->res_opts.on_no_data == OND_SUSPEND_IO &&
+ (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE) &&
+ !(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE))
+ ns.susp_nod = 1; /* Suspend IO while no data available (no accessible data available) */
+
+ if (ns.aftr_isp || ns.peer_isp || ns.user_isp) {
+ if (ns.conn == C_SYNC_SOURCE)
+ ns.conn = C_PAUSED_SYNC_S;
+ if (ns.conn == C_SYNC_TARGET)
+ ns.conn = C_PAUSED_SYNC_T;
+ } else {
+ if (ns.conn == C_PAUSED_SYNC_S)
+ ns.conn = C_SYNC_SOURCE;
+ if (ns.conn == C_PAUSED_SYNC_T)
+ ns.conn = C_SYNC_TARGET;
+ }
+
+ return ns;
+}
+
+void drbd_resume_al(struct drbd_device *device)
+{
+ if (test_and_clear_bit(AL_SUSPENDED, &device->flags))
+ drbd_info(device, "Resumed AL updates\n");
+}
+
+/* helper for __drbd_set_state */
+static void set_ov_position(struct drbd_device *device, enum drbd_conns cs)
+{
+ if (first_peer_device(device)->connection->agreed_pro_version < 90)
+ device->ov_start_sector = 0;
+ device->rs_total = drbd_bm_bits(device);
+ device->ov_position = 0;
+ if (cs == C_VERIFY_T) {
+ /* starting online verify from an arbitrary position
+ * does not fit well into the existing protocol.
+ * on C_VERIFY_T, we initialize ov_left and friends
+ * implicitly in receive_DataRequest once the
+ * first P_OV_REQUEST is received */
+ device->ov_start_sector = ~(sector_t)0;
+ } else {
+ unsigned long bit = BM_SECT_TO_BIT(device->ov_start_sector);
+ if (bit >= device->rs_total) {
+ device->ov_start_sector =
+ BM_BIT_TO_SECT(device->rs_total - 1);
+ device->rs_total = 1;
+ } else
+ device->rs_total -= bit;
+ device->ov_position = device->ov_start_sector;
+ }
+ device->ov_left = device->rs_total;
+}
+
+/**
+ * __drbd_set_state() - Set a new DRBD state
+ * @device: DRBD device.
+ * @ns: new state.
+ * @flags: Flags
+ * @done: Optional completion, that will get completed after the after_state_ch() finished
+ *
+ * Caller needs to hold req_lock, and global_state_lock. Do not call directly.
+ */
+enum drbd_state_rv
+__drbd_set_state(struct drbd_device *device, union drbd_state ns,
+ enum chg_state_flags flags, struct completion *done)
+{
+ struct drbd_peer_device *peer_device = first_peer_device(device);
+ struct drbd_connection *connection = peer_device ? peer_device->connection : NULL;
+ union drbd_state os;
+ enum drbd_state_rv rv = SS_SUCCESS;
+ enum sanitize_state_warnings ssw;
+ struct after_state_chg_work *ascw;
+
+ os = drbd_read_state(device);
+
+ ns = sanitize_state(device, os, ns, &ssw);
+ if (ns.i == os.i)
+ return SS_NOTHING_TO_DO;
+
+ rv = is_valid_transition(os, ns);
+ if (rv < SS_SUCCESS)
+ return rv;
+
+ if (!(flags & CS_HARD)) {
+ /* pre-state-change checks ; only look at ns */
+ /* See drbd_state_sw_errors in drbd_strings.c */
+
+ rv = is_valid_state(device, ns);
+ if (rv < SS_SUCCESS) {
+ /* If the old state was illegal as well, then let
+ this happen...*/
+
+ if (is_valid_state(device, os) == rv)
+ rv = is_valid_soft_transition(os, ns, connection);
+ } else
+ rv = is_valid_soft_transition(os, ns, connection);
+ }
+
+ if (rv < SS_SUCCESS) {
+ if (flags & CS_VERBOSE)
+ print_st_err(device, os, ns, rv);
+ return rv;
+ }
+
+ print_sanitize_warnings(device, ssw);
+
+ drbd_pr_state_change(device, os, ns, flags);
+
+ /* Display changes to the susp* flags that where caused by the call to
+ sanitize_state(). Only display it here if we where not called from
+ _conn_request_state() */
+ if (!(flags & CS_DC_SUSP))
+ conn_pr_state_change(connection, os, ns,
+ (flags & ~CS_DC_MASK) | CS_DC_SUSP);
+
+ /* if we are going -> D_FAILED or D_DISKLESS, grab one extra reference
+ * on the ldev here, to be sure the transition -> D_DISKLESS resp.
+ * drbd_ldev_destroy() won't happen before our corresponding
+ * after_state_ch works run, where we put_ldev again. */
+ if ((os.disk != D_FAILED && ns.disk == D_FAILED) ||
+ (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))
+ atomic_inc(&device->local_cnt);
+
+ if (!is_sync_state(os.conn) && is_sync_state(ns.conn))
+ clear_bit(RS_DONE, &device->flags);
+
+ /* changes to local_cnt and device flags should be visible before
+ * changes to state, which again should be visible before anything else
+ * depending on that change happens. */
+ smp_wmb();
+ device->state.i = ns.i;
+ device->resource->susp = ns.susp;
+ device->resource->susp_nod = ns.susp_nod;
+ device->resource->susp_fen = ns.susp_fen;
+ smp_wmb();
+
+ /* put replicated vs not-replicated requests in seperate epochs */
+ if (drbd_should_do_remote((union drbd_dev_state)os.i) !=
+ drbd_should_do_remote((union drbd_dev_state)ns.i))
+ start_new_tl_epoch(connection);
+
+ if (os.disk == D_ATTACHING && ns.disk >= D_NEGOTIATING)
+ drbd_print_uuids(device, "attached to UUIDs");
+
+ /* Wake up role changes, that were delayed because of connection establishing */
+ if (os.conn == C_WF_REPORT_PARAMS && ns.conn != C_WF_REPORT_PARAMS &&
+ no_peer_wf_report_params(connection)) {
+ clear_bit(STATE_SENT, &connection->flags);
+ wake_up_all_devices(connection);
+ }
+
+ wake_up(&device->misc_wait);
+ wake_up(&device->state_wait);
+ wake_up(&connection->ping_wait);
+
+ /* Aborted verify run, or we reached the stop sector.
+ * Log the last position, unless end-of-device. */
+ if ((os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) &&
+ ns.conn <= C_CONNECTED) {
+ device->ov_start_sector =
+ BM_BIT_TO_SECT(drbd_bm_bits(device) - device->ov_left);
+ if (device->ov_left)
+ drbd_info(device, "Online Verify reached sector %llu\n",
+ (unsigned long long)device->ov_start_sector);
+ }
+
+ if ((os.conn == C_PAUSED_SYNC_T || os.conn == C_PAUSED_SYNC_S) &&
+ (ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE)) {
+ drbd_info(device, "Syncer continues.\n");
+ device->rs_paused += (long)jiffies
+ -(long)device->rs_mark_time[device->rs_last_mark];
+ if (ns.conn == C_SYNC_TARGET)
+ mod_timer(&device->resync_timer, jiffies);
+ }
+
+ if ((os.conn == C_SYNC_TARGET || os.conn == C_SYNC_SOURCE) &&
+ (ns.conn == C_PAUSED_SYNC_T || ns.conn == C_PAUSED_SYNC_S)) {
+ drbd_info(device, "Resync suspended\n");
+ device->rs_mark_time[device->rs_last_mark] = jiffies;
+ }
+
+ if (os.conn == C_CONNECTED &&
+ (ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T)) {
+ unsigned long now = jiffies;
+ int i;
+
+ set_ov_position(device, ns.conn);
+ device->rs_start = now;
+ device->rs_last_sect_ev = 0;
+ device->ov_last_oos_size = 0;
+ device->ov_last_oos_start = 0;
+
+ for (i = 0; i < DRBD_SYNC_MARKS; i++) {
+ device->rs_mark_left[i] = device->ov_left;
+ device->rs_mark_time[i] = now;
+ }
+
+ drbd_rs_controller_reset(device);
+
+ if (ns.conn == C_VERIFY_S) {
+ drbd_info(device, "Starting Online Verify from sector %llu\n",
+ (unsigned long long)device->ov_position);
+ mod_timer(&device->resync_timer, jiffies);
+ }
+ }
+
+ if (get_ldev(device)) {
+ u32 mdf = device->ldev->md.flags & ~(MDF_CONSISTENT|MDF_PRIMARY_IND|
+ MDF_CONNECTED_IND|MDF_WAS_UP_TO_DATE|
+ MDF_PEER_OUT_DATED|MDF_CRASHED_PRIMARY);
+
+ mdf &= ~MDF_AL_CLEAN;
+ if (test_bit(CRASHED_PRIMARY, &device->flags))
+ mdf |= MDF_CRASHED_PRIMARY;
+ if (device->state.role == R_PRIMARY ||
+ (device->state.pdsk < D_INCONSISTENT && device->state.peer == R_PRIMARY))
+ mdf |= MDF_PRIMARY_IND;
+ if (device->state.conn > C_WF_REPORT_PARAMS)
+ mdf |= MDF_CONNECTED_IND;
+ if (device->state.disk > D_INCONSISTENT)
+ mdf |= MDF_CONSISTENT;
+ if (device->state.disk > D_OUTDATED)
+ mdf |= MDF_WAS_UP_TO_DATE;
+ if (device->state.pdsk <= D_OUTDATED && device->state.pdsk >= D_INCONSISTENT)
+ mdf |= MDF_PEER_OUT_DATED;
+ if (mdf != device->ldev->md.flags) {
+ device->ldev->md.flags = mdf;
+ drbd_md_mark_dirty(device);
+ }
+ if (os.disk < D_CONSISTENT && ns.disk >= D_CONSISTENT)
+ drbd_set_ed_uuid(device, device->ldev->md.uuid[UI_CURRENT]);
+ put_ldev(device);
+ }
+
+ /* Peer was forced D_UP_TO_DATE & R_PRIMARY, consider to resync */
+ if (os.disk == D_INCONSISTENT && os.pdsk == D_INCONSISTENT &&
+ os.peer == R_SECONDARY && ns.peer == R_PRIMARY)
+ set_bit(CONSIDER_RESYNC, &device->flags);
+
+ /* Receiver should clean up itself */
+ if (os.conn != C_DISCONNECTING && ns.conn == C_DISCONNECTING)
+ drbd_thread_stop_nowait(&connection->receiver);
+
+ /* Now the receiver finished cleaning up itself, it should die */
+ if (os.conn != C_STANDALONE && ns.conn == C_STANDALONE)
+ drbd_thread_stop_nowait(&connection->receiver);
+
+ /* Upon network failure, we need to restart the receiver. */
+ if (os.conn > C_WF_CONNECTION &&
+ ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT)
+ drbd_thread_restart_nowait(&connection->receiver);
+
+ /* Resume AL writing if we get a connection */
+ if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) {
+ drbd_resume_al(device);
+ connection->connect_cnt++;
+ }
+
+ /* remember last attach time so request_timer_fn() won't
+ * kill newly established sessions while we are still trying to thaw
+ * previously frozen IO */
+ if ((os.disk == D_ATTACHING || os.disk == D_NEGOTIATING) &&
+ ns.disk > D_NEGOTIATING)
+ device->last_reattach_jif = jiffies;
+
+ ascw = kmalloc(sizeof(*ascw), GFP_ATOMIC);
+ if (ascw) {
+ ascw->os = os;
+ ascw->ns = ns;
+ ascw->flags = flags;
+ ascw->w.cb = w_after_state_ch;
+ ascw->device = device;
+ ascw->done = done;
+ drbd_queue_work(&connection->sender_work,
+ &ascw->w);
+ } else {
+ drbd_err(device, "Could not kmalloc an ascw\n");
+ }
+
+ return rv;
+}
+
+static int w_after_state_ch(struct drbd_work *w, int unused)
+{
+ struct after_state_chg_work *ascw =
+ container_of(w, struct after_state_chg_work, w);
+ struct drbd_device *device = ascw->device;
+
+ after_state_ch(device, ascw->os, ascw->ns, ascw->flags);
+ if (ascw->flags & CS_WAIT_COMPLETE)
+ complete(ascw->done);
+ kfree(ascw);
+
+ return 0;
+}
+
+static void abw_start_sync(struct drbd_device *device, int rv)
+{
+ if (rv) {
+ drbd_err(device, "Writing the bitmap failed not starting resync.\n");
+ _drbd_request_state(device, NS(conn, C_CONNECTED), CS_VERBOSE);
+ return;
+ }
+
+ switch (device->state.conn) {
+ case C_STARTING_SYNC_T:
+ _drbd_request_state(device, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE);
+ break;
+ case C_STARTING_SYNC_S:
+ drbd_start_resync(device, C_SYNC_SOURCE);
+ break;
+ }
+}
+
+int drbd_bitmap_io_from_worker(struct drbd_device *device,
+ int (*io_fn)(struct drbd_device *),
+ char *why, enum bm_flag flags)
+{
+ int rv;
+
+ D_ASSERT(device, current == first_peer_device(device)->connection->worker.task);
+
+ /* open coded non-blocking drbd_suspend_io(device); */
+ set_bit(SUSPEND_IO, &device->flags);
+
+ drbd_bm_lock(device, why, flags);
+ rv = io_fn(device);
+ drbd_bm_unlock(device);
+
+ drbd_resume_io(device);
+
+ return rv;
+}
+
+/**
+ * after_state_ch() - Perform after state change actions that may sleep
+ * @device: DRBD device.
+ * @os: old state.
+ * @ns: new state.
+ * @flags: Flags
+ */
+static void after_state_ch(struct drbd_device *device, union drbd_state os,
+ union drbd_state ns, enum chg_state_flags flags)
+{
+ struct drbd_resource *resource = device->resource;
+ struct drbd_peer_device *peer_device = first_peer_device(device);
+ struct drbd_connection *connection = peer_device ? peer_device->connection : NULL;
+ struct sib_info sib;
+
+ sib.sib_reason = SIB_STATE_CHANGE;
+ sib.os = os;
+ sib.ns = ns;
+
+ if ((os.disk != D_UP_TO_DATE || os.pdsk != D_UP_TO_DATE)
+ && (ns.disk == D_UP_TO_DATE && ns.pdsk == D_UP_TO_DATE)) {
+ clear_bit(CRASHED_PRIMARY, &device->flags);
+ if (device->p_uuid)
+ device->p_uuid[UI_FLAGS] &= ~((u64)2);
+ }
+
+ /* Inform userspace about the change... */
+ drbd_bcast_event(device, &sib);
+
+ if (!(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE) &&
+ (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE))
+ drbd_khelper(device, "pri-on-incon-degr");
+
+ /* Here we have the actions that are performed after a
+ state change. This function might sleep */
+
+ if (ns.susp_nod) {
+ enum drbd_req_event what = NOTHING;
+
+ spin_lock_irq(&device->resource->req_lock);
+ if (os.conn < C_CONNECTED && conn_lowest_conn(connection) >= C_CONNECTED)
+ what = RESEND;
+
+ if ((os.disk == D_ATTACHING || os.disk == D_NEGOTIATING) &&
+ conn_lowest_disk(connection) > D_NEGOTIATING)
+ what = RESTART_FROZEN_DISK_IO;
+
+ if (resource->susp_nod && what != NOTHING) {
+ _tl_restart(connection, what);
+ _conn_request_state(connection,
+ (union drbd_state) { { .susp_nod = 1 } },
+ (union drbd_state) { { .susp_nod = 0 } },
+ CS_VERBOSE);
+ }
+ spin_unlock_irq(&device->resource->req_lock);
+ }
+
+ if (ns.susp_fen) {
+ spin_lock_irq(&device->resource->req_lock);
+ if (resource->susp_fen && conn_lowest_conn(connection) >= C_CONNECTED) {
+ /* case2: The connection was established again: */
+ struct drbd_peer_device *peer_device;
+ int vnr;
+
+ rcu_read_lock();
+ idr_for_each_entry(&connection->peer_devices, peer_device, vnr)
+ clear_bit(NEW_CUR_UUID, &peer_device->device->flags);
+ rcu_read_unlock();
+ _tl_restart(connection, RESEND);
+ _conn_request_state(connection,
+ (union drbd_state) { { .susp_fen = 1 } },
+ (union drbd_state) { { .susp_fen = 0 } },
+ CS_VERBOSE);
+ }
+ spin_unlock_irq(&device->resource->req_lock);
+ }
+
+ /* Became sync source. With protocol >= 96, we still need to send out
+ * the sync uuid now. Need to do that before any drbd_send_state, or
+ * the other side may go "paused sync" before receiving the sync uuids,
+ * which is unexpected. */
+ if ((os.conn != C_SYNC_SOURCE && os.conn != C_PAUSED_SYNC_S) &&
+ (ns.conn == C_SYNC_SOURCE || ns.conn == C_PAUSED_SYNC_S) &&
+ connection->agreed_pro_version >= 96 && get_ldev(device)) {
+ drbd_gen_and_send_sync_uuid(peer_device);
+ put_ldev(device);
+ }
+
+ /* Do not change the order of the if above and the two below... */
+ if (os.pdsk == D_DISKLESS &&
+ ns.pdsk > D_DISKLESS && ns.pdsk != D_UNKNOWN) { /* attach on the peer */
+ /* we probably will start a resync soon.
+ * make sure those things are properly reset. */
+ device->rs_total = 0;
+ device->rs_failed = 0;
+ atomic_set(&device->rs_pending_cnt, 0);
+ drbd_rs_cancel_all(device);
+
+ drbd_send_uuids(peer_device);
+ drbd_send_state(peer_device, ns);
+ }
+ /* No point in queuing send_bitmap if we don't have a connection
+ * anymore, so check also the _current_ state, not only the new state
+ * at the time this work was queued. */
+ if (os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S &&
+ device->state.conn == C_WF_BITMAP_S)
+ drbd_queue_bitmap_io(device, &drbd_send_bitmap, NULL,
+ "send_bitmap (WFBitMapS)",
+ BM_LOCKED_TEST_ALLOWED);
+
+ /* Lost contact to peer's copy of the data */
+ if ((os.pdsk >= D_INCONSISTENT &&
+ os.pdsk != D_UNKNOWN &&
+ os.pdsk != D_OUTDATED)
+ && (ns.pdsk < D_INCONSISTENT ||
+ ns.pdsk == D_UNKNOWN ||
+ ns.pdsk == D_OUTDATED)) {
+ if (get_ldev(device)) {
+ if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) &&
+ device->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
+ if (drbd_suspended(device)) {
+ set_bit(NEW_CUR_UUID, &device->flags);
+ } else {
+ drbd_uuid_new_current(device);
+ drbd_send_uuids(peer_device);
+ }
+ }
+ put_ldev(device);
+ }
+ }
+
+ if (ns.pdsk < D_INCONSISTENT && get_ldev(device)) {
+ if (os.peer == R_SECONDARY && ns.peer == R_PRIMARY &&
+ device->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
+ drbd_uuid_new_current(device);
+ drbd_send_uuids(peer_device);
+ }
+ /* D_DISKLESS Peer becomes secondary */
+ if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY)
+ /* We may still be Primary ourselves.
+ * No harm done if the bitmap still changes,
+ * redirtied pages will follow later. */
+ drbd_bitmap_io_from_worker(device, &drbd_bm_write,
+ "demote diskless peer", BM_LOCKED_SET_ALLOWED);
+ put_ldev(device);
+ }
+
+ /* Write out all changed bits on demote.
+ * Though, no need to da that just yet
+ * if there is a resync going on still */
+ if (os.role == R_PRIMARY && ns.role == R_SECONDARY &&
+ device->state.conn <= C_CONNECTED && get_ldev(device)) {
+ /* No changes to the bitmap expected this time, so assert that,
+ * even though no harm was done if it did change. */
+ drbd_bitmap_io_from_worker(device, &drbd_bm_write,
+ "demote", BM_LOCKED_TEST_ALLOWED);
+ put_ldev(device);
+ }
+
+ /* Last part of the attaching process ... */
+ if (ns.conn >= C_CONNECTED &&
+ os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) {
+ drbd_send_sizes(peer_device, 0, 0); /* to start sync... */
+ drbd_send_uuids(peer_device);
+ drbd_send_state(peer_device, ns);
+ }
+
+ /* We want to pause/continue resync, tell peer. */
+ if (ns.conn >= C_CONNECTED &&
+ ((os.aftr_isp != ns.aftr_isp) ||
+ (os.user_isp != ns.user_isp)))
+ drbd_send_state(peer_device, ns);
+
+ /* In case one of the isp bits got set, suspend other devices. */
+ if ((!os.aftr_isp && !os.peer_isp && !os.user_isp) &&
+ (ns.aftr_isp || ns.peer_isp || ns.user_isp))
+ suspend_other_sg(device);
+
+ /* Make sure the peer gets informed about eventual state
+ changes (ISP bits) while we were in WFReportParams. */
+ if (os.conn == C_WF_REPORT_PARAMS && ns.conn >= C_CONNECTED)
+ drbd_send_state(peer_device, ns);
+
+ if (os.conn != C_AHEAD && ns.conn == C_AHEAD)
+ drbd_send_state(peer_device, ns);
+
+ /* We are in the progress to start a full sync... */
+ if ((os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
+ (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S))
+ /* no other bitmap changes expected during this phase */
+ drbd_queue_bitmap_io(device,
+ &drbd_bmio_set_n_write, &abw_start_sync,
+ "set_n_write from StartingSync", BM_LOCKED_TEST_ALLOWED);
+
+ /* first half of local IO error, failure to attach,
+ * or administrative detach */
+ if (os.disk != D_FAILED && ns.disk == D_FAILED) {
+ enum drbd_io_error_p eh = EP_PASS_ON;
+ int was_io_error = 0;
+ /* corresponding get_ldev was in __drbd_set_state, to serialize
+ * our cleanup here with the transition to D_DISKLESS.
+ * But is is still not save to dreference ldev here, since
+ * we might come from an failed Attach before ldev was set. */
+ if (device->ldev) {
+ rcu_read_lock();
+ eh = rcu_dereference(device->ldev->disk_conf)->on_io_error;
+ rcu_read_unlock();
+
+ was_io_error = test_and_clear_bit(WAS_IO_ERROR, &device->flags);
+
+ if (was_io_error && eh == EP_CALL_HELPER)
+ drbd_khelper(device, "local-io-error");
+
+ /* Immediately allow completion of all application IO,
+ * that waits for completion from the local disk,
+ * if this was a force-detach due to disk_timeout
+ * or administrator request (drbdsetup detach --force).
+ * Do NOT abort otherwise.
+ * Aborting local requests may cause serious problems,
+ * if requests are completed to upper layers already,
+ * and then later the already submitted local bio completes.
+ * This can cause DMA into former bio pages that meanwhile
+ * have been re-used for other things.
+ * So aborting local requests may cause crashes,
+ * or even worse, silent data corruption.
+ */
+ if (test_and_clear_bit(FORCE_DETACH, &device->flags))
+ tl_abort_disk_io(device);
+
+ /* current state still has to be D_FAILED,
+ * there is only one way out: to D_DISKLESS,
+ * and that may only happen after our put_ldev below. */
+ if (device->state.disk != D_FAILED)
+ drbd_err(device,
+ "ASSERT FAILED: disk is %s during detach\n",
+ drbd_disk_str(device->state.disk));
+
+ if (ns.conn >= C_CONNECTED)
+ drbd_send_state(peer_device, ns);
+
+ drbd_rs_cancel_all(device);
+
+ /* In case we want to get something to stable storage still,
+ * this may be the last chance.
+ * Following put_ldev may transition to D_DISKLESS. */
+ drbd_md_sync(device);
+ }
+ put_ldev(device);
+ }
+
+ /* second half of local IO error, failure to attach,
+ * or administrative detach,
+ * after local_cnt references have reached zero again */
+ if (os.disk != D_DISKLESS && ns.disk == D_DISKLESS) {
+ /* We must still be diskless,
+ * re-attach has to be serialized with this! */
+ if (device->state.disk != D_DISKLESS)
+ drbd_err(device,
+ "ASSERT FAILED: disk is %s while going diskless\n",
+ drbd_disk_str(device->state.disk));
+
+ if (ns.conn >= C_CONNECTED)
+ drbd_send_state(peer_device, ns);
+ /* corresponding get_ldev in __drbd_set_state
+ * this may finally trigger drbd_ldev_destroy. */
+ put_ldev(device);
+ }
+
+ /* Notify peer that I had a local IO error, and did not detached.. */
+ if (os.disk == D_UP_TO_DATE && ns.disk == D_INCONSISTENT && ns.conn >= C_CONNECTED)
+ drbd_send_state(peer_device, ns);
+
+ /* Disks got bigger while they were detached */
+ if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING &&
+ test_and_clear_bit(RESYNC_AFTER_NEG, &device->flags)) {
+ if (ns.conn == C_CONNECTED)
+ resync_after_online_grow(device);
+ }
+
+ /* A resync finished or aborted, wake paused devices... */
+ if ((os.conn > C_CONNECTED && ns.conn <= C_CONNECTED) ||
+ (os.peer_isp && !ns.peer_isp) ||
+ (os.user_isp && !ns.user_isp))
+ resume_next_sg(device);
+
+ /* sync target done with resync. Explicitly notify peer, even though
+ * it should (at least for non-empty resyncs) already know itself. */
+ if (os.disk < D_UP_TO_DATE && os.conn >= C_SYNC_SOURCE && ns.conn == C_CONNECTED)
+ drbd_send_state(peer_device, ns);
+
+ /* Verify finished, or reached stop sector. Peer did not know about
+ * the stop sector, and we may even have changed the stop sector during
+ * verify to interrupt/stop early. Send the new state. */
+ if (os.conn == C_VERIFY_S && ns.conn == C_CONNECTED
+ && verify_can_do_stop_sector(device))
+ drbd_send_state(peer_device, ns);
+
+ /* This triggers bitmap writeout of potentially still unwritten pages
+ * if the resync finished cleanly, or aborted because of peer disk
+ * failure, or because of connection loss.
+ * For resync aborted because of local disk failure, we cannot do
+ * any bitmap writeout anymore.
+ * No harm done if some bits change during this phase.
+ */
+ if (os.conn > C_CONNECTED && ns.conn <= C_CONNECTED && get_ldev(device)) {
+ drbd_queue_bitmap_io(device, &drbd_bm_write_copy_pages, NULL,
+ "write from resync_finished", BM_LOCKED_CHANGE_ALLOWED);
+ put_ldev(device);
+ }
+
+ if (ns.disk == D_DISKLESS &&
+ ns.conn == C_STANDALONE &&
+ ns.role == R_SECONDARY) {
+ if (os.aftr_isp != ns.aftr_isp)
+ resume_next_sg(device);
+ }
+
+ drbd_md_sync(device);
+}
+
+struct after_conn_state_chg_work {
+ struct drbd_work w;
+ enum drbd_conns oc;
+ union drbd_state ns_min;
+ union drbd_state ns_max; /* new, max state, over all devices */
+ enum chg_state_flags flags;
+ struct drbd_connection *connection;
+};
+
+static int w_after_conn_state_ch(struct drbd_work *w, int unused)
+{
+ struct after_conn_state_chg_work *acscw =
+ container_of(w, struct after_conn_state_chg_work, w);
+ struct drbd_connection *connection = acscw->connection;
+ enum drbd_conns oc = acscw->oc;
+ union drbd_state ns_max = acscw->ns_max;
+ struct drbd_peer_device *peer_device;
+ int vnr;
+
+ kfree(acscw);
+
+ /* Upon network configuration, we need to start the receiver */
+ if (oc == C_STANDALONE && ns_max.conn == C_UNCONNECTED)
+ drbd_thread_start(&connection->receiver);
+
+ if (oc == C_DISCONNECTING && ns_max.conn == C_STANDALONE) {
+ struct net_conf *old_conf;
+
+ mutex_lock(&connection->resource->conf_update);
+ old_conf = connection->net_conf;
+ connection->my_addr_len = 0;
+ connection->peer_addr_len = 0;
+ RCU_INIT_POINTER(connection->net_conf, NULL);
+ conn_free_crypto(connection);
+ mutex_unlock(&connection->resource->conf_update);
+
+ synchronize_rcu();
+ kfree(old_conf);
+ }
+
+ if (ns_max.susp_fen) {
+ /* case1: The outdate peer handler is successful: */
+ if (ns_max.pdsk <= D_OUTDATED) {
+ rcu_read_lock();
+ idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+ struct drbd_device *device = peer_device->device;
+ if (test_bit(NEW_CUR_UUID, &device->flags)) {
+ drbd_uuid_new_current(device);
+ clear_bit(NEW_CUR_UUID, &device->flags);
+ }
+ }
+ rcu_read_unlock();
+ spin_lock_irq(&connection->resource->req_lock);
+ _tl_restart(connection, CONNECTION_LOST_WHILE_PENDING);
+ _conn_request_state(connection,
+ (union drbd_state) { { .susp_fen = 1 } },
+ (union drbd_state) { { .susp_fen = 0 } },
+ CS_VERBOSE);
+ spin_unlock_irq(&connection->resource->req_lock);
+ }
+ }
+ kref_put(&connection->kref, drbd_destroy_connection);
+
+ conn_md_sync(connection);
+
+ return 0;
+}
+
+static void conn_old_common_state(struct drbd_connection *connection, union drbd_state *pcs, enum chg_state_flags *pf)
+{
+ enum chg_state_flags flags = ~0;
+ struct drbd_peer_device *peer_device;
+ int vnr, first_vol = 1;
+ union drbd_dev_state os, cs = {
+ { .role = R_SECONDARY,
+ .peer = R_UNKNOWN,
+ .conn = connection->cstate,
+ .disk = D_DISKLESS,
+ .pdsk = D_UNKNOWN,
+ } };
+
+ rcu_read_lock();
+ idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+ struct drbd_device *device = peer_device->device;
+ os = device->state;
+
+ if (first_vol) {
+ cs = os;
+ first_vol = 0;
+ continue;
+ }
+
+ if (cs.role != os.role)
+ flags &= ~CS_DC_ROLE;
+
+ if (cs.peer != os.peer)
+ flags &= ~CS_DC_PEER;
+
+ if (cs.conn != os.conn)
+ flags &= ~CS_DC_CONN;
+
+ if (cs.disk != os.disk)
+ flags &= ~CS_DC_DISK;
+
+ if (cs.pdsk != os.pdsk)
+ flags &= ~CS_DC_PDSK;
+ }
+ rcu_read_unlock();
+
+ *pf |= CS_DC_MASK;
+ *pf &= flags;
+ (*pcs).i = cs.i;
+}
+
+static enum drbd_state_rv
+conn_is_valid_transition(struct drbd_connection *connection, union drbd_state mask, union drbd_state val,
+ enum chg_state_flags flags)
+{
+ enum drbd_state_rv rv = SS_SUCCESS;
+ union drbd_state ns, os;
+ struct drbd_peer_device *peer_device;
+ int vnr;
+
+ rcu_read_lock();
+ idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+ struct drbd_device *device = peer_device->device;
+ os = drbd_read_state(device);
+ ns = sanitize_state(device, os, apply_mask_val(os, mask, val), NULL);
+
+ if (flags & CS_IGN_OUTD_FAIL && ns.disk == D_OUTDATED && os.disk < D_OUTDATED)
+ ns.disk = os.disk;
+
+ if (ns.i == os.i)
+ continue;
+
+ rv = is_valid_transition(os, ns);
+
+ if (rv >= SS_SUCCESS && !(flags & CS_HARD)) {
+ rv = is_valid_state(device, ns);
+ if (rv < SS_SUCCESS) {
+ if (is_valid_state(device, os) == rv)
+ rv = is_valid_soft_transition(os, ns, connection);
+ } else
+ rv = is_valid_soft_transition(os, ns, connection);
+ }
+
+ if (rv < SS_SUCCESS) {
+ if (flags & CS_VERBOSE)
+ print_st_err(device, os, ns, rv);
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ return rv;
+}
+
+static void
+conn_set_state(struct drbd_connection *connection, union drbd_state mask, union drbd_state val,
+ union drbd_state *pns_min, union drbd_state *pns_max, enum chg_state_flags flags)
+{
+ union drbd_state ns, os, ns_max = { };
+ union drbd_state ns_min = {
+ { .role = R_MASK,
+ .peer = R_MASK,
+ .conn = val.conn,
+ .disk = D_MASK,
+ .pdsk = D_MASK
+ } };
+ struct drbd_peer_device *peer_device;
+ enum drbd_state_rv rv;
+ int vnr, number_of_volumes = 0;
+
+ if (mask.conn == C_MASK) {
+ /* remember last connect time so request_timer_fn() won't
+ * kill newly established sessions while we are still trying to thaw
+ * previously frozen IO */
+ if (connection->cstate != C_WF_REPORT_PARAMS && val.conn == C_WF_REPORT_PARAMS)
+ connection->last_reconnect_jif = jiffies;
+
+ connection->cstate = val.conn;
+ }
+
+ rcu_read_lock();
+ idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+ struct drbd_device *device = peer_device->device;
+ number_of_volumes++;
+ os = drbd_read_state(device);
+ ns = apply_mask_val(os, mask, val);
+ ns = sanitize_state(device, os, ns, NULL);
+
+ if (flags & CS_IGN_OUTD_FAIL && ns.disk == D_OUTDATED && os.disk < D_OUTDATED)
+ ns.disk = os.disk;
+
+ rv = __drbd_set_state(device, ns, flags, NULL);
+ if (rv < SS_SUCCESS)
+ BUG();
+
+ ns.i = device->state.i;
+ ns_max.role = max_role(ns.role, ns_max.role);
+ ns_max.peer = max_role(ns.peer, ns_max.peer);
+ ns_max.conn = max_t(enum drbd_conns, ns.conn, ns_max.conn);
+ ns_max.disk = max_t(enum drbd_disk_state, ns.disk, ns_max.disk);
+ ns_max.pdsk = max_t(enum drbd_disk_state, ns.pdsk, ns_max.pdsk);
+
+ ns_min.role = min_role(ns.role, ns_min.role);
+ ns_min.peer = min_role(ns.peer, ns_min.peer);
+ ns_min.conn = min_t(enum drbd_conns, ns.conn, ns_min.conn);
+ ns_min.disk = min_t(enum drbd_disk_state, ns.disk, ns_min.disk);
+ ns_min.pdsk = min_t(enum drbd_disk_state, ns.pdsk, ns_min.pdsk);
+ }
+ rcu_read_unlock();
+
+ if (number_of_volumes == 0) {
+ ns_min = ns_max = (union drbd_state) { {
+ .role = R_SECONDARY,
+ .peer = R_UNKNOWN,
+ .conn = val.conn,
+ .disk = D_DISKLESS,
+ .pdsk = D_UNKNOWN
+ } };
+ }
+
+ ns_min.susp = ns_max.susp = connection->resource->susp;
+ ns_min.susp_nod = ns_max.susp_nod = connection->resource->susp_nod;
+ ns_min.susp_fen = ns_max.susp_fen = connection->resource->susp_fen;
+
+ *pns_min = ns_min;
+ *pns_max = ns_max;
+}
+
+static enum drbd_state_rv
+_conn_rq_cond(struct drbd_connection *connection, union drbd_state mask, union drbd_state val)
+{
+ enum drbd_state_rv err, rv = SS_UNKNOWN_ERROR; /* continue waiting */;
+
+ if (test_and_clear_bit(CONN_WD_ST_CHG_OKAY, &connection->flags))
+ rv = SS_CW_SUCCESS;
+
+ if (test_and_clear_bit(CONN_WD_ST_CHG_FAIL, &connection->flags))
+ rv = SS_CW_FAILED_BY_PEER;
+
+ err = conn_is_valid_transition(connection, mask, val, 0);
+ if (err == SS_SUCCESS && connection->cstate == C_WF_REPORT_PARAMS)
+ return rv;
+
+ return err;
+}
+
+enum drbd_state_rv
+_conn_request_state(struct drbd_connection *connection, union drbd_state mask, union drbd_state val,
+ enum chg_state_flags flags)
+{
+ enum drbd_state_rv rv = SS_SUCCESS;
+ struct after_conn_state_chg_work *acscw;
+ enum drbd_conns oc = connection->cstate;
+ union drbd_state ns_max, ns_min, os;
+ bool have_mutex = false;
+
+ if (mask.conn) {
+ rv = is_valid_conn_transition(oc, val.conn);
+ if (rv < SS_SUCCESS)
+ goto abort;
+ }
+
+ rv = conn_is_valid_transition(connection, mask, val, flags);
+ if (rv < SS_SUCCESS)
+ goto abort;
+
+ if (oc == C_WF_REPORT_PARAMS && val.conn == C_DISCONNECTING &&
+ !(flags & (CS_LOCAL_ONLY | CS_HARD))) {
+
+ /* This will be a cluster-wide state change.
+ * Need to give up the spinlock, grab the mutex,
+ * then send the state change request, ... */
+ spin_unlock_irq(&connection->resource->req_lock);
+ mutex_lock(&connection->cstate_mutex);
+ have_mutex = true;
+
+ set_bit(CONN_WD_ST_CHG_REQ, &connection->flags);
+ if (conn_send_state_req(connection, mask, val)) {
+ /* sending failed. */
+ clear_bit(CONN_WD_ST_CHG_REQ, &connection->flags);
+ rv = SS_CW_FAILED_BY_PEER;
+ /* need to re-aquire the spin lock, though */
+ goto abort_unlocked;
+ }
+
+ if (val.conn == C_DISCONNECTING)
+ set_bit(DISCONNECT_SENT, &connection->flags);
+
+ /* ... and re-aquire the spinlock.
+ * If _conn_rq_cond() returned >= SS_SUCCESS, we must call
+ * conn_set_state() within the same spinlock. */
+ spin_lock_irq(&connection->resource->req_lock);
+ wait_event_lock_irq(connection->ping_wait,
+ (rv = _conn_rq_cond(connection, mask, val)),
+ connection->resource->req_lock);
+ clear_bit(CONN_WD_ST_CHG_REQ, &connection->flags);
+ if (rv < SS_SUCCESS)
+ goto abort;
+ }
+
+ conn_old_common_state(connection, &os, &flags);
+ flags |= CS_DC_SUSP;
+ conn_set_state(connection, mask, val, &ns_min, &ns_max, flags);
+ conn_pr_state_change(connection, os, ns_max, flags);
+
+ acscw = kmalloc(sizeof(*acscw), GFP_ATOMIC);
+ if (acscw) {
+ acscw->oc = os.conn;
+ acscw->ns_min = ns_min;
+ acscw->ns_max = ns_max;
+ acscw->flags = flags;
+ acscw->w.cb = w_after_conn_state_ch;
+ kref_get(&connection->kref);
+ acscw->connection = connection;
+ drbd_queue_work(&connection->sender_work, &acscw->w);
+ } else {
+ drbd_err(connection, "Could not kmalloc an acscw\n");
+ }
+
+ abort:
+ if (have_mutex) {
+ /* mutex_unlock() "... must not be used in interrupt context.",
+ * so give up the spinlock, then re-aquire it */
+ spin_unlock_irq(&connection->resource->req_lock);
+ abort_unlocked:
+ mutex_unlock(&connection->cstate_mutex);
+ spin_lock_irq(&connection->resource->req_lock);
+ }
+ if (rv < SS_SUCCESS && flags & CS_VERBOSE) {
+ drbd_err(connection, "State change failed: %s\n", drbd_set_st_err_str(rv));
+ drbd_err(connection, " mask = 0x%x val = 0x%x\n", mask.i, val.i);
+ drbd_err(connection, " old_conn:%s wanted_conn:%s\n", drbd_conn_str(oc), drbd_conn_str(val.conn));
+ }
+ return rv;
+}
+
+enum drbd_state_rv
+conn_request_state(struct drbd_connection *connection, union drbd_state mask, union drbd_state val,
+ enum chg_state_flags flags)
+{
+ enum drbd_state_rv rv;
+
+ spin_lock_irq(&connection->resource->req_lock);
+ rv = _conn_request_state(connection, mask, val, flags);
+ spin_unlock_irq(&connection->resource->req_lock);
+
+ return rv;
+}
diff --git a/drivers/block/drbd/drbd_state.h b/drivers/block/drbd/drbd_state.h
new file mode 100644
index 000000000..7f53c4082
--- /dev/null
+++ b/drivers/block/drbd/drbd_state.h
@@ -0,0 +1,166 @@
+#ifndef DRBD_STATE_H
+#define DRBD_STATE_H
+
+struct drbd_device;
+struct drbd_connection;
+
+/**
+ * DOC: DRBD State macros
+ *
+ * These macros are used to express state changes in easily readable form.
+ *
+ * The NS macros expand to a mask and a value, that can be bit ored onto the
+ * current state as soon as the spinlock (req_lock) was taken.
+ *
+ * The _NS macros are used for state functions that get called with the
+ * spinlock. These macros expand directly to the new state value.
+ *
+ * Besides the basic forms NS() and _NS() additional _?NS[23] are defined
+ * to express state changes that affect more than one aspect of the state.
+ *
+ * E.g. NS2(conn, C_CONNECTED, peer, R_SECONDARY)
+ * Means that the network connection was established and that the peer
+ * is in secondary role.
+ */
+#define role_MASK R_MASK
+#define peer_MASK R_MASK
+#define disk_MASK D_MASK
+#define pdsk_MASK D_MASK
+#define conn_MASK C_MASK
+#define susp_MASK 1
+#define user_isp_MASK 1
+#define aftr_isp_MASK 1
+#define susp_nod_MASK 1
+#define susp_fen_MASK 1
+
+#define NS(T, S) \
+ ({ union drbd_state mask; mask.i = 0; mask.T = T##_MASK; mask; }), \
+ ({ union drbd_state val; val.i = 0; val.T = (S); val; })
+#define NS2(T1, S1, T2, S2) \
+ ({ union drbd_state mask; mask.i = 0; mask.T1 = T1##_MASK; \
+ mask.T2 = T2##_MASK; mask; }), \
+ ({ union drbd_state val; val.i = 0; val.T1 = (S1); \
+ val.T2 = (S2); val; })
+#define NS3(T1, S1, T2, S2, T3, S3) \
+ ({ union drbd_state mask; mask.i = 0; mask.T1 = T1##_MASK; \
+ mask.T2 = T2##_MASK; mask.T3 = T3##_MASK; mask; }), \
+ ({ union drbd_state val; val.i = 0; val.T1 = (S1); \
+ val.T2 = (S2); val.T3 = (S3); val; })
+
+#define _NS(D, T, S) \
+ D, ({ union drbd_state __ns; __ns = drbd_read_state(D); __ns.T = (S); __ns; })
+#define _NS2(D, T1, S1, T2, S2) \
+ D, ({ union drbd_state __ns; __ns = drbd_read_state(D); __ns.T1 = (S1); \
+ __ns.T2 = (S2); __ns; })
+#define _NS3(D, T1, S1, T2, S2, T3, S3) \
+ D, ({ union drbd_state __ns; __ns = drbd_read_state(D); __ns.T1 = (S1); \
+ __ns.T2 = (S2); __ns.T3 = (S3); __ns; })
+
+enum chg_state_flags {
+ CS_HARD = 1 << 0,
+ CS_VERBOSE = 1 << 1,
+ CS_WAIT_COMPLETE = 1 << 2,
+ CS_SERIALIZE = 1 << 3,
+ CS_ORDERED = CS_WAIT_COMPLETE + CS_SERIALIZE,
+ CS_LOCAL_ONLY = 1 << 4, /* Do not consider a device pair wide state change */
+ CS_DC_ROLE = 1 << 5, /* DC = display as connection state change */
+ CS_DC_PEER = 1 << 6,
+ CS_DC_CONN = 1 << 7,
+ CS_DC_DISK = 1 << 8,
+ CS_DC_PDSK = 1 << 9,
+ CS_DC_SUSP = 1 << 10,
+ CS_DC_MASK = CS_DC_ROLE + CS_DC_PEER + CS_DC_CONN + CS_DC_DISK + CS_DC_PDSK,
+ CS_IGN_OUTD_FAIL = 1 << 11,
+};
+
+/* drbd_dev_state and drbd_state are different types. This is to stress the
+ small difference. There is no suspended flag (.susp), and no suspended
+ while fence handler runs flas (susp_fen). */
+union drbd_dev_state {
+ struct {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+ unsigned role:2 ; /* 3/4 primary/secondary/unknown */
+ unsigned peer:2 ; /* 3/4 primary/secondary/unknown */
+ unsigned conn:5 ; /* 17/32 cstates */
+ unsigned disk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */
+ unsigned pdsk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */
+ unsigned _unused:1 ;
+ unsigned aftr_isp:1 ; /* isp .. imposed sync pause */
+ unsigned peer_isp:1 ;
+ unsigned user_isp:1 ;
+ unsigned _pad:11; /* 0 unused */
+#elif defined(__BIG_ENDIAN_BITFIELD)
+ unsigned _pad:11;
+ unsigned user_isp:1 ;
+ unsigned peer_isp:1 ;
+ unsigned aftr_isp:1 ; /* isp .. imposed sync pause */
+ unsigned _unused:1 ;
+ unsigned pdsk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */
+ unsigned disk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */
+ unsigned conn:5 ; /* 17/32 cstates */
+ unsigned peer:2 ; /* 3/4 primary/secondary/unknown */
+ unsigned role:2 ; /* 3/4 primary/secondary/unknown */
+#else
+# error "this endianess is not supported"
+#endif
+ };
+ unsigned int i;
+};
+
+extern enum drbd_state_rv drbd_change_state(struct drbd_device *device,
+ enum chg_state_flags f,
+ union drbd_state mask,
+ union drbd_state val);
+extern void drbd_force_state(struct drbd_device *, union drbd_state,
+ union drbd_state);
+extern enum drbd_state_rv _drbd_request_state(struct drbd_device *,
+ union drbd_state,
+ union drbd_state,
+ enum chg_state_flags);
+
+extern enum drbd_state_rv
+_drbd_request_state_holding_state_mutex(struct drbd_device *, union drbd_state,
+ union drbd_state, enum chg_state_flags);
+
+extern enum drbd_state_rv __drbd_set_state(struct drbd_device *, union drbd_state,
+ enum chg_state_flags,
+ struct completion *done);
+extern void print_st_err(struct drbd_device *, union drbd_state,
+ union drbd_state, int);
+
+enum drbd_state_rv
+_conn_request_state(struct drbd_connection *connection, union drbd_state mask, union drbd_state val,
+ enum chg_state_flags flags);
+
+enum drbd_state_rv
+conn_request_state(struct drbd_connection *connection, union drbd_state mask, union drbd_state val,
+ enum chg_state_flags flags);
+
+extern void drbd_resume_al(struct drbd_device *device);
+extern bool conn_all_vols_unconf(struct drbd_connection *connection);
+
+/**
+ * drbd_request_state() - Reqest a state change
+ * @device: DRBD device.
+ * @mask: mask of state bits to change.
+ * @val: value of new state bits.
+ *
+ * This is the most graceful way of requesting a state change. It is verbose
+ * quite verbose in case the state change is not possible, and all those
+ * state changes are globally serialized.
+ */
+static inline int drbd_request_state(struct drbd_device *device,
+ union drbd_state mask,
+ union drbd_state val)
+{
+ return _drbd_request_state(device, mask, val, CS_VERBOSE + CS_ORDERED);
+}
+
+enum drbd_role conn_highest_role(struct drbd_connection *connection);
+enum drbd_role conn_highest_peer(struct drbd_connection *connection);
+enum drbd_disk_state conn_highest_disk(struct drbd_connection *connection);
+enum drbd_disk_state conn_lowest_disk(struct drbd_connection *connection);
+enum drbd_disk_state conn_highest_pdsk(struct drbd_connection *connection);
+enum drbd_conns conn_lowest_conn(struct drbd_connection *connection);
+
+#endif
diff --git a/drivers/block/drbd/drbd_strings.c b/drivers/block/drbd/drbd_strings.c
new file mode 100644
index 000000000..80b0f63c7
--- /dev/null
+++ b/drivers/block/drbd/drbd_strings.c
@@ -0,0 +1,118 @@
+/*
+ drbd.h
+
+ This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+ Copyright (C) 2003-2008, LINBIT Information Technologies GmbH.
+ Copyright (C) 2003-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+ Copyright (C) 2003-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+ drbd is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2, or (at your option)
+ any later version.
+
+ drbd is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with drbd; see the file COPYING. If not, write to
+ the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+
+*/
+
+#include <linux/drbd.h>
+#include "drbd_strings.h"
+
+static const char *drbd_conn_s_names[] = {
+ [C_STANDALONE] = "StandAlone",
+ [C_DISCONNECTING] = "Disconnecting",
+ [C_UNCONNECTED] = "Unconnected",
+ [C_TIMEOUT] = "Timeout",
+ [C_BROKEN_PIPE] = "BrokenPipe",
+ [C_NETWORK_FAILURE] = "NetworkFailure",
+ [C_PROTOCOL_ERROR] = "ProtocolError",
+ [C_WF_CONNECTION] = "WFConnection",
+ [C_WF_REPORT_PARAMS] = "WFReportParams",
+ [C_TEAR_DOWN] = "TearDown",
+ [C_CONNECTED] = "Connected",
+ [C_STARTING_SYNC_S] = "StartingSyncS",
+ [C_STARTING_SYNC_T] = "StartingSyncT",
+ [C_WF_BITMAP_S] = "WFBitMapS",
+ [C_WF_BITMAP_T] = "WFBitMapT",
+ [C_WF_SYNC_UUID] = "WFSyncUUID",
+ [C_SYNC_SOURCE] = "SyncSource",
+ [C_SYNC_TARGET] = "SyncTarget",
+ [C_PAUSED_SYNC_S] = "PausedSyncS",
+ [C_PAUSED_SYNC_T] = "PausedSyncT",
+ [C_VERIFY_S] = "VerifyS",
+ [C_VERIFY_T] = "VerifyT",
+ [C_AHEAD] = "Ahead",
+ [C_BEHIND] = "Behind",
+};
+
+static const char *drbd_role_s_names[] = {
+ [R_PRIMARY] = "Primary",
+ [R_SECONDARY] = "Secondary",
+ [R_UNKNOWN] = "Unknown"
+};
+
+static const char *drbd_disk_s_names[] = {
+ [D_DISKLESS] = "Diskless",
+ [D_ATTACHING] = "Attaching",
+ [D_FAILED] = "Failed",
+ [D_NEGOTIATING] = "Negotiating",
+ [D_INCONSISTENT] = "Inconsistent",
+ [D_OUTDATED] = "Outdated",
+ [D_UNKNOWN] = "DUnknown",
+ [D_CONSISTENT] = "Consistent",
+ [D_UP_TO_DATE] = "UpToDate",
+};
+
+static const char *drbd_state_sw_errors[] = {
+ [-SS_TWO_PRIMARIES] = "Multiple primaries not allowed by config",
+ [-SS_NO_UP_TO_DATE_DISK] = "Need access to UpToDate data",
+ [-SS_NO_LOCAL_DISK] = "Can not resync without local disk",
+ [-SS_NO_REMOTE_DISK] = "Can not resync without remote disk",
+ [-SS_CONNECTED_OUTDATES] = "Refusing to be Outdated while Connected",
+ [-SS_PRIMARY_NOP] = "Refusing to be Primary while peer is not outdated",
+ [-SS_RESYNC_RUNNING] = "Can not start OV/resync since it is already active",
+ [-SS_ALREADY_STANDALONE] = "Can not disconnect a StandAlone device",
+ [-SS_CW_FAILED_BY_PEER] = "State change was refused by peer node",
+ [-SS_IS_DISKLESS] = "Device is diskless, the requested operation requires a disk",
+ [-SS_DEVICE_IN_USE] = "Device is held open by someone",
+ [-SS_NO_NET_CONFIG] = "Have no net/connection configuration",
+ [-SS_NO_VERIFY_ALG] = "Need a verify algorithm to start online verify",
+ [-SS_NEED_CONNECTION] = "Need a connection to start verify or resync",
+ [-SS_NOT_SUPPORTED] = "Peer does not support protocol",
+ [-SS_LOWER_THAN_OUTDATED] = "Disk state is lower than outdated",
+ [-SS_IN_TRANSIENT_STATE] = "In transient state, retry after next state change",
+ [-SS_CONCURRENT_ST_CHG] = "Concurrent state changes detected and aborted",
+ [-SS_OUTDATE_WO_CONN] = "Need a connection for a graceful disconnect/outdate peer",
+ [-SS_O_VOL_PEER_PRI] = "Other vol primary on peer not allowed by config",
+};
+
+const char *drbd_conn_str(enum drbd_conns s)
+{
+ /* enums are unsigned... */
+ return s > C_BEHIND ? "TOO_LARGE" : drbd_conn_s_names[s];
+}
+
+const char *drbd_role_str(enum drbd_role s)
+{
+ return s > R_SECONDARY ? "TOO_LARGE" : drbd_role_s_names[s];
+}
+
+const char *drbd_disk_str(enum drbd_disk_state s)
+{
+ return s > D_UP_TO_DATE ? "TOO_LARGE" : drbd_disk_s_names[s];
+}
+
+const char *drbd_set_st_err_str(enum drbd_state_rv err)
+{
+ return err <= SS_AFTER_LAST_ERROR ? "TOO_SMALL" :
+ err > SS_TWO_PRIMARIES ? "TOO_LARGE"
+ : drbd_state_sw_errors[-err];
+}
diff --git a/drivers/block/drbd/drbd_strings.h b/drivers/block/drbd/drbd_strings.h
new file mode 100644
index 000000000..f9923cc88
--- /dev/null
+++ b/drivers/block/drbd/drbd_strings.h
@@ -0,0 +1,9 @@
+#ifndef __DRBD_STRINGS_H
+#define __DRBD_STRINGS_H
+
+extern const char *drbd_conn_str(enum drbd_conns);
+extern const char *drbd_role_str(enum drbd_role);
+extern const char *drbd_disk_str(enum drbd_disk_state);
+extern const char *drbd_set_st_err_str(enum drbd_state_rv);
+
+#endif /* __DRBD_STRINGS_H */
diff --git a/drivers/block/drbd/drbd_vli.h b/drivers/block/drbd/drbd_vli.h
new file mode 100644
index 000000000..8cb1532a3
--- /dev/null
+++ b/drivers/block/drbd/drbd_vli.h
@@ -0,0 +1,351 @@
+/*
+-*- linux-c -*-
+ drbd_receiver.c
+ This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+ Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
+ Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+ Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+ drbd is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2, or (at your option)
+ any later version.
+
+ drbd is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with drbd; see the file COPYING. If not, write to
+ the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#ifndef _DRBD_VLI_H
+#define _DRBD_VLI_H
+
+/*
+ * At a granularity of 4KiB storage represented per bit,
+ * and stroage sizes of several TiB,
+ * and possibly small-bandwidth replication,
+ * the bitmap transfer time can take much too long,
+ * if transmitted in plain text.
+ *
+ * We try to reduce the transferred bitmap information
+ * by encoding runlengths of bit polarity.
+ *
+ * We never actually need to encode a "zero" (runlengths are positive).
+ * But then we have to store the value of the first bit.
+ * The first bit of information thus shall encode if the first runlength
+ * gives the number of set or unset bits.
+ *
+ * We assume that large areas are either completely set or unset,
+ * which gives good compression with any runlength method,
+ * even when encoding the runlength as fixed size 32bit/64bit integers.
+ *
+ * Still, there may be areas where the polarity flips every few bits,
+ * and encoding the runlength sequence of those areas with fix size
+ * integers would be much worse than plaintext.
+ *
+ * We want to encode small runlength values with minimum code length,
+ * while still being able to encode a Huge run of all zeros.
+ *
+ * Thus we need a Variable Length Integer encoding, VLI.
+ *
+ * For some cases, we produce more code bits than plaintext input.
+ * We need to send incompressible chunks as plaintext, skip over them
+ * and then see if the next chunk compresses better.
+ *
+ * We don't care too much about "excellent" compression ratio for large
+ * runlengths (all set/all clear): whether we achieve a factor of 100
+ * or 1000 is not that much of an issue.
+ * We do not want to waste too much on short runlengths in the "noisy"
+ * parts of the bitmap, though.
+ *
+ * There are endless variants of VLI, we experimented with:
+ * * simple byte-based
+ * * various bit based with different code word length.
+ *
+ * To avoid yet an other configuration parameter (choice of bitmap compression
+ * algorithm) which was difficult to explain and tune, we just chose the one
+ * variant that turned out best in all test cases.
+ * Based on real world usage patterns, with device sizes ranging from a few GiB
+ * to several TiB, file server/mailserver/webserver/mysql/postgress,
+ * mostly idle to really busy, the all time winner (though sometimes only
+ * marginally better) is:
+ */
+
+/*
+ * encoding is "visualised" as
+ * __little endian__ bitstream, least significant bit first (left most)
+ *
+ * this particular encoding is chosen so that the prefix code
+ * starts as unary encoding the level, then modified so that
+ * 10 levels can be described in 8bit, with minimal overhead
+ * for the smaller levels.
+ *
+ * Number of data bits follow fibonacci sequence, with the exception of the
+ * last level (+1 data bit, so it makes 64bit total). The only worse code when
+ * encoding bit polarity runlength is 1 plain bits => 2 code bits.
+prefix data bits max val Nº data bits
+0 x 0x2 1
+10 x 0x4 1
+110 xx 0x8 2
+1110 xxx 0x10 3
+11110 xxx xx 0x30 5
+111110 xx xxxxxx 0x130 8
+11111100 xxxxxxxx xxxxx 0x2130 13
+11111110 xxxxxxxx xxxxxxxx xxxxx 0x202130 21
+11111101 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xx 0x400202130 34
+11111111 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx 56
+ * maximum encodable value: 0x100000400202130 == 2**56 + some */
+
+/* compression "table":
+ transmitted x 0.29
+ as plaintext x ........................
+ x ........................
+ x ........................
+ x 0.59 0.21........................
+ x ........................................................
+ x .. c ...................................................
+ x 0.44.. o ...................................................
+ x .......... d ...................................................
+ x .......... e ...................................................
+ X............. ...................................................
+ x.............. b ...................................................
+2.0x............... i ...................................................
+ #X................ t ...................................................
+ #................. s ........................... plain bits ..........
+-+-----------------------------------------------------------------------
+ 1 16 32 64
+*/
+
+/* LEVEL: (total bits, prefix bits, prefix value),
+ * sorted ascending by number of total bits.
+ * The rest of the code table is calculated at compiletime from this. */
+
+/* fibonacci data 1, 1, ... */
+#define VLI_L_1_1() do { \
+ LEVEL( 2, 1, 0x00); \
+ LEVEL( 3, 2, 0x01); \
+ LEVEL( 5, 3, 0x03); \
+ LEVEL( 7, 4, 0x07); \
+ LEVEL(10, 5, 0x0f); \
+ LEVEL(14, 6, 0x1f); \
+ LEVEL(21, 8, 0x3f); \
+ LEVEL(29, 8, 0x7f); \
+ LEVEL(42, 8, 0xbf); \
+ LEVEL(64, 8, 0xff); \
+ } while (0)
+
+/* finds a suitable level to decode the least significant part of in.
+ * returns number of bits consumed.
+ *
+ * BUG() for bad input, as that would mean a buggy code table. */
+static inline int vli_decode_bits(u64 *out, const u64 in)
+{
+ u64 adj = 1;
+
+#define LEVEL(t,b,v) \
+ do { \
+ if ((in & ((1 << b) -1)) == v) { \
+ *out = ((in & ((~0ULL) >> (64-t))) >> b) + adj; \
+ return t; \
+ } \
+ adj += 1ULL << (t - b); \
+ } while (0)
+
+ VLI_L_1_1();
+
+ /* NOT REACHED, if VLI_LEVELS code table is defined properly */
+ BUG();
+#undef LEVEL
+}
+
+/* return number of code bits needed,
+ * or negative error number */
+static inline int __vli_encode_bits(u64 *out, const u64 in)
+{
+ u64 max = 0;
+ u64 adj = 1;
+
+ if (in == 0)
+ return -EINVAL;
+
+#define LEVEL(t,b,v) do { \
+ max += 1ULL << (t - b); \
+ if (in <= max) { \
+ if (out) \
+ *out = ((in - adj) << b) | v; \
+ return t; \
+ } \
+ adj = max + 1; \
+ } while (0)
+
+ VLI_L_1_1();
+
+ return -EOVERFLOW;
+#undef LEVEL
+}
+
+#undef VLI_L_1_1
+
+/* code from here down is independend of actually used bit code */
+
+/*
+ * Code length is determined by some unique (e.g. unary) prefix.
+ * This encodes arbitrary bit length, not whole bytes: we have a bit-stream,
+ * not a byte stream.
+ */
+
+/* for the bitstream, we need a cursor */
+struct bitstream_cursor {
+ /* the current byte */
+ u8 *b;
+ /* the current bit within *b, nomalized: 0..7 */
+ unsigned int bit;
+};
+
+/* initialize cursor to point to first bit of stream */
+static inline void bitstream_cursor_reset(struct bitstream_cursor *cur, void *s)
+{
+ cur->b = s;
+ cur->bit = 0;
+}
+
+/* advance cursor by that many bits; maximum expected input value: 64,
+ * but depending on VLI implementation, it may be more. */
+static inline void bitstream_cursor_advance(struct bitstream_cursor *cur, unsigned int bits)
+{
+ bits += cur->bit;
+ cur->b = cur->b + (bits >> 3);
+ cur->bit = bits & 7;
+}
+
+/* the bitstream itself knows its length */
+struct bitstream {
+ struct bitstream_cursor cur;
+ unsigned char *buf;
+ size_t buf_len; /* in bytes */
+
+ /* for input stream:
+ * number of trailing 0 bits for padding
+ * total number of valid bits in stream: buf_len * 8 - pad_bits */
+ unsigned int pad_bits;
+};
+
+static inline void bitstream_init(struct bitstream *bs, void *s, size_t len, unsigned int pad_bits)
+{
+ bs->buf = s;
+ bs->buf_len = len;
+ bs->pad_bits = pad_bits;
+ bitstream_cursor_reset(&bs->cur, bs->buf);
+}
+
+static inline void bitstream_rewind(struct bitstream *bs)
+{
+ bitstream_cursor_reset(&bs->cur, bs->buf);
+ memset(bs->buf, 0, bs->buf_len);
+}
+
+/* Put (at most 64) least significant bits of val into bitstream, and advance cursor.
+ * Ignores "pad_bits".
+ * Returns zero if bits == 0 (nothing to do).
+ * Returns number of bits used if successful.
+ *
+ * If there is not enough room left in bitstream,
+ * leaves bitstream unchanged and returns -ENOBUFS.
+ */
+static inline int bitstream_put_bits(struct bitstream *bs, u64 val, const unsigned int bits)
+{
+ unsigned char *b = bs->cur.b;
+ unsigned int tmp;
+
+ if (bits == 0)
+ return 0;
+
+ if ((bs->cur.b + ((bs->cur.bit + bits -1) >> 3)) - bs->buf >= bs->buf_len)
+ return -ENOBUFS;
+
+ /* paranoia: strip off hi bits; they should not be set anyways. */
+ if (bits < 64)
+ val &= ~0ULL >> (64 - bits);
+
+ *b++ |= (val & 0xff) << bs->cur.bit;
+
+ for (tmp = 8 - bs->cur.bit; tmp < bits; tmp += 8)
+ *b++ |= (val >> tmp) & 0xff;
+
+ bitstream_cursor_advance(&bs->cur, bits);
+ return bits;
+}
+
+/* Fetch (at most 64) bits from bitstream into *out, and advance cursor.
+ *
+ * If more than 64 bits are requested, returns -EINVAL and leave *out unchanged.
+ *
+ * If there are less than the requested number of valid bits left in the
+ * bitstream, still fetches all available bits.
+ *
+ * Returns number of actually fetched bits.
+ */
+static inline int bitstream_get_bits(struct bitstream *bs, u64 *out, int bits)
+{
+ u64 val;
+ unsigned int n;
+
+ if (bits > 64)
+ return -EINVAL;
+
+ if (bs->cur.b + ((bs->cur.bit + bs->pad_bits + bits -1) >> 3) - bs->buf >= bs->buf_len)
+ bits = ((bs->buf_len - (bs->cur.b - bs->buf)) << 3)
+ - bs->cur.bit - bs->pad_bits;
+
+ if (bits == 0) {
+ *out = 0;
+ return 0;
+ }
+
+ /* get the high bits */
+ val = 0;
+ n = (bs->cur.bit + bits + 7) >> 3;
+ /* n may be at most 9, if cur.bit + bits > 64 */
+ /* which means this copies at most 8 byte */
+ if (n) {
+ memcpy(&val, bs->cur.b+1, n - 1);
+ val = le64_to_cpu(val) << (8 - bs->cur.bit);
+ }
+
+ /* we still need the low bits */
+ val |= bs->cur.b[0] >> bs->cur.bit;
+
+ /* and mask out bits we don't want */
+ val &= ~0ULL >> (64 - bits);
+
+ bitstream_cursor_advance(&bs->cur, bits);
+ *out = val;
+
+ return bits;
+}
+
+/* encodes @in as vli into @bs;
+
+ * return values
+ * > 0: number of bits successfully stored in bitstream
+ * -ENOBUFS @bs is full
+ * -EINVAL input zero (invalid)
+ * -EOVERFLOW input too large for this vli code (invalid)
+ */
+static inline int vli_encode_bits(struct bitstream *bs, u64 in)
+{
+ u64 code = code;
+ int bits = __vli_encode_bits(&code, in);
+
+ if (bits <= 0)
+ return bits;
+
+ return bitstream_put_bits(bs, code, bits);
+}
+
+#endif
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
new file mode 100644
index 000000000..d0fae55d8
--- /dev/null
+++ b/drivers/block/drbd/drbd_worker.c
@@ -0,0 +1,2156 @@
+/*
+ drbd_worker.c
+
+ This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+ Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
+ Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+ Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+ drbd is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2, or (at your option)
+ any later version.
+
+ drbd is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with drbd; see the file COPYING. If not, write to
+ the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+
+*/
+
+#include <linux/module.h>
+#include <linux/drbd.h>
+#include <linux/sched.h>
+#include <linux/wait.h>
+#include <linux/mm.h>
+#include <linux/memcontrol.h>
+#include <linux/mm_inline.h>
+#include <linux/slab.h>
+#include <linux/random.h>
+#include <linux/string.h>
+#include <linux/scatterlist.h>
+
+#include "drbd_int.h"
+#include "drbd_protocol.h"
+#include "drbd_req.h"
+
+static int make_ov_request(struct drbd_device *, int);
+static int make_resync_request(struct drbd_device *, int);
+
+/* endio handlers:
+ * drbd_md_endio (defined here)
+ * drbd_request_endio (defined here)
+ * drbd_peer_request_endio (defined here)
+ * drbd_bm_endio (defined in drbd_bitmap.c)
+ *
+ * For all these callbacks, note the following:
+ * The callbacks will be called in irq context by the IDE drivers,
+ * and in Softirqs/Tasklets/BH context by the SCSI drivers.
+ * Try to get the locking right :)
+ *
+ */
+
+
+/* About the global_state_lock
+ Each state transition on an device holds a read lock. In case we have
+ to evaluate the resync after dependencies, we grab a write lock, because
+ we need stable states on all devices for that. */
+rwlock_t global_state_lock;
+
+/* used for synchronous meta data and bitmap IO
+ * submitted by drbd_md_sync_page_io()
+ */
+void drbd_md_endio(struct bio *bio, int error)
+{
+ struct drbd_device *device;
+
+ device = bio->bi_private;
+ device->md_io.error = error;
+
+ /* We grabbed an extra reference in _drbd_md_sync_page_io() to be able
+ * to timeout on the lower level device, and eventually detach from it.
+ * If this io completion runs after that timeout expired, this
+ * drbd_md_put_buffer() may allow us to finally try and re-attach.
+ * During normal operation, this only puts that extra reference
+ * down to 1 again.
+ * Make sure we first drop the reference, and only then signal
+ * completion, or we may (in drbd_al_read_log()) cycle so fast into the
+ * next drbd_md_sync_page_io(), that we trigger the
+ * ASSERT(atomic_read(&device->md_io_in_use) == 1) there.
+ */
+ drbd_md_put_buffer(device);
+ device->md_io.done = 1;
+ wake_up(&device->misc_wait);
+ bio_put(bio);
+ if (device->ldev) /* special case: drbd_md_read() during drbd_adm_attach() */
+ put_ldev(device);
+}
+
+/* reads on behalf of the partner,
+ * "submitted" by the receiver
+ */
+static void drbd_endio_read_sec_final(struct drbd_peer_request *peer_req) __releases(local)
+{
+ unsigned long flags = 0;
+ struct drbd_peer_device *peer_device = peer_req->peer_device;
+ struct drbd_device *device = peer_device->device;
+
+ spin_lock_irqsave(&device->resource->req_lock, flags);
+ device->read_cnt += peer_req->i.size >> 9;
+ list_del(&peer_req->w.list);
+ if (list_empty(&device->read_ee))
+ wake_up(&device->ee_wait);
+ if (test_bit(__EE_WAS_ERROR, &peer_req->flags))
+ __drbd_chk_io_error(device, DRBD_READ_ERROR);
+ spin_unlock_irqrestore(&device->resource->req_lock, flags);
+
+ drbd_queue_work(&peer_device->connection->sender_work, &peer_req->w);
+ put_ldev(device);
+}
+
+/* writes on behalf of the partner, or resync writes,
+ * "submitted" by the receiver, final stage. */
+void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req) __releases(local)
+{
+ unsigned long flags = 0;
+ struct drbd_peer_device *peer_device = peer_req->peer_device;
+ struct drbd_device *device = peer_device->device;
+ struct drbd_interval i;
+ int do_wake;
+ u64 block_id;
+ int do_al_complete_io;
+
+ /* after we moved peer_req to done_ee,
+ * we may no longer access it,
+ * it may be freed/reused already!
+ * (as soon as we release the req_lock) */
+ i = peer_req->i;
+ do_al_complete_io = peer_req->flags & EE_CALL_AL_COMPLETE_IO;
+ block_id = peer_req->block_id;
+ peer_req->flags &= ~EE_CALL_AL_COMPLETE_IO;
+
+ spin_lock_irqsave(&device->resource->req_lock, flags);
+ device->writ_cnt += peer_req->i.size >> 9;
+ list_move_tail(&peer_req->w.list, &device->done_ee);
+
+ /*
+ * Do not remove from the write_requests tree here: we did not send the
+ * Ack yet and did not wake possibly waiting conflicting requests.
+ * Removed from the tree from "drbd_process_done_ee" within the
+ * appropriate dw.cb (e_end_block/e_end_resync_block) or from
+ * _drbd_clear_done_ee.
+ */
+
+ do_wake = list_empty(block_id == ID_SYNCER ? &device->sync_ee : &device->active_ee);
+
+ /* FIXME do we want to detach for failed REQ_DISCARD?
+ * ((peer_req->flags & (EE_WAS_ERROR|EE_IS_TRIM)) == EE_WAS_ERROR) */
+ if (peer_req->flags & EE_WAS_ERROR)
+ __drbd_chk_io_error(device, DRBD_WRITE_ERROR);
+ spin_unlock_irqrestore(&device->resource->req_lock, flags);
+
+ if (block_id == ID_SYNCER)
+ drbd_rs_complete_io(device, i.sector);
+
+ if (do_wake)
+ wake_up(&device->ee_wait);
+
+ if (do_al_complete_io)
+ drbd_al_complete_io(device, &i);
+
+ wake_asender(peer_device->connection);
+ put_ldev(device);
+}
+
+/* writes on behalf of the partner, or resync writes,
+ * "submitted" by the receiver.
+ */
+void drbd_peer_request_endio(struct bio *bio, int error)
+{
+ struct drbd_peer_request *peer_req = bio->bi_private;
+ struct drbd_device *device = peer_req->peer_device->device;
+ int uptodate = bio_flagged(bio, BIO_UPTODATE);
+ int is_write = bio_data_dir(bio) == WRITE;
+ int is_discard = !!(bio->bi_rw & REQ_DISCARD);
+
+ if (error && __ratelimit(&drbd_ratelimit_state))
+ drbd_warn(device, "%s: error=%d s=%llus\n",
+ is_write ? (is_discard ? "discard" : "write")
+ : "read", error,
+ (unsigned long long)peer_req->i.sector);
+ if (!error && !uptodate) {
+ if (__ratelimit(&drbd_ratelimit_state))
+ drbd_warn(device, "%s: setting error to -EIO s=%llus\n",
+ is_write ? "write" : "read",
+ (unsigned long long)peer_req->i.sector);
+ /* strange behavior of some lower level drivers...
+ * fail the request by clearing the uptodate flag,
+ * but do not return any error?! */
+ error = -EIO;
+ }
+
+ if (error)
+ set_bit(__EE_WAS_ERROR, &peer_req->flags);
+
+ bio_put(bio); /* no need for the bio anymore */
+ if (atomic_dec_and_test(&peer_req->pending_bios)) {
+ if (is_write)
+ drbd_endio_write_sec_final(peer_req);
+ else
+ drbd_endio_read_sec_final(peer_req);
+ }
+}
+
+/* read, readA or write requests on R_PRIMARY coming from drbd_make_request
+ */
+void drbd_request_endio(struct bio *bio, int error)
+{
+ unsigned long flags;
+ struct drbd_request *req = bio->bi_private;
+ struct drbd_device *device = req->device;
+ struct bio_and_error m;
+ enum drbd_req_event what;
+ int uptodate = bio_flagged(bio, BIO_UPTODATE);
+
+ if (!error && !uptodate) {
+ drbd_warn(device, "p %s: setting error to -EIO\n",
+ bio_data_dir(bio) == WRITE ? "write" : "read");
+ /* strange behavior of some lower level drivers...
+ * fail the request by clearing the uptodate flag,
+ * but do not return any error?! */
+ error = -EIO;
+ }
+
+
+ /* If this request was aborted locally before,
+ * but now was completed "successfully",
+ * chances are that this caused arbitrary data corruption.
+ *
+ * "aborting" requests, or force-detaching the disk, is intended for
+ * completely blocked/hung local backing devices which do no longer
+ * complete requests at all, not even do error completions. In this
+ * situation, usually a hard-reset and failover is the only way out.
+ *
+ * By "aborting", basically faking a local error-completion,
+ * we allow for a more graceful swichover by cleanly migrating services.
+ * Still the affected node has to be rebooted "soon".
+ *
+ * By completing these requests, we allow the upper layers to re-use
+ * the associated data pages.
+ *
+ * If later the local backing device "recovers", and now DMAs some data
+ * from disk into the original request pages, in the best case it will
+ * just put random data into unused pages; but typically it will corrupt
+ * meanwhile completely unrelated data, causing all sorts of damage.
+ *
+ * Which means delayed successful completion,
+ * especially for READ requests,
+ * is a reason to panic().
+ *
+ * We assume that a delayed *error* completion is OK,
+ * though we still will complain noisily about it.
+ */
+ if (unlikely(req->rq_state & RQ_LOCAL_ABORTED)) {
+ if (__ratelimit(&drbd_ratelimit_state))
+ drbd_emerg(device, "delayed completion of aborted local request; disk-timeout may be too aggressive\n");
+
+ if (!error)
+ panic("possible random memory corruption caused by delayed completion of aborted local request\n");
+ }
+
+ /* to avoid recursion in __req_mod */
+ if (unlikely(error)) {
+ if (bio->bi_rw & REQ_DISCARD)
+ what = (error == -EOPNOTSUPP)
+ ? DISCARD_COMPLETED_NOTSUPP
+ : DISCARD_COMPLETED_WITH_ERROR;
+ else
+ what = (bio_data_dir(bio) == WRITE)
+ ? WRITE_COMPLETED_WITH_ERROR
+ : (bio_rw(bio) == READ)
+ ? READ_COMPLETED_WITH_ERROR
+ : READ_AHEAD_COMPLETED_WITH_ERROR;
+ } else
+ what = COMPLETED_OK;
+
+ bio_put(req->private_bio);
+ req->private_bio = ERR_PTR(error);
+
+ /* not req_mod(), we need irqsave here! */
+ spin_lock_irqsave(&device->resource->req_lock, flags);
+ __req_mod(req, what, &m);
+ spin_unlock_irqrestore(&device->resource->req_lock, flags);
+ put_ldev(device);
+
+ if (m.bio)
+ complete_master_bio(device, &m);
+}
+
+void drbd_csum_ee(struct crypto_hash *tfm, struct drbd_peer_request *peer_req, void *digest)
+{
+ struct hash_desc desc;
+ struct scatterlist sg;
+ struct page *page = peer_req->pages;
+ struct page *tmp;
+ unsigned len;
+
+ desc.tfm = tfm;
+ desc.flags = 0;
+
+ sg_init_table(&sg, 1);
+ crypto_hash_init(&desc);
+
+ while ((tmp = page_chain_next(page))) {
+ /* all but the last page will be fully used */
+ sg_set_page(&sg, page, PAGE_SIZE, 0);
+ crypto_hash_update(&desc, &sg, sg.length);
+ page = tmp;
+ }
+ /* and now the last, possibly only partially used page */
+ len = peer_req->i.size & (PAGE_SIZE - 1);
+ sg_set_page(&sg, page, len ?: PAGE_SIZE, 0);
+ crypto_hash_update(&desc, &sg, sg.length);
+ crypto_hash_final(&desc, digest);
+}
+
+void drbd_csum_bio(struct crypto_hash *tfm, struct bio *bio, void *digest)
+{
+ struct hash_desc desc;
+ struct scatterlist sg;
+ struct bio_vec bvec;
+ struct bvec_iter iter;
+
+ desc.tfm = tfm;
+ desc.flags = 0;
+
+ sg_init_table(&sg, 1);
+ crypto_hash_init(&desc);
+
+ bio_for_each_segment(bvec, bio, iter) {
+ sg_set_page(&sg, bvec.bv_page, bvec.bv_len, bvec.bv_offset);
+ crypto_hash_update(&desc, &sg, sg.length);
+ }
+ crypto_hash_final(&desc, digest);
+}
+
+/* MAYBE merge common code with w_e_end_ov_req */
+static int w_e_send_csum(struct drbd_work *w, int cancel)
+{
+ struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
+ struct drbd_peer_device *peer_device = peer_req->peer_device;
+ struct drbd_device *device = peer_device->device;
+ int digest_size;
+ void *digest;
+ int err = 0;
+
+ if (unlikely(cancel))
+ goto out;
+
+ if (unlikely((peer_req->flags & EE_WAS_ERROR) != 0))
+ goto out;
+
+ digest_size = crypto_hash_digestsize(peer_device->connection->csums_tfm);
+ digest = kmalloc(digest_size, GFP_NOIO);
+ if (digest) {
+ sector_t sector = peer_req->i.sector;
+ unsigned int size = peer_req->i.size;
+ drbd_csum_ee(peer_device->connection->csums_tfm, peer_req, digest);
+ /* Free peer_req and pages before send.
+ * In case we block on congestion, we could otherwise run into
+ * some distributed deadlock, if the other side blocks on
+ * congestion as well, because our receiver blocks in
+ * drbd_alloc_pages due to pp_in_use > max_buffers. */
+ drbd_free_peer_req(device, peer_req);
+ peer_req = NULL;
+ inc_rs_pending(device);
+ err = drbd_send_drequest_csum(peer_device, sector, size,
+ digest, digest_size,
+ P_CSUM_RS_REQUEST);
+ kfree(digest);
+ } else {
+ drbd_err(device, "kmalloc() of digest failed.\n");
+ err = -ENOMEM;
+ }
+
+out:
+ if (peer_req)
+ drbd_free_peer_req(device, peer_req);
+
+ if (unlikely(err))
+ drbd_err(device, "drbd_send_drequest(..., csum) failed\n");
+ return err;
+}
+
+#define GFP_TRY (__GFP_HIGHMEM | __GFP_NOWARN)
+
+static int read_for_csum(struct drbd_peer_device *peer_device, sector_t sector, int size)
+{
+ struct drbd_device *device = peer_device->device;
+ struct drbd_peer_request *peer_req;
+
+ if (!get_ldev(device))
+ return -EIO;
+
+ /* GFP_TRY, because if there is no memory available right now, this may
+ * be rescheduled for later. It is "only" background resync, after all. */
+ peer_req = drbd_alloc_peer_req(peer_device, ID_SYNCER /* unused */, sector,
+ size, true /* has real payload */, GFP_TRY);
+ if (!peer_req)
+ goto defer;
+
+ peer_req->w.cb = w_e_send_csum;
+ spin_lock_irq(&device->resource->req_lock);
+ list_add_tail(&peer_req->w.list, &device->read_ee);
+ spin_unlock_irq(&device->resource->req_lock);
+
+ atomic_add(size >> 9, &device->rs_sect_ev);
+ if (drbd_submit_peer_request(device, peer_req, READ, DRBD_FAULT_RS_RD) == 0)
+ return 0;
+
+ /* If it failed because of ENOMEM, retry should help. If it failed
+ * because bio_add_page failed (probably broken lower level driver),
+ * retry may or may not help.
+ * If it does not, you may need to force disconnect. */
+ spin_lock_irq(&device->resource->req_lock);
+ list_del(&peer_req->w.list);
+ spin_unlock_irq(&device->resource->req_lock);
+
+ drbd_free_peer_req(device, peer_req);
+defer:
+ put_ldev(device);
+ return -EAGAIN;
+}
+
+int w_resync_timer(struct drbd_work *w, int cancel)
+{
+ struct drbd_device *device =
+ container_of(w, struct drbd_device, resync_work);
+
+ switch (device->state.conn) {
+ case C_VERIFY_S:
+ make_ov_request(device, cancel);
+ break;
+ case C_SYNC_TARGET:
+ make_resync_request(device, cancel);
+ break;
+ }
+
+ return 0;
+}
+
+void resync_timer_fn(unsigned long data)
+{
+ struct drbd_device *device = (struct drbd_device *) data;
+
+ drbd_queue_work_if_unqueued(
+ &first_peer_device(device)->connection->sender_work,
+ &device->resync_work);
+}
+
+static void fifo_set(struct fifo_buffer *fb, int value)
+{
+ int i;
+
+ for (i = 0; i < fb->size; i++)
+ fb->values[i] = value;
+}
+
+static int fifo_push(struct fifo_buffer *fb, int value)
+{
+ int ov;
+
+ ov = fb->values[fb->head_index];
+ fb->values[fb->head_index++] = value;
+
+ if (fb->head_index >= fb->size)
+ fb->head_index = 0;
+
+ return ov;
+}
+
+static void fifo_add_val(struct fifo_buffer *fb, int value)
+{
+ int i;
+
+ for (i = 0; i < fb->size; i++)
+ fb->values[i] += value;
+}
+
+struct fifo_buffer *fifo_alloc(int fifo_size)
+{
+ struct fifo_buffer *fb;
+
+ fb = kzalloc(sizeof(struct fifo_buffer) + sizeof(int) * fifo_size, GFP_NOIO);
+ if (!fb)
+ return NULL;
+
+ fb->head_index = 0;
+ fb->size = fifo_size;
+ fb->total = 0;
+
+ return fb;
+}
+
+static int drbd_rs_controller(struct drbd_device *device, unsigned int sect_in)
+{
+ struct disk_conf *dc;
+ unsigned int want; /* The number of sectors we want in-flight */
+ int req_sect; /* Number of sectors to request in this turn */
+ int correction; /* Number of sectors more we need in-flight */
+ int cps; /* correction per invocation of drbd_rs_controller() */
+ int steps; /* Number of time steps to plan ahead */
+ int curr_corr;
+ int max_sect;
+ struct fifo_buffer *plan;
+
+ dc = rcu_dereference(device->ldev->disk_conf);
+ plan = rcu_dereference(device->rs_plan_s);
+
+ steps = plan->size; /* (dc->c_plan_ahead * 10 * SLEEP_TIME) / HZ; */
+
+ if (device->rs_in_flight + sect_in == 0) { /* At start of resync */
+ want = ((dc->resync_rate * 2 * SLEEP_TIME) / HZ) * steps;
+ } else { /* normal path */
+ want = dc->c_fill_target ? dc->c_fill_target :
+ sect_in * dc->c_delay_target * HZ / (SLEEP_TIME * 10);
+ }
+
+ correction = want - device->rs_in_flight - plan->total;
+
+ /* Plan ahead */
+ cps = correction / steps;
+ fifo_add_val(plan, cps);
+ plan->total += cps * steps;
+
+ /* What we do in this step */
+ curr_corr = fifo_push(plan, 0);
+ plan->total -= curr_corr;
+
+ req_sect = sect_in + curr_corr;
+ if (req_sect < 0)
+ req_sect = 0;
+
+ max_sect = (dc->c_max_rate * 2 * SLEEP_TIME) / HZ;
+ if (req_sect > max_sect)
+ req_sect = max_sect;
+
+ /*
+ drbd_warn(device, "si=%u if=%d wa=%u co=%d st=%d cps=%d pl=%d cc=%d rs=%d\n",
+ sect_in, device->rs_in_flight, want, correction,
+ steps, cps, device->rs_planed, curr_corr, req_sect);
+ */
+
+ return req_sect;
+}
+
+static int drbd_rs_number_requests(struct drbd_device *device)
+{
+ unsigned int sect_in; /* Number of sectors that came in since the last turn */
+ int number, mxb;
+
+ sect_in = atomic_xchg(&device->rs_sect_in, 0);
+ device->rs_in_flight -= sect_in;
+
+ rcu_read_lock();
+ mxb = drbd_get_max_buffers(device) / 2;
+ if (rcu_dereference(device->rs_plan_s)->size) {
+ number = drbd_rs_controller(device, sect_in) >> (BM_BLOCK_SHIFT - 9);
+ device->c_sync_rate = number * HZ * (BM_BLOCK_SIZE / 1024) / SLEEP_TIME;
+ } else {
+ device->c_sync_rate = rcu_dereference(device->ldev->disk_conf)->resync_rate;
+ number = SLEEP_TIME * device->c_sync_rate / ((BM_BLOCK_SIZE / 1024) * HZ);
+ }
+ rcu_read_unlock();
+
+ /* Don't have more than "max-buffers"/2 in-flight.
+ * Otherwise we may cause the remote site to stall on drbd_alloc_pages(),
+ * potentially causing a distributed deadlock on congestion during
+ * online-verify or (checksum-based) resync, if max-buffers,
+ * socket buffer sizes and resync rate settings are mis-configured. */
+
+ /* note that "number" is in units of "BM_BLOCK_SIZE" (which is 4k),
+ * mxb (as used here, and in drbd_alloc_pages on the peer) is
+ * "number of pages" (typically also 4k),
+ * but "rs_in_flight" is in "sectors" (512 Byte). */
+ if (mxb - device->rs_in_flight/8 < number)
+ number = mxb - device->rs_in_flight/8;
+
+ return number;
+}
+
+static int make_resync_request(struct drbd_device *const device, int cancel)
+{
+ struct drbd_peer_device *const peer_device = first_peer_device(device);
+ struct drbd_connection *const connection = peer_device ? peer_device->connection : NULL;
+ unsigned long bit;
+ sector_t sector;
+ const sector_t capacity = drbd_get_capacity(device->this_bdev);
+ int max_bio_size;
+ int number, rollback_i, size;
+ int align, requeue = 0;
+ int i = 0;
+
+ if (unlikely(cancel))
+ return 0;
+
+ if (device->rs_total == 0) {
+ /* empty resync? */
+ drbd_resync_finished(device);
+ return 0;
+ }
+
+ if (!get_ldev(device)) {
+ /* Since we only need to access device->rsync a
+ get_ldev_if_state(device,D_FAILED) would be sufficient, but
+ to continue resync with a broken disk makes no sense at
+ all */
+ drbd_err(device, "Disk broke down during resync!\n");
+ return 0;
+ }
+
+ max_bio_size = queue_max_hw_sectors(device->rq_queue) << 9;
+ number = drbd_rs_number_requests(device);
+ if (number <= 0)
+ goto requeue;
+
+ for (i = 0; i < number; i++) {
+ /* Stop generating RS requests when half of the send buffer is filled,
+ * but notify TCP that we'd like to have more space. */
+ mutex_lock(&connection->data.mutex);
+ if (connection->data.socket) {
+ struct sock *sk = connection->data.socket->sk;
+ int queued = sk->sk_wmem_queued;
+ int sndbuf = sk->sk_sndbuf;
+ if (queued > sndbuf / 2) {
+ requeue = 1;
+ if (sk->sk_socket)
+ set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+ }
+ } else
+ requeue = 1;
+ mutex_unlock(&connection->data.mutex);
+ if (requeue)
+ goto requeue;
+
+next_sector:
+ size = BM_BLOCK_SIZE;
+ bit = drbd_bm_find_next(device, device->bm_resync_fo);
+
+ if (bit == DRBD_END_OF_BITMAP) {
+ device->bm_resync_fo = drbd_bm_bits(device);
+ put_ldev(device);
+ return 0;
+ }
+
+ sector = BM_BIT_TO_SECT(bit);
+
+ if (drbd_try_rs_begin_io(device, sector)) {
+ device->bm_resync_fo = bit;
+ goto requeue;
+ }
+ device->bm_resync_fo = bit + 1;
+
+ if (unlikely(drbd_bm_test_bit(device, bit) == 0)) {
+ drbd_rs_complete_io(device, sector);
+ goto next_sector;
+ }
+
+#if DRBD_MAX_BIO_SIZE > BM_BLOCK_SIZE
+ /* try to find some adjacent bits.
+ * we stop if we have already the maximum req size.
+ *
+ * Additionally always align bigger requests, in order to
+ * be prepared for all stripe sizes of software RAIDs.
+ */
+ align = 1;
+ rollback_i = i;
+ while (i < number) {
+ if (size + BM_BLOCK_SIZE > max_bio_size)
+ break;
+
+ /* Be always aligned */
+ if (sector & ((1<<(align+3))-1))
+ break;
+
+ /* do not cross extent boundaries */
+ if (((bit+1) & BM_BLOCKS_PER_BM_EXT_MASK) == 0)
+ break;
+ /* now, is it actually dirty, after all?
+ * caution, drbd_bm_test_bit is tri-state for some
+ * obscure reason; ( b == 0 ) would get the out-of-band
+ * only accidentally right because of the "oddly sized"
+ * adjustment below */
+ if (drbd_bm_test_bit(device, bit+1) != 1)
+ break;
+ bit++;
+ size += BM_BLOCK_SIZE;
+ if ((BM_BLOCK_SIZE << align) <= size)
+ align++;
+ i++;
+ }
+ /* if we merged some,
+ * reset the offset to start the next drbd_bm_find_next from */
+ if (size > BM_BLOCK_SIZE)
+ device->bm_resync_fo = bit + 1;
+#endif
+
+ /* adjust very last sectors, in case we are oddly sized */
+ if (sector + (size>>9) > capacity)
+ size = (capacity-sector)<<9;
+
+ if (device->use_csums) {
+ switch (read_for_csum(peer_device, sector, size)) {
+ case -EIO: /* Disk failure */
+ put_ldev(device);
+ return -EIO;
+ case -EAGAIN: /* allocation failed, or ldev busy */
+ drbd_rs_complete_io(device, sector);
+ device->bm_resync_fo = BM_SECT_TO_BIT(sector);
+ i = rollback_i;
+ goto requeue;
+ case 0:
+ /* everything ok */
+ break;
+ default:
+ BUG();
+ }
+ } else {
+ int err;
+
+ inc_rs_pending(device);
+ err = drbd_send_drequest(peer_device, P_RS_DATA_REQUEST,
+ sector, size, ID_SYNCER);
+ if (err) {
+ drbd_err(device, "drbd_send_drequest() failed, aborting...\n");
+ dec_rs_pending(device);
+ put_ldev(device);
+ return err;
+ }
+ }
+ }
+
+ if (device->bm_resync_fo >= drbd_bm_bits(device)) {
+ /* last syncer _request_ was sent,
+ * but the P_RS_DATA_REPLY not yet received. sync will end (and
+ * next sync group will resume), as soon as we receive the last
+ * resync data block, and the last bit is cleared.
+ * until then resync "work" is "inactive" ...
+ */
+ put_ldev(device);
+ return 0;
+ }
+
+ requeue:
+ device->rs_in_flight += (i << (BM_BLOCK_SHIFT - 9));
+ mod_timer(&device->resync_timer, jiffies + SLEEP_TIME);
+ put_ldev(device);
+ return 0;
+}
+
+static int make_ov_request(struct drbd_device *device, int cancel)
+{
+ int number, i, size;
+ sector_t sector;
+ const sector_t capacity = drbd_get_capacity(device->this_bdev);
+ bool stop_sector_reached = false;
+
+ if (unlikely(cancel))
+ return 1;
+
+ number = drbd_rs_number_requests(device);
+
+ sector = device->ov_position;
+ for (i = 0; i < number; i++) {
+ if (sector >= capacity)
+ return 1;
+
+ /* We check for "finished" only in the reply path:
+ * w_e_end_ov_reply().
+ * We need to send at least one request out. */
+ stop_sector_reached = i > 0
+ && verify_can_do_stop_sector(device)
+ && sector >= device->ov_stop_sector;
+ if (stop_sector_reached)
+ break;
+
+ size = BM_BLOCK_SIZE;
+
+ if (drbd_try_rs_begin_io(device, sector)) {
+ device->ov_position = sector;
+ goto requeue;
+ }
+
+ if (sector + (size>>9) > capacity)
+ size = (capacity-sector)<<9;
+
+ inc_rs_pending(device);
+ if (drbd_send_ov_request(first_peer_device(device), sector, size)) {
+ dec_rs_pending(device);
+ return 0;
+ }
+ sector += BM_SECT_PER_BIT;
+ }
+ device->ov_position = sector;
+
+ requeue:
+ device->rs_in_flight += (i << (BM_BLOCK_SHIFT - 9));
+ if (i == 0 || !stop_sector_reached)
+ mod_timer(&device->resync_timer, jiffies + SLEEP_TIME);
+ return 1;
+}
+
+int w_ov_finished(struct drbd_work *w, int cancel)
+{
+ struct drbd_device_work *dw =
+ container_of(w, struct drbd_device_work, w);
+ struct drbd_device *device = dw->device;
+ kfree(dw);
+ ov_out_of_sync_print(device);
+ drbd_resync_finished(device);
+
+ return 0;
+}
+
+static int w_resync_finished(struct drbd_work *w, int cancel)
+{
+ struct drbd_device_work *dw =
+ container_of(w, struct drbd_device_work, w);
+ struct drbd_device *device = dw->device;
+ kfree(dw);
+
+ drbd_resync_finished(device);
+
+ return 0;
+}
+
+static void ping_peer(struct drbd_device *device)
+{
+ struct drbd_connection *connection = first_peer_device(device)->connection;
+
+ clear_bit(GOT_PING_ACK, &connection->flags);
+ request_ping(connection);
+ wait_event(connection->ping_wait,
+ test_bit(GOT_PING_ACK, &connection->flags) || device->state.conn < C_CONNECTED);
+}
+
+int drbd_resync_finished(struct drbd_device *device)
+{
+ unsigned long db, dt, dbdt;
+ unsigned long n_oos;
+ union drbd_state os, ns;
+ struct drbd_device_work *dw;
+ char *khelper_cmd = NULL;
+ int verify_done = 0;
+
+ /* Remove all elements from the resync LRU. Since future actions
+ * might set bits in the (main) bitmap, then the entries in the
+ * resync LRU would be wrong. */
+ if (drbd_rs_del_all(device)) {
+ /* In case this is not possible now, most probably because
+ * there are P_RS_DATA_REPLY Packets lingering on the worker's
+ * queue (or even the read operations for those packets
+ * is not finished by now). Retry in 100ms. */
+
+ schedule_timeout_interruptible(HZ / 10);
+ dw = kmalloc(sizeof(struct drbd_device_work), GFP_ATOMIC);
+ if (dw) {
+ dw->w.cb = w_resync_finished;
+ dw->device = device;
+ drbd_queue_work(&first_peer_device(device)->connection->sender_work,
+ &dw->w);
+ return 1;
+ }
+ drbd_err(device, "Warn failed to drbd_rs_del_all() and to kmalloc(dw).\n");
+ }
+
+ dt = (jiffies - device->rs_start - device->rs_paused) / HZ;
+ if (dt <= 0)
+ dt = 1;
+
+ db = device->rs_total;
+ /* adjust for verify start and stop sectors, respective reached position */
+ if (device->state.conn == C_VERIFY_S || device->state.conn == C_VERIFY_T)
+ db -= device->ov_left;
+
+ dbdt = Bit2KB(db/dt);
+ device->rs_paused /= HZ;
+
+ if (!get_ldev(device))
+ goto out;
+
+ ping_peer(device);
+
+ spin_lock_irq(&device->resource->req_lock);
+ os = drbd_read_state(device);
+
+ verify_done = (os.conn == C_VERIFY_S || os.conn == C_VERIFY_T);
+
+ /* This protects us against multiple calls (that can happen in the presence
+ of application IO), and against connectivity loss just before we arrive here. */
+ if (os.conn <= C_CONNECTED)
+ goto out_unlock;
+
+ ns = os;
+ ns.conn = C_CONNECTED;
+
+ drbd_info(device, "%s done (total %lu sec; paused %lu sec; %lu K/sec)\n",
+ verify_done ? "Online verify" : "Resync",
+ dt + device->rs_paused, device->rs_paused, dbdt);
+
+ n_oos = drbd_bm_total_weight(device);
+
+ if (os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) {
+ if (n_oos) {
+ drbd_alert(device, "Online verify found %lu %dk block out of sync!\n",
+ n_oos, Bit2KB(1));
+ khelper_cmd = "out-of-sync";
+ }
+ } else {
+ D_ASSERT(device, (n_oos - device->rs_failed) == 0);
+
+ if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T)
+ khelper_cmd = "after-resync-target";
+
+ if (device->use_csums && device->rs_total) {
+ const unsigned long s = device->rs_same_csum;
+ const unsigned long t = device->rs_total;
+ const int ratio =
+ (t == 0) ? 0 :
+ (t < 100000) ? ((s*100)/t) : (s/(t/100));
+ drbd_info(device, "%u %% had equal checksums, eliminated: %luK; "
+ "transferred %luK total %luK\n",
+ ratio,
+ Bit2KB(device->rs_same_csum),
+ Bit2KB(device->rs_total - device->rs_same_csum),
+ Bit2KB(device->rs_total));
+ }
+ }
+
+ if (device->rs_failed) {
+ drbd_info(device, " %lu failed blocks\n", device->rs_failed);
+
+ if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T) {
+ ns.disk = D_INCONSISTENT;
+ ns.pdsk = D_UP_TO_DATE;
+ } else {
+ ns.disk = D_UP_TO_DATE;
+ ns.pdsk = D_INCONSISTENT;
+ }
+ } else {
+ ns.disk = D_UP_TO_DATE;
+ ns.pdsk = D_UP_TO_DATE;
+
+ if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T) {
+ if (device->p_uuid) {
+ int i;
+ for (i = UI_BITMAP ; i <= UI_HISTORY_END ; i++)
+ _drbd_uuid_set(device, i, device->p_uuid[i]);
+ drbd_uuid_set(device, UI_BITMAP, device->ldev->md.uuid[UI_CURRENT]);
+ _drbd_uuid_set(device, UI_CURRENT, device->p_uuid[UI_CURRENT]);
+ } else {
+ drbd_err(device, "device->p_uuid is NULL! BUG\n");
+ }
+ }
+
+ if (!(os.conn == C_VERIFY_S || os.conn == C_VERIFY_T)) {
+ /* for verify runs, we don't update uuids here,
+ * so there would be nothing to report. */
+ drbd_uuid_set_bm(device, 0UL);
+ drbd_print_uuids(device, "updated UUIDs");
+ if (device->p_uuid) {
+ /* Now the two UUID sets are equal, update what we
+ * know of the peer. */
+ int i;
+ for (i = UI_CURRENT ; i <= UI_HISTORY_END ; i++)
+ device->p_uuid[i] = device->ldev->md.uuid[i];
+ }
+ }
+ }
+
+ _drbd_set_state(device, ns, CS_VERBOSE, NULL);
+out_unlock:
+ spin_unlock_irq(&device->resource->req_lock);
+ put_ldev(device);
+out:
+ device->rs_total = 0;
+ device->rs_failed = 0;
+ device->rs_paused = 0;
+
+ /* reset start sector, if we reached end of device */
+ if (verify_done && device->ov_left == 0)
+ device->ov_start_sector = 0;
+
+ drbd_md_sync(device);
+
+ if (khelper_cmd)
+ drbd_khelper(device, khelper_cmd);
+
+ return 1;
+}
+
+/* helper */
+static void move_to_net_ee_or_free(struct drbd_device *device, struct drbd_peer_request *peer_req)
+{
+ if (drbd_peer_req_has_active_page(peer_req)) {
+ /* This might happen if sendpage() has not finished */
+ int i = (peer_req->i.size + PAGE_SIZE -1) >> PAGE_SHIFT;
+ atomic_add(i, &device->pp_in_use_by_net);
+ atomic_sub(i, &device->pp_in_use);
+ spin_lock_irq(&device->resource->req_lock);
+ list_add_tail(&peer_req->w.list, &device->net_ee);
+ spin_unlock_irq(&device->resource->req_lock);
+ wake_up(&drbd_pp_wait);
+ } else
+ drbd_free_peer_req(device, peer_req);
+}
+
+/**
+ * w_e_end_data_req() - Worker callback, to send a P_DATA_REPLY packet in response to a P_DATA_REQUEST
+ * @device: DRBD device.
+ * @w: work object.
+ * @cancel: The connection will be closed anyways
+ */
+int w_e_end_data_req(struct drbd_work *w, int cancel)
+{
+ struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
+ struct drbd_peer_device *peer_device = peer_req->peer_device;
+ struct drbd_device *device = peer_device->device;
+ int err;
+
+ if (unlikely(cancel)) {
+ drbd_free_peer_req(device, peer_req);
+ dec_unacked(device);
+ return 0;
+ }
+
+ if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
+ err = drbd_send_block(peer_device, P_DATA_REPLY, peer_req);
+ } else {
+ if (__ratelimit(&drbd_ratelimit_state))
+ drbd_err(device, "Sending NegDReply. sector=%llus.\n",
+ (unsigned long long)peer_req->i.sector);
+
+ err = drbd_send_ack(peer_device, P_NEG_DREPLY, peer_req);
+ }
+
+ dec_unacked(device);
+
+ move_to_net_ee_or_free(device, peer_req);
+
+ if (unlikely(err))
+ drbd_err(device, "drbd_send_block() failed\n");
+ return err;
+}
+
+/**
+ * w_e_end_rsdata_req() - Worker callback to send a P_RS_DATA_REPLY packet in response to a P_RS_DATA_REQUEST
+ * @w: work object.
+ * @cancel: The connection will be closed anyways
+ */
+int w_e_end_rsdata_req(struct drbd_work *w, int cancel)
+{
+ struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
+ struct drbd_peer_device *peer_device = peer_req->peer_device;
+ struct drbd_device *device = peer_device->device;
+ int err;
+
+ if (unlikely(cancel)) {
+ drbd_free_peer_req(device, peer_req);
+ dec_unacked(device);
+ return 0;
+ }
+
+ if (get_ldev_if_state(device, D_FAILED)) {
+ drbd_rs_complete_io(device, peer_req->i.sector);
+ put_ldev(device);
+ }
+
+ if (device->state.conn == C_AHEAD) {
+ err = drbd_send_ack(peer_device, P_RS_CANCEL, peer_req);
+ } else if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
+ if (likely(device->state.pdsk >= D_INCONSISTENT)) {
+ inc_rs_pending(device);
+ err = drbd_send_block(peer_device, P_RS_DATA_REPLY, peer_req);
+ } else {
+ if (__ratelimit(&drbd_ratelimit_state))
+ drbd_err(device, "Not sending RSDataReply, "
+ "partner DISKLESS!\n");
+ err = 0;
+ }
+ } else {
+ if (__ratelimit(&drbd_ratelimit_state))
+ drbd_err(device, "Sending NegRSDReply. sector %llus.\n",
+ (unsigned long long)peer_req->i.sector);
+
+ err = drbd_send_ack(peer_device, P_NEG_RS_DREPLY, peer_req);
+
+ /* update resync data with failure */
+ drbd_rs_failed_io(device, peer_req->i.sector, peer_req->i.size);
+ }
+
+ dec_unacked(device);
+
+ move_to_net_ee_or_free(device, peer_req);
+
+ if (unlikely(err))
+ drbd_err(device, "drbd_send_block() failed\n");
+ return err;
+}
+
+int w_e_end_csum_rs_req(struct drbd_work *w, int cancel)
+{
+ struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
+ struct drbd_peer_device *peer_device = peer_req->peer_device;
+ struct drbd_device *device = peer_device->device;
+ struct digest_info *di;
+ int digest_size;
+ void *digest = NULL;
+ int err, eq = 0;
+
+ if (unlikely(cancel)) {
+ drbd_free_peer_req(device, peer_req);
+ dec_unacked(device);
+ return 0;
+ }
+
+ if (get_ldev(device)) {
+ drbd_rs_complete_io(device, peer_req->i.sector);
+ put_ldev(device);
+ }
+
+ di = peer_req->digest;
+
+ if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
+ /* quick hack to try to avoid a race against reconfiguration.
+ * a real fix would be much more involved,
+ * introducing more locking mechanisms */
+ if (peer_device->connection->csums_tfm) {
+ digest_size = crypto_hash_digestsize(peer_device->connection->csums_tfm);
+ D_ASSERT(device, digest_size == di->digest_size);
+ digest = kmalloc(digest_size, GFP_NOIO);
+ }
+ if (digest) {
+ drbd_csum_ee(peer_device->connection->csums_tfm, peer_req, digest);
+ eq = !memcmp(digest, di->digest, digest_size);
+ kfree(digest);
+ }
+
+ if (eq) {
+ drbd_set_in_sync(device, peer_req->i.sector, peer_req->i.size);
+ /* rs_same_csums unit is BM_BLOCK_SIZE */
+ device->rs_same_csum += peer_req->i.size >> BM_BLOCK_SHIFT;
+ err = drbd_send_ack(peer_device, P_RS_IS_IN_SYNC, peer_req);
+ } else {
+ inc_rs_pending(device);
+ peer_req->block_id = ID_SYNCER; /* By setting block_id, digest pointer becomes invalid! */
+ peer_req->flags &= ~EE_HAS_DIGEST; /* This peer request no longer has a digest pointer */
+ kfree(di);
+ err = drbd_send_block(peer_device, P_RS_DATA_REPLY, peer_req);
+ }
+ } else {
+ err = drbd_send_ack(peer_device, P_NEG_RS_DREPLY, peer_req);
+ if (__ratelimit(&drbd_ratelimit_state))
+ drbd_err(device, "Sending NegDReply. I guess it gets messy.\n");
+ }
+
+ dec_unacked(device);
+ move_to_net_ee_or_free(device, peer_req);
+
+ if (unlikely(err))
+ drbd_err(device, "drbd_send_block/ack() failed\n");
+ return err;
+}
+
+int w_e_end_ov_req(struct drbd_work *w, int cancel)
+{
+ struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
+ struct drbd_peer_device *peer_device = peer_req->peer_device;
+ struct drbd_device *device = peer_device->device;
+ sector_t sector = peer_req->i.sector;
+ unsigned int size = peer_req->i.size;
+ int digest_size;
+ void *digest;
+ int err = 0;
+
+ if (unlikely(cancel))
+ goto out;
+
+ digest_size = crypto_hash_digestsize(peer_device->connection->verify_tfm);
+ digest = kmalloc(digest_size, GFP_NOIO);
+ if (!digest) {
+ err = 1; /* terminate the connection in case the allocation failed */
+ goto out;
+ }
+
+ if (likely(!(peer_req->flags & EE_WAS_ERROR)))
+ drbd_csum_ee(peer_device->connection->verify_tfm, peer_req, digest);
+ else
+ memset(digest, 0, digest_size);
+
+ /* Free e and pages before send.
+ * In case we block on congestion, we could otherwise run into
+ * some distributed deadlock, if the other side blocks on
+ * congestion as well, because our receiver blocks in
+ * drbd_alloc_pages due to pp_in_use > max_buffers. */
+ drbd_free_peer_req(device, peer_req);
+ peer_req = NULL;
+ inc_rs_pending(device);
+ err = drbd_send_drequest_csum(peer_device, sector, size, digest, digest_size, P_OV_REPLY);
+ if (err)
+ dec_rs_pending(device);
+ kfree(digest);
+
+out:
+ if (peer_req)
+ drbd_free_peer_req(device, peer_req);
+ dec_unacked(device);
+ return err;
+}
+
+void drbd_ov_out_of_sync_found(struct drbd_device *device, sector_t sector, int size)
+{
+ if (device->ov_last_oos_start + device->ov_last_oos_size == sector) {
+ device->ov_last_oos_size += size>>9;
+ } else {
+ device->ov_last_oos_start = sector;
+ device->ov_last_oos_size = size>>9;
+ }
+ drbd_set_out_of_sync(device, sector, size);
+}
+
+int w_e_end_ov_reply(struct drbd_work *w, int cancel)
+{
+ struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
+ struct drbd_peer_device *peer_device = peer_req->peer_device;
+ struct drbd_device *device = peer_device->device;
+ struct digest_info *di;
+ void *digest;
+ sector_t sector = peer_req->i.sector;
+ unsigned int size = peer_req->i.size;
+ int digest_size;
+ int err, eq = 0;
+ bool stop_sector_reached = false;
+
+ if (unlikely(cancel)) {
+ drbd_free_peer_req(device, peer_req);
+ dec_unacked(device);
+ return 0;
+ }
+
+ /* after "cancel", because after drbd_disconnect/drbd_rs_cancel_all
+ * the resync lru has been cleaned up already */
+ if (get_ldev(device)) {
+ drbd_rs_complete_io(device, peer_req->i.sector);
+ put_ldev(device);
+ }
+
+ di = peer_req->digest;
+
+ if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
+ digest_size = crypto_hash_digestsize(peer_device->connection->verify_tfm);
+ digest = kmalloc(digest_size, GFP_NOIO);
+ if (digest) {
+ drbd_csum_ee(peer_device->connection->verify_tfm, peer_req, digest);
+
+ D_ASSERT(device, digest_size == di->digest_size);
+ eq = !memcmp(digest, di->digest, digest_size);
+ kfree(digest);
+ }
+ }
+
+ /* Free peer_req and pages before send.
+ * In case we block on congestion, we could otherwise run into
+ * some distributed deadlock, if the other side blocks on
+ * congestion as well, because our receiver blocks in
+ * drbd_alloc_pages due to pp_in_use > max_buffers. */
+ drbd_free_peer_req(device, peer_req);
+ if (!eq)
+ drbd_ov_out_of_sync_found(device, sector, size);
+ else
+ ov_out_of_sync_print(device);
+
+ err = drbd_send_ack_ex(peer_device, P_OV_RESULT, sector, size,
+ eq ? ID_IN_SYNC : ID_OUT_OF_SYNC);
+
+ dec_unacked(device);
+
+ --device->ov_left;
+
+ /* let's advance progress step marks only for every other megabyte */
+ if ((device->ov_left & 0x200) == 0x200)
+ drbd_advance_rs_marks(device, device->ov_left);
+
+ stop_sector_reached = verify_can_do_stop_sector(device) &&
+ (sector + (size>>9)) >= device->ov_stop_sector;
+
+ if (device->ov_left == 0 || stop_sector_reached) {
+ ov_out_of_sync_print(device);
+ drbd_resync_finished(device);
+ }
+
+ return err;
+}
+
+/* FIXME
+ * We need to track the number of pending barrier acks,
+ * and to be able to wait for them.
+ * See also comment in drbd_adm_attach before drbd_suspend_io.
+ */
+static int drbd_send_barrier(struct drbd_connection *connection)
+{
+ struct p_barrier *p;
+ struct drbd_socket *sock;
+
+ sock = &connection->data;
+ p = conn_prepare_command(connection, sock);
+ if (!p)
+ return -EIO;
+ p->barrier = connection->send.current_epoch_nr;
+ p->pad = 0;
+ connection->send.current_epoch_writes = 0;
+
+ return conn_send_command(connection, sock, P_BARRIER, sizeof(*p), NULL, 0);
+}
+
+int w_send_write_hint(struct drbd_work *w, int cancel)
+{
+ struct drbd_device *device =
+ container_of(w, struct drbd_device, unplug_work);
+ struct drbd_socket *sock;
+
+ if (cancel)
+ return 0;
+ sock = &first_peer_device(device)->connection->data;
+ if (!drbd_prepare_command(first_peer_device(device), sock))
+ return -EIO;
+ return drbd_send_command(first_peer_device(device), sock, P_UNPLUG_REMOTE, 0, NULL, 0);
+}
+
+static void re_init_if_first_write(struct drbd_connection *connection, unsigned int epoch)
+{
+ if (!connection->send.seen_any_write_yet) {
+ connection->send.seen_any_write_yet = true;
+ connection->send.current_epoch_nr = epoch;
+ connection->send.current_epoch_writes = 0;
+ }
+}
+
+static void maybe_send_barrier(struct drbd_connection *connection, unsigned int epoch)
+{
+ /* re-init if first write on this connection */
+ if (!connection->send.seen_any_write_yet)
+ return;
+ if (connection->send.current_epoch_nr != epoch) {
+ if (connection->send.current_epoch_writes)
+ drbd_send_barrier(connection);
+ connection->send.current_epoch_nr = epoch;
+ }
+}
+
+int w_send_out_of_sync(struct drbd_work *w, int cancel)
+{
+ struct drbd_request *req = container_of(w, struct drbd_request, w);
+ struct drbd_device *device = req->device;
+ struct drbd_peer_device *const peer_device = first_peer_device(device);
+ struct drbd_connection *const connection = peer_device->connection;
+ int err;
+
+ if (unlikely(cancel)) {
+ req_mod(req, SEND_CANCELED);
+ return 0;
+ }
+ req->pre_send_jif = jiffies;
+
+ /* this time, no connection->send.current_epoch_writes++;
+ * If it was sent, it was the closing barrier for the last
+ * replicated epoch, before we went into AHEAD mode.
+ * No more barriers will be sent, until we leave AHEAD mode again. */
+ maybe_send_barrier(connection, req->epoch);
+
+ err = drbd_send_out_of_sync(peer_device, req);
+ req_mod(req, OOS_HANDED_TO_NETWORK);
+
+ return err;
+}
+
+/**
+ * w_send_dblock() - Worker callback to send a P_DATA packet in order to mirror a write request
+ * @w: work object.
+ * @cancel: The connection will be closed anyways
+ */
+int w_send_dblock(struct drbd_work *w, int cancel)
+{
+ struct drbd_request *req = container_of(w, struct drbd_request, w);
+ struct drbd_device *device = req->device;
+ struct drbd_peer_device *const peer_device = first_peer_device(device);
+ struct drbd_connection *connection = peer_device->connection;
+ int err;
+
+ if (unlikely(cancel)) {
+ req_mod(req, SEND_CANCELED);
+ return 0;
+ }
+ req->pre_send_jif = jiffies;
+
+ re_init_if_first_write(connection, req->epoch);
+ maybe_send_barrier(connection, req->epoch);
+ connection->send.current_epoch_writes++;
+
+ err = drbd_send_dblock(peer_device, req);
+ req_mod(req, err ? SEND_FAILED : HANDED_OVER_TO_NETWORK);
+
+ return err;
+}
+
+/**
+ * w_send_read_req() - Worker callback to send a read request (P_DATA_REQUEST) packet
+ * @w: work object.
+ * @cancel: The connection will be closed anyways
+ */
+int w_send_read_req(struct drbd_work *w, int cancel)
+{
+ struct drbd_request *req = container_of(w, struct drbd_request, w);
+ struct drbd_device *device = req->device;
+ struct drbd_peer_device *const peer_device = first_peer_device(device);
+ struct drbd_connection *connection = peer_device->connection;
+ int err;
+
+ if (unlikely(cancel)) {
+ req_mod(req, SEND_CANCELED);
+ return 0;
+ }
+ req->pre_send_jif = jiffies;
+
+ /* Even read requests may close a write epoch,
+ * if there was any yet. */
+ maybe_send_barrier(connection, req->epoch);
+
+ err = drbd_send_drequest(peer_device, P_DATA_REQUEST, req->i.sector, req->i.size,
+ (unsigned long)req);
+
+ req_mod(req, err ? SEND_FAILED : HANDED_OVER_TO_NETWORK);
+
+ return err;
+}
+
+int w_restart_disk_io(struct drbd_work *w, int cancel)
+{
+ struct drbd_request *req = container_of(w, struct drbd_request, w);
+ struct drbd_device *device = req->device;
+
+ if (bio_data_dir(req->master_bio) == WRITE && req->rq_state & RQ_IN_ACT_LOG)
+ drbd_al_begin_io(device, &req->i);
+
+ drbd_req_make_private_bio(req, req->master_bio);
+ req->private_bio->bi_bdev = device->ldev->backing_bdev;
+ generic_make_request(req->private_bio);
+
+ return 0;
+}
+
+static int _drbd_may_sync_now(struct drbd_device *device)
+{
+ struct drbd_device *odev = device;
+ int resync_after;
+
+ while (1) {
+ if (!odev->ldev || odev->state.disk == D_DISKLESS)
+ return 1;
+ rcu_read_lock();
+ resync_after = rcu_dereference(odev->ldev->disk_conf)->resync_after;
+ rcu_read_unlock();
+ if (resync_after == -1)
+ return 1;
+ odev = minor_to_device(resync_after);
+ if (!odev)
+ return 1;
+ if ((odev->state.conn >= C_SYNC_SOURCE &&
+ odev->state.conn <= C_PAUSED_SYNC_T) ||
+ odev->state.aftr_isp || odev->state.peer_isp ||
+ odev->state.user_isp)
+ return 0;
+ }
+}
+
+/**
+ * _drbd_pause_after() - Pause resync on all devices that may not resync now
+ * @device: DRBD device.
+ *
+ * Called from process context only (admin command and after_state_ch).
+ */
+static int _drbd_pause_after(struct drbd_device *device)
+{
+ struct drbd_device *odev;
+ int i, rv = 0;
+
+ rcu_read_lock();
+ idr_for_each_entry(&drbd_devices, odev, i) {
+ if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS)
+ continue;
+ if (!_drbd_may_sync_now(odev))
+ rv |= (__drbd_set_state(_NS(odev, aftr_isp, 1), CS_HARD, NULL)
+ != SS_NOTHING_TO_DO);
+ }
+ rcu_read_unlock();
+
+ return rv;
+}
+
+/**
+ * _drbd_resume_next() - Resume resync on all devices that may resync now
+ * @device: DRBD device.
+ *
+ * Called from process context only (admin command and worker).
+ */
+static int _drbd_resume_next(struct drbd_device *device)
+{
+ struct drbd_device *odev;
+ int i, rv = 0;
+
+ rcu_read_lock();
+ idr_for_each_entry(&drbd_devices, odev, i) {
+ if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS)
+ continue;
+ if (odev->state.aftr_isp) {
+ if (_drbd_may_sync_now(odev))
+ rv |= (__drbd_set_state(_NS(odev, aftr_isp, 0),
+ CS_HARD, NULL)
+ != SS_NOTHING_TO_DO) ;
+ }
+ }
+ rcu_read_unlock();
+ return rv;
+}
+
+void resume_next_sg(struct drbd_device *device)
+{
+ write_lock_irq(&global_state_lock);
+ _drbd_resume_next(device);
+ write_unlock_irq(&global_state_lock);
+}
+
+void suspend_other_sg(struct drbd_device *device)
+{
+ write_lock_irq(&global_state_lock);
+ _drbd_pause_after(device);
+ write_unlock_irq(&global_state_lock);
+}
+
+/* caller must hold global_state_lock */
+enum drbd_ret_code drbd_resync_after_valid(struct drbd_device *device, int o_minor)
+{
+ struct drbd_device *odev;
+ int resync_after;
+
+ if (o_minor == -1)
+ return NO_ERROR;
+ if (o_minor < -1 || o_minor > MINORMASK)
+ return ERR_RESYNC_AFTER;
+
+ /* check for loops */
+ odev = minor_to_device(o_minor);
+ while (1) {
+ if (odev == device)
+ return ERR_RESYNC_AFTER_CYCLE;
+
+ /* You are free to depend on diskless, non-existing,
+ * or not yet/no longer existing minors.
+ * We only reject dependency loops.
+ * We cannot follow the dependency chain beyond a detached or
+ * missing minor.
+ */
+ if (!odev || !odev->ldev || odev->state.disk == D_DISKLESS)
+ return NO_ERROR;
+
+ rcu_read_lock();
+ resync_after = rcu_dereference(odev->ldev->disk_conf)->resync_after;
+ rcu_read_unlock();
+ /* dependency chain ends here, no cycles. */
+ if (resync_after == -1)
+ return NO_ERROR;
+
+ /* follow the dependency chain */
+ odev = minor_to_device(resync_after);
+ }
+}
+
+/* caller must hold global_state_lock */
+void drbd_resync_after_changed(struct drbd_device *device)
+{
+ int changes;
+
+ do {
+ changes = _drbd_pause_after(device);
+ changes |= _drbd_resume_next(device);
+ } while (changes);
+}
+
+void drbd_rs_controller_reset(struct drbd_device *device)
+{
+ struct gendisk *disk = device->ldev->backing_bdev->bd_contains->bd_disk;
+ struct fifo_buffer *plan;
+
+ atomic_set(&device->rs_sect_in, 0);
+ atomic_set(&device->rs_sect_ev, 0);
+ device->rs_in_flight = 0;
+ device->rs_last_events =
+ (int)part_stat_read(&disk->part0, sectors[0]) +
+ (int)part_stat_read(&disk->part0, sectors[1]);
+
+ /* Updating the RCU protected object in place is necessary since
+ this function gets called from atomic context.
+ It is valid since all other updates also lead to an completely
+ empty fifo */
+ rcu_read_lock();
+ plan = rcu_dereference(device->rs_plan_s);
+ plan->total = 0;
+ fifo_set(plan, 0);
+ rcu_read_unlock();
+}
+
+void start_resync_timer_fn(unsigned long data)
+{
+ struct drbd_device *device = (struct drbd_device *) data;
+ drbd_device_post_work(device, RS_START);
+}
+
+static void do_start_resync(struct drbd_device *device)
+{
+ if (atomic_read(&device->unacked_cnt) || atomic_read(&device->rs_pending_cnt)) {
+ drbd_warn(device, "postponing start_resync ...\n");
+ device->start_resync_timer.expires = jiffies + HZ/10;
+ add_timer(&device->start_resync_timer);
+ return;
+ }
+
+ drbd_start_resync(device, C_SYNC_SOURCE);
+ clear_bit(AHEAD_TO_SYNC_SOURCE, &device->flags);
+}
+
+static bool use_checksum_based_resync(struct drbd_connection *connection, struct drbd_device *device)
+{
+ bool csums_after_crash_only;
+ rcu_read_lock();
+ csums_after_crash_only = rcu_dereference(connection->net_conf)->csums_after_crash_only;
+ rcu_read_unlock();
+ return connection->agreed_pro_version >= 89 && /* supported? */
+ connection->csums_tfm && /* configured? */
+ (csums_after_crash_only == 0 /* use for each resync? */
+ || test_bit(CRASHED_PRIMARY, &device->flags)); /* or only after Primary crash? */
+}
+
+/**
+ * drbd_start_resync() - Start the resync process
+ * @device: DRBD device.
+ * @side: Either C_SYNC_SOURCE or C_SYNC_TARGET
+ *
+ * This function might bring you directly into one of the
+ * C_PAUSED_SYNC_* states.
+ */
+void drbd_start_resync(struct drbd_device *device, enum drbd_conns side)
+{
+ struct drbd_peer_device *peer_device = first_peer_device(device);
+ struct drbd_connection *connection = peer_device ? peer_device->connection : NULL;
+ union drbd_state ns;
+ int r;
+
+ if (device->state.conn >= C_SYNC_SOURCE && device->state.conn < C_AHEAD) {
+ drbd_err(device, "Resync already running!\n");
+ return;
+ }
+
+ if (!test_bit(B_RS_H_DONE, &device->flags)) {
+ if (side == C_SYNC_TARGET) {
+ /* Since application IO was locked out during C_WF_BITMAP_T and
+ C_WF_SYNC_UUID we are still unmodified. Before going to C_SYNC_TARGET
+ we check that we might make the data inconsistent. */
+ r = drbd_khelper(device, "before-resync-target");
+ r = (r >> 8) & 0xff;
+ if (r > 0) {
+ drbd_info(device, "before-resync-target handler returned %d, "
+ "dropping connection.\n", r);
+ conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD);
+ return;
+ }
+ } else /* C_SYNC_SOURCE */ {
+ r = drbd_khelper(device, "before-resync-source");
+ r = (r >> 8) & 0xff;
+ if (r > 0) {
+ if (r == 3) {
+ drbd_info(device, "before-resync-source handler returned %d, "
+ "ignoring. Old userland tools?", r);
+ } else {
+ drbd_info(device, "before-resync-source handler returned %d, "
+ "dropping connection.\n", r);
+ conn_request_state(connection,
+ NS(conn, C_DISCONNECTING), CS_HARD);
+ return;
+ }
+ }
+ }
+ }
+
+ if (current == connection->worker.task) {
+ /* The worker should not sleep waiting for state_mutex,
+ that can take long */
+ if (!mutex_trylock(device->state_mutex)) {
+ set_bit(B_RS_H_DONE, &device->flags);
+ device->start_resync_timer.expires = jiffies + HZ/5;
+ add_timer(&device->start_resync_timer);
+ return;
+ }
+ } else {
+ mutex_lock(device->state_mutex);
+ }
+ clear_bit(B_RS_H_DONE, &device->flags);
+
+ /* req_lock: serialize with drbd_send_and_submit() and others
+ * global_state_lock: for stable sync-after dependencies */
+ spin_lock_irq(&device->resource->req_lock);
+ write_lock(&global_state_lock);
+ /* Did some connection breakage or IO error race with us? */
+ if (device->state.conn < C_CONNECTED
+ || !get_ldev_if_state(device, D_NEGOTIATING)) {
+ write_unlock(&global_state_lock);
+ spin_unlock_irq(&device->resource->req_lock);
+ mutex_unlock(device->state_mutex);
+ return;
+ }
+
+ ns = drbd_read_state(device);
+
+ ns.aftr_isp = !_drbd_may_sync_now(device);
+
+ ns.conn = side;
+
+ if (side == C_SYNC_TARGET)
+ ns.disk = D_INCONSISTENT;
+ else /* side == C_SYNC_SOURCE */
+ ns.pdsk = D_INCONSISTENT;
+
+ r = __drbd_set_state(device, ns, CS_VERBOSE, NULL);
+ ns = drbd_read_state(device);
+
+ if (ns.conn < C_CONNECTED)
+ r = SS_UNKNOWN_ERROR;
+
+ if (r == SS_SUCCESS) {
+ unsigned long tw = drbd_bm_total_weight(device);
+ unsigned long now = jiffies;
+ int i;
+
+ device->rs_failed = 0;
+ device->rs_paused = 0;
+ device->rs_same_csum = 0;
+ device->rs_last_sect_ev = 0;
+ device->rs_total = tw;
+ device->rs_start = now;
+ for (i = 0; i < DRBD_SYNC_MARKS; i++) {
+ device->rs_mark_left[i] = tw;
+ device->rs_mark_time[i] = now;
+ }
+ _drbd_pause_after(device);
+ /* Forget potentially stale cached per resync extent bit-counts.
+ * Open coded drbd_rs_cancel_all(device), we already have IRQs
+ * disabled, and know the disk state is ok. */
+ spin_lock(&device->al_lock);
+ lc_reset(device->resync);
+ device->resync_locked = 0;
+ device->resync_wenr = LC_FREE;
+ spin_unlock(&device->al_lock);
+ }
+ write_unlock(&global_state_lock);
+ spin_unlock_irq(&device->resource->req_lock);
+
+ if (r == SS_SUCCESS) {
+ wake_up(&device->al_wait); /* for lc_reset() above */
+ /* reset rs_last_bcast when a resync or verify is started,
+ * to deal with potential jiffies wrap. */
+ device->rs_last_bcast = jiffies - HZ;
+
+ drbd_info(device, "Began resync as %s (will sync %lu KB [%lu bits set]).\n",
+ drbd_conn_str(ns.conn),
+ (unsigned long) device->rs_total << (BM_BLOCK_SHIFT-10),
+ (unsigned long) device->rs_total);
+ if (side == C_SYNC_TARGET) {
+ device->bm_resync_fo = 0;
+ device->use_csums = use_checksum_based_resync(connection, device);
+ } else {
+ device->use_csums = 0;
+ }
+
+ /* Since protocol 96, we must serialize drbd_gen_and_send_sync_uuid
+ * with w_send_oos, or the sync target will get confused as to
+ * how much bits to resync. We cannot do that always, because for an
+ * empty resync and protocol < 95, we need to do it here, as we call
+ * drbd_resync_finished from here in that case.
+ * We drbd_gen_and_send_sync_uuid here for protocol < 96,
+ * and from after_state_ch otherwise. */
+ if (side == C_SYNC_SOURCE && connection->agreed_pro_version < 96)
+ drbd_gen_and_send_sync_uuid(peer_device);
+
+ if (connection->agreed_pro_version < 95 && device->rs_total == 0) {
+ /* This still has a race (about when exactly the peers
+ * detect connection loss) that can lead to a full sync
+ * on next handshake. In 8.3.9 we fixed this with explicit
+ * resync-finished notifications, but the fix
+ * introduces a protocol change. Sleeping for some
+ * time longer than the ping interval + timeout on the
+ * SyncSource, to give the SyncTarget the chance to
+ * detect connection loss, then waiting for a ping
+ * response (implicit in drbd_resync_finished) reduces
+ * the race considerably, but does not solve it. */
+ if (side == C_SYNC_SOURCE) {
+ struct net_conf *nc;
+ int timeo;
+
+ rcu_read_lock();
+ nc = rcu_dereference(connection->net_conf);
+ timeo = nc->ping_int * HZ + nc->ping_timeo * HZ / 9;
+ rcu_read_unlock();
+ schedule_timeout_interruptible(timeo);
+ }
+ drbd_resync_finished(device);
+ }
+
+ drbd_rs_controller_reset(device);
+ /* ns.conn may already be != device->state.conn,
+ * we may have been paused in between, or become paused until
+ * the timer triggers.
+ * No matter, that is handled in resync_timer_fn() */
+ if (ns.conn == C_SYNC_TARGET)
+ mod_timer(&device->resync_timer, jiffies);
+
+ drbd_md_sync(device);
+ }
+ put_ldev(device);
+ mutex_unlock(device->state_mutex);
+}
+
+static void update_on_disk_bitmap(struct drbd_device *device, bool resync_done)
+{
+ struct sib_info sib = { .sib_reason = SIB_SYNC_PROGRESS, };
+ device->rs_last_bcast = jiffies;
+
+ if (!get_ldev(device))
+ return;
+
+ drbd_bm_write_lazy(device, 0);
+ if (resync_done && is_sync_state(device->state.conn))
+ drbd_resync_finished(device);
+
+ drbd_bcast_event(device, &sib);
+ /* update timestamp, in case it took a while to write out stuff */
+ device->rs_last_bcast = jiffies;
+ put_ldev(device);
+}
+
+static void drbd_ldev_destroy(struct drbd_device *device)
+{
+ lc_destroy(device->resync);
+ device->resync = NULL;
+ lc_destroy(device->act_log);
+ device->act_log = NULL;
+
+ __acquire(local);
+ drbd_free_ldev(device->ldev);
+ device->ldev = NULL;
+ __release(local);
+
+ clear_bit(GOING_DISKLESS, &device->flags);
+ wake_up(&device->misc_wait);
+}
+
+static void go_diskless(struct drbd_device *device)
+{
+ D_ASSERT(device, device->state.disk == D_FAILED);
+ /* we cannot assert local_cnt == 0 here, as get_ldev_if_state will
+ * inc/dec it frequently. Once we are D_DISKLESS, no one will touch
+ * the protected members anymore, though, so once put_ldev reaches zero
+ * again, it will be safe to free them. */
+
+ /* Try to write changed bitmap pages, read errors may have just
+ * set some bits outside the area covered by the activity log.
+ *
+ * If we have an IO error during the bitmap writeout,
+ * we will want a full sync next time, just in case.
+ * (Do we want a specific meta data flag for this?)
+ *
+ * If that does not make it to stable storage either,
+ * we cannot do anything about that anymore.
+ *
+ * We still need to check if both bitmap and ldev are present, we may
+ * end up here after a failed attach, before ldev was even assigned.
+ */
+ if (device->bitmap && device->ldev) {
+ /* An interrupted resync or similar is allowed to recounts bits
+ * while we detach.
+ * Any modifications would not be expected anymore, though.
+ */
+ if (drbd_bitmap_io_from_worker(device, drbd_bm_write,
+ "detach", BM_LOCKED_TEST_ALLOWED)) {
+ if (test_bit(WAS_READ_ERROR, &device->flags)) {
+ drbd_md_set_flag(device, MDF_FULL_SYNC);
+ drbd_md_sync(device);
+ }
+ }
+ }
+
+ drbd_force_state(device, NS(disk, D_DISKLESS));
+}
+
+static int do_md_sync(struct drbd_device *device)
+{
+ drbd_warn(device, "md_sync_timer expired! Worker calls drbd_md_sync().\n");
+ drbd_md_sync(device);
+ return 0;
+}
+
+/* only called from drbd_worker thread, no locking */
+void __update_timing_details(
+ struct drbd_thread_timing_details *tdp,
+ unsigned int *cb_nr,
+ void *cb,
+ const char *fn, const unsigned int line)
+{
+ unsigned int i = *cb_nr % DRBD_THREAD_DETAILS_HIST;
+ struct drbd_thread_timing_details *td = tdp + i;
+
+ td->start_jif = jiffies;
+ td->cb_addr = cb;
+ td->caller_fn = fn;
+ td->line = line;
+ td->cb_nr = *cb_nr;
+
+ i = (i+1) % DRBD_THREAD_DETAILS_HIST;
+ td = tdp + i;
+ memset(td, 0, sizeof(*td));
+
+ ++(*cb_nr);
+}
+
+static void do_device_work(struct drbd_device *device, const unsigned long todo)
+{
+ if (test_bit(MD_SYNC, &todo))
+ do_md_sync(device);
+ if (test_bit(RS_DONE, &todo) ||
+ test_bit(RS_PROGRESS, &todo))
+ update_on_disk_bitmap(device, test_bit(RS_DONE, &todo));
+ if (test_bit(GO_DISKLESS, &todo))
+ go_diskless(device);
+ if (test_bit(DESTROY_DISK, &todo))
+ drbd_ldev_destroy(device);
+ if (test_bit(RS_START, &todo))
+ do_start_resync(device);
+}
+
+#define DRBD_DEVICE_WORK_MASK \
+ ((1UL << GO_DISKLESS) \
+ |(1UL << DESTROY_DISK) \
+ |(1UL << MD_SYNC) \
+ |(1UL << RS_START) \
+ |(1UL << RS_PROGRESS) \
+ |(1UL << RS_DONE) \
+ )
+
+static unsigned long get_work_bits(unsigned long *flags)
+{
+ unsigned long old, new;
+ do {
+ old = *flags;
+ new = old & ~DRBD_DEVICE_WORK_MASK;
+ } while (cmpxchg(flags, old, new) != old);
+ return old & DRBD_DEVICE_WORK_MASK;
+}
+
+static void do_unqueued_work(struct drbd_connection *connection)
+{
+ struct drbd_peer_device *peer_device;
+ int vnr;
+
+ rcu_read_lock();
+ idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+ struct drbd_device *device = peer_device->device;
+ unsigned long todo = get_work_bits(&device->flags);
+ if (!todo)
+ continue;
+
+ kref_get(&device->kref);
+ rcu_read_unlock();
+ do_device_work(device, todo);
+ kref_put(&device->kref, drbd_destroy_device);
+ rcu_read_lock();
+ }
+ rcu_read_unlock();
+}
+
+static bool dequeue_work_batch(struct drbd_work_queue *queue, struct list_head *work_list)
+{
+ spin_lock_irq(&queue->q_lock);
+ list_splice_tail_init(&queue->q, work_list);
+ spin_unlock_irq(&queue->q_lock);
+ return !list_empty(work_list);
+}
+
+static void wait_for_work(struct drbd_connection *connection, struct list_head *work_list)
+{
+ DEFINE_WAIT(wait);
+ struct net_conf *nc;
+ int uncork, cork;
+
+ dequeue_work_batch(&connection->sender_work, work_list);
+ if (!list_empty(work_list))
+ return;
+
+ /* Still nothing to do?
+ * Maybe we still need to close the current epoch,
+ * even if no new requests are queued yet.
+ *
+ * Also, poke TCP, just in case.
+ * Then wait for new work (or signal). */
+ rcu_read_lock();
+ nc = rcu_dereference(connection->net_conf);
+ uncork = nc ? nc->tcp_cork : 0;
+ rcu_read_unlock();
+ if (uncork) {
+ mutex_lock(&connection->data.mutex);
+ if (connection->data.socket)
+ drbd_tcp_uncork(connection->data.socket);
+ mutex_unlock(&connection->data.mutex);
+ }
+
+ for (;;) {
+ int send_barrier;
+ prepare_to_wait(&connection->sender_work.q_wait, &wait, TASK_INTERRUPTIBLE);
+ spin_lock_irq(&connection->resource->req_lock);
+ spin_lock(&connection->sender_work.q_lock); /* FIXME get rid of this one? */
+ if (!list_empty(&connection->sender_work.q))
+ list_splice_tail_init(&connection->sender_work.q, work_list);
+ spin_unlock(&connection->sender_work.q_lock); /* FIXME get rid of this one? */
+ if (!list_empty(work_list) || signal_pending(current)) {
+ spin_unlock_irq(&connection->resource->req_lock);
+ break;
+ }
+
+ /* We found nothing new to do, no to-be-communicated request,
+ * no other work item. We may still need to close the last
+ * epoch. Next incoming request epoch will be connection ->
+ * current transfer log epoch number. If that is different
+ * from the epoch of the last request we communicated, it is
+ * safe to send the epoch separating barrier now.
+ */
+ send_barrier =
+ atomic_read(&connection->current_tle_nr) !=
+ connection->send.current_epoch_nr;
+ spin_unlock_irq(&connection->resource->req_lock);
+
+ if (send_barrier)
+ maybe_send_barrier(connection,
+ connection->send.current_epoch_nr + 1);
+
+ if (test_bit(DEVICE_WORK_PENDING, &connection->flags))
+ break;
+
+ /* drbd_send() may have called flush_signals() */
+ if (get_t_state(&connection->worker) != RUNNING)
+ break;
+
+ schedule();
+ /* may be woken up for other things but new work, too,
+ * e.g. if the current epoch got closed.
+ * In which case we send the barrier above. */
+ }
+ finish_wait(&connection->sender_work.q_wait, &wait);
+
+ /* someone may have changed the config while we have been waiting above. */
+ rcu_read_lock();
+ nc = rcu_dereference(connection->net_conf);
+ cork = nc ? nc->tcp_cork : 0;
+ rcu_read_unlock();
+ mutex_lock(&connection->data.mutex);
+ if (connection->data.socket) {
+ if (cork)
+ drbd_tcp_cork(connection->data.socket);
+ else if (!uncork)
+ drbd_tcp_uncork(connection->data.socket);
+ }
+ mutex_unlock(&connection->data.mutex);
+}
+
+int drbd_worker(struct drbd_thread *thi)
+{
+ struct drbd_connection *connection = thi->connection;
+ struct drbd_work *w = NULL;
+ struct drbd_peer_device *peer_device;
+ LIST_HEAD(work_list);
+ int vnr;
+
+ while (get_t_state(thi) == RUNNING) {
+ drbd_thread_current_set_cpu(thi);
+
+ if (list_empty(&work_list)) {
+ update_worker_timing_details(connection, wait_for_work);
+ wait_for_work(connection, &work_list);
+ }
+
+ if (test_and_clear_bit(DEVICE_WORK_PENDING, &connection->flags)) {
+ update_worker_timing_details(connection, do_unqueued_work);
+ do_unqueued_work(connection);
+ }
+
+ if (signal_pending(current)) {
+ flush_signals(current);
+ if (get_t_state(thi) == RUNNING) {
+ drbd_warn(connection, "Worker got an unexpected signal\n");
+ continue;
+ }
+ break;
+ }
+
+ if (get_t_state(thi) != RUNNING)
+ break;
+
+ if (!list_empty(&work_list)) {
+ w = list_first_entry(&work_list, struct drbd_work, list);
+ list_del_init(&w->list);
+ update_worker_timing_details(connection, w->cb);
+ if (w->cb(w, connection->cstate < C_WF_REPORT_PARAMS) == 0)
+ continue;
+ if (connection->cstate >= C_WF_REPORT_PARAMS)
+ conn_request_state(connection, NS(conn, C_NETWORK_FAILURE), CS_HARD);
+ }
+ }
+
+ do {
+ if (test_and_clear_bit(DEVICE_WORK_PENDING, &connection->flags)) {
+ update_worker_timing_details(connection, do_unqueued_work);
+ do_unqueued_work(connection);
+ }
+ if (!list_empty(&work_list)) {
+ w = list_first_entry(&work_list, struct drbd_work, list);
+ list_del_init(&w->list);
+ update_worker_timing_details(connection, w->cb);
+ w->cb(w, 1);
+ } else
+ dequeue_work_batch(&connection->sender_work, &work_list);
+ } while (!list_empty(&work_list) || test_bit(DEVICE_WORK_PENDING, &connection->flags));
+
+ rcu_read_lock();
+ idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+ struct drbd_device *device = peer_device->device;
+ D_ASSERT(device, device->state.disk == D_DISKLESS && device->state.conn == C_STANDALONE);
+ kref_get(&device->kref);
+ rcu_read_unlock();
+ drbd_device_cleanup(device);
+ kref_put(&device->kref, drbd_destroy_device);
+ rcu_read_lock();
+ }
+ rcu_read_unlock();
+
+ return 0;
+}