Diffstat (limited to 'drivers/block/drbd')
24 files changed, 27117 insertions, 0 deletions
diff --git a/drivers/block/drbd/Kconfig b/drivers/block/drbd/Kconfig
new file mode 100644
index 000000000..7845bd6ee
--- /dev/null
+++ b/drivers/block/drbd/Kconfig
@@ -0,0 +1,73 @@
+#
+# DRBD device driver configuration
+#
+
+comment "DRBD disabled because PROC_FS or INET not selected"
+	depends on PROC_FS='n' || INET='n'
+
+config BLK_DEV_DRBD
+	tristate "DRBD Distributed Replicated Block Device support"
+	depends on PROC_FS && INET
+	select LRU_CACHE
+	select LIBCRC32C
+	default n
+	help
+
+	  NOTE: In order to authenticate connections you have to select
+	  CRYPTO_HMAC and a hash function as well.
+
+	  DRBD is a shared-nothing, synchronously replicated block device. It
+	  is designed to serve as a building block for high availability
+	  clusters and in this context, is a "drop-in" replacement for shared
+	  storage. Simplistically, you could see it as a network RAID 1.
+
+	  Each minor device has a role, which can be 'primary' or 'secondary'.
+	  On the node with the primary device the application is supposed to
+	  run and to access the device (/dev/drbdX). Every write is sent to
+	  the local 'lower level block device' and, across the network, to the
+	  node with the device in 'secondary' state. The secondary device
+	  simply writes the data to its lower level block device.
+
+	  DRBD can also be used in dual-Primary mode (device writable on both
+	  nodes), which means it can exhibit shared disk semantics in a
+	  shared-nothing cluster. Needless to say, on top of dual-Primary
+	  DRBD utilizing a cluster file system is necessary to maintain for
+	  cache coherency.
+
+	  For automatic failover you need a cluster manager (e.g. heartbeat).
+	  See also: http://www.drbd.org/, http://www.linux-ha.org
+
+	  If unsure, say N.
+
+config DRBD_FAULT_INJECTION
+	bool "DRBD fault injection"
+	depends on BLK_DEV_DRBD
+	help
+
+	 Say Y here if you want to simulate IO errors, in order to test DRBD's
+	 behavior.
+
+	 The actual simulation of IO errors is done by writing 3 values to
+	 /sys/module/drbd/parameters/
+
+	 enable_faults: bitmask of...
+	 1	meta data write
+	 2	          read
+	 4	resync data write
+	 8	            read
+	 16	data write
+	 32	data read
+	 64	read ahead
+	 128	kmalloc of bitmap
+	 256	allocation of peer_requests
+	 512	insert data corruption on receiving side
+
+	 fault_devs: bitmask of minor numbers
+	 fault_rate: frequency in percent
+
+	 Example: Simulate data write errors on /dev/drbd0 with a probability of 5%.
+		echo 16 > /sys/module/drbd/parameters/enable_faults
+		echo 1 > /sys/module/drbd/parameters/fault_devs
+		echo 5 > /sys/module/drbd/parameters/fault_rate
+
+	 If unsure, say N.
diff --git a/drivers/block/drbd/Makefile b/drivers/block/drbd/Makefile
new file mode 100644
index 000000000..4464e353c
--- /dev/null
+++ b/drivers/block/drbd/Makefile
@@ -0,0 +1,8 @@
+drbd-y := drbd_bitmap.o drbd_proc.o
+drbd-y += drbd_worker.o drbd_receiver.o drbd_req.o drbd_actlog.o
+drbd-y += drbd_main.o drbd_strings.o drbd_nl.o
+drbd-y += drbd_interval.o drbd_state.o
+drbd-y += drbd_nla.o
+drbd-$(CONFIG_DEBUG_FS) += drbd_debugfs.o
+
+obj-$(CONFIG_BLK_DEV_DRBD) += drbd.o
diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c
new file mode 100644
index 000000000..1318e3217
--- /dev/null
+++ b/drivers/block/drbd/drbd_actlog.c
@@ -0,0 +1,1219 @@
+/*
+   drbd_actlog.c
+
+   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+   Copyright (C) 2003-2008, LINBIT Information Technologies GmbH.
+   Copyright (C) 2003-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+ Copyright (C) 2003-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. + + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + + */ + +#include <linux/slab.h> +#include <linux/crc32c.h> +#include <linux/drbd.h> +#include <linux/drbd_limits.h> +#include <linux/dynamic_debug.h> +#include "drbd_int.h" + + +enum al_transaction_types { + AL_TR_UPDATE = 0, + AL_TR_INITIALIZED = 0xffff +}; +/* all fields on disc in big endian */ +struct __packed al_transaction_on_disk { + /* don't we all like magic */ + __be32 magic; + + /* to identify the most recent transaction block + * in the on disk ring buffer */ + __be32 tr_number; + + /* checksum on the full 4k block, with this field set to 0. */ + __be32 crc32c; + + /* type of transaction, special transaction types like: + * purge-all, set-all-idle, set-all-active, ... to-be-defined + * see also enum al_transaction_types */ + __be16 transaction_type; + + /* we currently allow only a few thousand extents, + * so 16bit will be enough for the slot number. */ + + /* how many updates in this transaction */ + __be16 n_updates; + + /* maximum slot number, "al-extents" in drbd.conf speak. + * Having this in each transaction should make reconfiguration + * of that parameter easier. */ + __be16 context_size; + + /* slot number the context starts with */ + __be16 context_start_slot_nr; + + /* Some reserved bytes. Expected usage is a 64bit counter of + * sectors-written since device creation, and other data generation tag + * supporting usage */ + __be32 __reserved[4]; + + /* --- 36 byte used --- */ + + /* Reserve space for up to AL_UPDATES_PER_TRANSACTION changes + * in one transaction, then use the remaining byte in the 4k block for + * context information. "Flexible" number of updates per transaction + * does not help, as we have to account for the case when all update + * slots are used anyways, so it would only complicate code without + * additional benefit. 
+ */ + __be16 update_slot_nr[AL_UPDATES_PER_TRANSACTION]; + + /* but the extent number is 32bit, which at an extent size of 4 MiB + * allows to cover device sizes of up to 2**54 Byte (16 PiB) */ + __be32 update_extent_nr[AL_UPDATES_PER_TRANSACTION]; + + /* --- 420 bytes used (36 + 64*6) --- */ + + /* 4096 - 420 = 3676 = 919 * 4 */ + __be32 context[AL_CONTEXT_PER_TRANSACTION]; +}; + +void *drbd_md_get_buffer(struct drbd_device *device, const char *intent) +{ + int r; + + wait_event(device->misc_wait, + (r = atomic_cmpxchg(&device->md_io.in_use, 0, 1)) == 0 || + device->state.disk <= D_FAILED); + + if (r) + return NULL; + + device->md_io.current_use = intent; + device->md_io.start_jif = jiffies; + device->md_io.submit_jif = device->md_io.start_jif - 1; + return page_address(device->md_io.page); +} + +void drbd_md_put_buffer(struct drbd_device *device) +{ + if (atomic_dec_and_test(&device->md_io.in_use)) + wake_up(&device->misc_wait); +} + +void wait_until_done_or_force_detached(struct drbd_device *device, struct drbd_backing_dev *bdev, + unsigned int *done) +{ + long dt; + + rcu_read_lock(); + dt = rcu_dereference(bdev->disk_conf)->disk_timeout; + rcu_read_unlock(); + dt = dt * HZ / 10; + if (dt == 0) + dt = MAX_SCHEDULE_TIMEOUT; + + dt = wait_event_timeout(device->misc_wait, + *done || test_bit(FORCE_DETACH, &device->flags), dt); + if (dt == 0) { + drbd_err(device, "meta-data IO operation timed out\n"); + drbd_chk_io_error(device, 1, DRBD_FORCE_DETACH); + } +} + +static int _drbd_md_sync_page_io(struct drbd_device *device, + struct drbd_backing_dev *bdev, + sector_t sector, int rw) +{ + struct bio *bio; + /* we do all our meta data IO in aligned 4k blocks. */ + const int size = 4096; + int err; + + device->md_io.done = 0; + device->md_io.error = -ENODEV; + + if ((rw & WRITE) && !test_bit(MD_NO_FUA, &device->flags)) + rw |= REQ_FUA | REQ_FLUSH; + rw |= REQ_SYNC | REQ_NOIDLE; + + bio = bio_alloc_drbd(GFP_NOIO); + bio->bi_bdev = bdev->md_bdev; + bio->bi_iter.bi_sector = sector; + err = -EIO; + if (bio_add_page(bio, device->md_io.page, size, 0) != size) + goto out; + bio->bi_private = device; + bio->bi_end_io = drbd_md_endio; + bio->bi_rw = rw; + + if (!(rw & WRITE) && device->state.disk == D_DISKLESS && device->ldev == NULL) + /* special case, drbd_md_read() during drbd_adm_attach(): no get_ldev */ + ; + else if (!get_ldev_if_state(device, D_ATTACHING)) { + /* Corresponding put_ldev in drbd_md_endio() */ + drbd_err(device, "ASSERT FAILED: get_ldev_if_state() == 1 in _drbd_md_sync_page_io()\n"); + err = -ENODEV; + goto out; + } + + bio_get(bio); /* one bio_put() is in the completion handler */ + atomic_inc(&device->md_io.in_use); /* drbd_md_put_buffer() is in the completion handler */ + device->md_io.submit_jif = jiffies; + if (drbd_insert_fault(device, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD)) + bio_endio(bio, -EIO); + else + submit_bio(rw, bio); + wait_until_done_or_force_detached(device, bdev, &device->md_io.done); + if (bio_flagged(bio, BIO_UPTODATE)) + err = device->md_io.error; + + out: + bio_put(bio); + return err; +} + +int drbd_md_sync_page_io(struct drbd_device *device, struct drbd_backing_dev *bdev, + sector_t sector, int rw) +{ + int err; + D_ASSERT(device, atomic_read(&device->md_io.in_use) == 1); + + BUG_ON(!bdev->md_bdev); + + dynamic_drbd_dbg(device, "meta_data io: %s [%d]:%s(,%llus,%s) %pS\n", + current->comm, current->pid, __func__, + (unsigned long long)sector, (rw & WRITE) ? 
"WRITE" : "READ", + (void*)_RET_IP_ ); + + if (sector < drbd_md_first_sector(bdev) || + sector + 7 > drbd_md_last_sector(bdev)) + drbd_alert(device, "%s [%d]:%s(,%llus,%s) out of range md access!\n", + current->comm, current->pid, __func__, + (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ"); + + err = _drbd_md_sync_page_io(device, bdev, sector, rw); + if (err) { + drbd_err(device, "drbd_md_sync_page_io(,%llus,%s) failed with error %d\n", + (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ", err); + } + return err; +} + +static struct bm_extent *find_active_resync_extent(struct drbd_device *device, unsigned int enr) +{ + struct lc_element *tmp; + tmp = lc_find(device->resync, enr/AL_EXT_PER_BM_SECT); + if (unlikely(tmp != NULL)) { + struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce); + if (test_bit(BME_NO_WRITES, &bm_ext->flags)) + return bm_ext; + } + return NULL; +} + +static struct lc_element *_al_get(struct drbd_device *device, unsigned int enr, bool nonblock) +{ + struct lc_element *al_ext; + struct bm_extent *bm_ext; + int wake; + + spin_lock_irq(&device->al_lock); + bm_ext = find_active_resync_extent(device, enr); + if (bm_ext) { + wake = !test_and_set_bit(BME_PRIORITY, &bm_ext->flags); + spin_unlock_irq(&device->al_lock); + if (wake) + wake_up(&device->al_wait); + return NULL; + } + if (nonblock) + al_ext = lc_try_get(device->act_log, enr); + else + al_ext = lc_get(device->act_log, enr); + spin_unlock_irq(&device->al_lock); + return al_ext; +} + +bool drbd_al_begin_io_fastpath(struct drbd_device *device, struct drbd_interval *i) +{ + /* for bios crossing activity log extent boundaries, + * we may need to activate two extents in one go */ + unsigned first = i->sector >> (AL_EXTENT_SHIFT-9); + unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9); + + D_ASSERT(device, (unsigned)(last - first) <= 1); + D_ASSERT(device, atomic_read(&device->local_cnt) > 0); + + /* FIXME figure out a fast path for bios crossing AL extent boundaries */ + if (first != last) + return false; + + return _al_get(device, first, true); +} + +bool drbd_al_begin_io_prepare(struct drbd_device *device, struct drbd_interval *i) +{ + /* for bios crossing activity log extent boundaries, + * we may need to activate two extents in one go */ + unsigned first = i->sector >> (AL_EXTENT_SHIFT-9); + unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9); + unsigned enr; + bool need_transaction = false; + + D_ASSERT(device, first <= last); + D_ASSERT(device, atomic_read(&device->local_cnt) > 0); + + for (enr = first; enr <= last; enr++) { + struct lc_element *al_ext; + wait_event(device->al_wait, + (al_ext = _al_get(device, enr, false)) != NULL); + if (al_ext->lc_number != enr) + need_transaction = true; + } + return need_transaction; +} + +static int al_write_transaction(struct drbd_device *device); + +void drbd_al_begin_io_commit(struct drbd_device *device) +{ + bool locked = false; + + /* Serialize multiple transactions. + * This uses test_and_set_bit, memory barrier is implicit. + */ + wait_event(device->al_wait, + device->act_log->pending_changes == 0 || + (locked = lc_try_lock_for_transaction(device->act_log))); + + if (locked) { + /* Double check: it may have been committed by someone else, + * while we have been waiting for the lock. 
*/ + if (device->act_log->pending_changes) { + bool write_al_updates; + + rcu_read_lock(); + write_al_updates = rcu_dereference(device->ldev->disk_conf)->al_updates; + rcu_read_unlock(); + + if (write_al_updates) + al_write_transaction(device); + spin_lock_irq(&device->al_lock); + /* FIXME + if (err) + we need an "lc_cancel" here; + */ + lc_committed(device->act_log); + spin_unlock_irq(&device->al_lock); + } + lc_unlock(device->act_log); + wake_up(&device->al_wait); + } +} + +/* + * @delegate: delegate activity log I/O to the worker thread + */ +void drbd_al_begin_io(struct drbd_device *device, struct drbd_interval *i) +{ + if (drbd_al_begin_io_prepare(device, i)) + drbd_al_begin_io_commit(device); +} + +int drbd_al_begin_io_nonblock(struct drbd_device *device, struct drbd_interval *i) +{ + struct lru_cache *al = device->act_log; + /* for bios crossing activity log extent boundaries, + * we may need to activate two extents in one go */ + unsigned first = i->sector >> (AL_EXTENT_SHIFT-9); + unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9); + unsigned nr_al_extents; + unsigned available_update_slots; + unsigned enr; + + D_ASSERT(device, first <= last); + + nr_al_extents = 1 + last - first; /* worst case: all touched extends are cold. */ + available_update_slots = min(al->nr_elements - al->used, + al->max_pending_changes - al->pending_changes); + + /* We want all necessary updates for a given request within the same transaction + * We could first check how many updates are *actually* needed, + * and use that instead of the worst-case nr_al_extents */ + if (available_update_slots < nr_al_extents) { + /* Too many activity log extents are currently "hot". + * + * If we have accumulated pending changes already, + * we made progress. + * + * If we cannot get even a single pending change through, + * stop the fast path until we made some progress, + * or requests to "cold" extents could be starved. */ + if (!al->pending_changes) + __set_bit(__LC_STARVING, &device->act_log->flags); + return -ENOBUFS; + } + + /* Is resync active in this area? */ + for (enr = first; enr <= last; enr++) { + struct lc_element *tmp; + tmp = lc_find(device->resync, enr/AL_EXT_PER_BM_SECT); + if (unlikely(tmp != NULL)) { + struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce); + if (test_bit(BME_NO_WRITES, &bm_ext->flags)) { + if (!test_and_set_bit(BME_PRIORITY, &bm_ext->flags)) + return -EBUSY; + return -EWOULDBLOCK; + } + } + } + + /* Checkout the refcounts. + * Given that we checked for available elements and update slots above, + * this has to be successful. */ + for (enr = first; enr <= last; enr++) { + struct lc_element *al_ext; + al_ext = lc_get_cumulative(device->act_log, enr); + if (!al_ext) + drbd_info(device, "LOGIC BUG for enr=%u\n", enr); + } + return 0; +} + +void drbd_al_complete_io(struct drbd_device *device, struct drbd_interval *i) +{ + /* for bios crossing activity log extent boundaries, + * we may need to activate two extents in one go */ + unsigned first = i->sector >> (AL_EXTENT_SHIFT-9); + unsigned last = i->size == 0 ? 
first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9); + unsigned enr; + struct lc_element *extent; + unsigned long flags; + + D_ASSERT(device, first <= last); + spin_lock_irqsave(&device->al_lock, flags); + + for (enr = first; enr <= last; enr++) { + extent = lc_find(device->act_log, enr); + if (!extent) { + drbd_err(device, "al_complete_io() called on inactive extent %u\n", enr); + continue; + } + lc_put(device->act_log, extent); + } + spin_unlock_irqrestore(&device->al_lock, flags); + wake_up(&device->al_wait); +} + +#if (PAGE_SHIFT + 3) < (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT) +/* Currently BM_BLOCK_SHIFT, BM_EXT_SHIFT and AL_EXTENT_SHIFT + * are still coupled, or assume too much about their relation. + * Code below will not work if this is violated. + * Will be cleaned up with some followup patch. + */ +# error FIXME +#endif + +static unsigned int al_extent_to_bm_page(unsigned int al_enr) +{ + return al_enr >> + /* bit to page */ + ((PAGE_SHIFT + 3) - + /* al extent number to bit */ + (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT)); +} + +static sector_t al_tr_number_to_on_disk_sector(struct drbd_device *device) +{ + const unsigned int stripes = device->ldev->md.al_stripes; + const unsigned int stripe_size_4kB = device->ldev->md.al_stripe_size_4k; + + /* transaction number, modulo on-disk ring buffer wrap around */ + unsigned int t = device->al_tr_number % (device->ldev->md.al_size_4k); + + /* ... to aligned 4k on disk block */ + t = ((t % stripes) * stripe_size_4kB) + t/stripes; + + /* ... to 512 byte sector in activity log */ + t *= 8; + + /* ... plus offset to the on disk position */ + return device->ldev->md.md_offset + device->ldev->md.al_offset + t; +} + +int al_write_transaction(struct drbd_device *device) +{ + struct al_transaction_on_disk *buffer; + struct lc_element *e; + sector_t sector; + int i, mx; + unsigned extent_nr; + unsigned crc = 0; + int err = 0; + + if (!get_ldev(device)) { + drbd_err(device, "disk is %s, cannot start al transaction\n", + drbd_disk_str(device->state.disk)); + return -EIO; + } + + /* The bitmap write may have failed, causing a state change. */ + if (device->state.disk < D_INCONSISTENT) { + drbd_err(device, + "disk is %s, cannot write al transaction\n", + drbd_disk_str(device->state.disk)); + put_ldev(device); + return -EIO; + } + + /* protects md_io_buffer, al_tr_cycle, ... */ + buffer = drbd_md_get_buffer(device, __func__); + if (!buffer) { + drbd_err(device, "disk failed while waiting for md_io buffer\n"); + put_ldev(device); + return -ENODEV; + } + + memset(buffer, 0, sizeof(*buffer)); + buffer->magic = cpu_to_be32(DRBD_AL_MAGIC); + buffer->tr_number = cpu_to_be32(device->al_tr_number); + + i = 0; + + /* Even though no one can start to change this list + * once we set the LC_LOCKED -- from drbd_al_begin_io(), + * lc_try_lock_for_transaction() --, someone may still + * be in the process of changing it. 
*/ + spin_lock_irq(&device->al_lock); + list_for_each_entry(e, &device->act_log->to_be_changed, list) { + if (i == AL_UPDATES_PER_TRANSACTION) { + i++; + break; + } + buffer->update_slot_nr[i] = cpu_to_be16(e->lc_index); + buffer->update_extent_nr[i] = cpu_to_be32(e->lc_new_number); + if (e->lc_number != LC_FREE) + drbd_bm_mark_for_writeout(device, + al_extent_to_bm_page(e->lc_number)); + i++; + } + spin_unlock_irq(&device->al_lock); + BUG_ON(i > AL_UPDATES_PER_TRANSACTION); + + buffer->n_updates = cpu_to_be16(i); + for ( ; i < AL_UPDATES_PER_TRANSACTION; i++) { + buffer->update_slot_nr[i] = cpu_to_be16(-1); + buffer->update_extent_nr[i] = cpu_to_be32(LC_FREE); + } + + buffer->context_size = cpu_to_be16(device->act_log->nr_elements); + buffer->context_start_slot_nr = cpu_to_be16(device->al_tr_cycle); + + mx = min_t(int, AL_CONTEXT_PER_TRANSACTION, + device->act_log->nr_elements - device->al_tr_cycle); + for (i = 0; i < mx; i++) { + unsigned idx = device->al_tr_cycle + i; + extent_nr = lc_element_by_index(device->act_log, idx)->lc_number; + buffer->context[i] = cpu_to_be32(extent_nr); + } + for (; i < AL_CONTEXT_PER_TRANSACTION; i++) + buffer->context[i] = cpu_to_be32(LC_FREE); + + device->al_tr_cycle += AL_CONTEXT_PER_TRANSACTION; + if (device->al_tr_cycle >= device->act_log->nr_elements) + device->al_tr_cycle = 0; + + sector = al_tr_number_to_on_disk_sector(device); + + crc = crc32c(0, buffer, 4096); + buffer->crc32c = cpu_to_be32(crc); + + if (drbd_bm_write_hinted(device)) + err = -EIO; + else { + bool write_al_updates; + rcu_read_lock(); + write_al_updates = rcu_dereference(device->ldev->disk_conf)->al_updates; + rcu_read_unlock(); + if (write_al_updates) { + if (drbd_md_sync_page_io(device, device->ldev, sector, WRITE)) { + err = -EIO; + drbd_chk_io_error(device, 1, DRBD_META_IO_ERROR); + } else { + device->al_tr_number++; + device->al_writ_cnt++; + } + } + } + + drbd_md_put_buffer(device); + put_ldev(device); + + return err; +} + +static int _try_lc_del(struct drbd_device *device, struct lc_element *al_ext) +{ + int rv; + + spin_lock_irq(&device->al_lock); + rv = (al_ext->refcnt == 0); + if (likely(rv)) + lc_del(device->act_log, al_ext); + spin_unlock_irq(&device->al_lock); + + return rv; +} + +/** + * drbd_al_shrink() - Removes all active extents form the activity log + * @device: DRBD device. + * + * Removes all active extents form the activity log, waiting until + * the reference count of each entry dropped to 0 first, of course. 
+ * + * You need to lock device->act_log with lc_try_lock() / lc_unlock() + */ +void drbd_al_shrink(struct drbd_device *device) +{ + struct lc_element *al_ext; + int i; + + D_ASSERT(device, test_bit(__LC_LOCKED, &device->act_log->flags)); + + for (i = 0; i < device->act_log->nr_elements; i++) { + al_ext = lc_element_by_index(device->act_log, i); + if (al_ext->lc_number == LC_FREE) + continue; + wait_event(device->al_wait, _try_lc_del(device, al_ext)); + } + + wake_up(&device->al_wait); +} + +int drbd_initialize_al(struct drbd_device *device, void *buffer) +{ + struct al_transaction_on_disk *al = buffer; + struct drbd_md *md = &device->ldev->md; + sector_t al_base = md->md_offset + md->al_offset; + int al_size_4k = md->al_stripes * md->al_stripe_size_4k; + int i; + + memset(al, 0, 4096); + al->magic = cpu_to_be32(DRBD_AL_MAGIC); + al->transaction_type = cpu_to_be16(AL_TR_INITIALIZED); + al->crc32c = cpu_to_be32(crc32c(0, al, 4096)); + + for (i = 0; i < al_size_4k; i++) { + int err = drbd_md_sync_page_io(device, device->ldev, al_base + i * 8, WRITE); + if (err) + return err; + } + return 0; +} + +static const char *drbd_change_sync_fname[] = { + [RECORD_RS_FAILED] = "drbd_rs_failed_io", + [SET_IN_SYNC] = "drbd_set_in_sync", + [SET_OUT_OF_SYNC] = "drbd_set_out_of_sync" +}; + +/* ATTENTION. The AL's extents are 4MB each, while the extents in the + * resync LRU-cache are 16MB each. + * The caller of this function has to hold an get_ldev() reference. + * + * Adjusts the caching members ->rs_left (success) or ->rs_failed (!success), + * potentially pulling in (and recounting the corresponding bits) + * this resync extent into the resync extent lru cache. + * + * Returns whether all bits have been cleared for this resync extent, + * precisely: (rs_left <= rs_failed) + * + * TODO will be obsoleted once we have a caching lru of the on disk bitmap + */ +static bool update_rs_extent(struct drbd_device *device, + unsigned int enr, int count, + enum update_sync_bits_mode mode) +{ + struct lc_element *e; + + D_ASSERT(device, atomic_read(&device->local_cnt)); + + /* When setting out-of-sync bits, + * we don't need it cached (lc_find). + * But if it is present in the cache, + * we should update the cached bit count. + * Otherwise, that extent should be in the resync extent lru cache + * already -- or we want to pull it in if necessary -- (lc_get), + * then update and check rs_left and rs_failed. */ + if (mode == SET_OUT_OF_SYNC) + e = lc_find(device->resync, enr); + else + e = lc_get(device->resync, enr); + if (e) { + struct bm_extent *ext = lc_entry(e, struct bm_extent, lce); + if (ext->lce.lc_number == enr) { + if (mode == SET_IN_SYNC) + ext->rs_left -= count; + else if (mode == SET_OUT_OF_SYNC) + ext->rs_left += count; + else + ext->rs_failed += count; + if (ext->rs_left < ext->rs_failed) { + drbd_warn(device, "BAD! enr=%u rs_left=%d " + "rs_failed=%d count=%d cstate=%s\n", + ext->lce.lc_number, ext->rs_left, + ext->rs_failed, count, + drbd_conn_str(device->state.conn)); + + /* We don't expect to be able to clear more bits + * than have been set when we originally counted + * the set bits to cache that value in ext->rs_left. + * Whatever the reason (disconnect during resync, + * delayed local completion of an application write), + * try to fix it up by recounting here. */ + ext->rs_left = drbd_bm_e_weight(device, enr); + } + } else { + /* Normally this element should be in the cache, + * since drbd_rs_begin_io() pulled it already in. 
+ * + * But maybe an application write finished, and we set + * something outside the resync lru_cache in sync. + */ + int rs_left = drbd_bm_e_weight(device, enr); + if (ext->flags != 0) { + drbd_warn(device, "changing resync lce: %d[%u;%02lx]" + " -> %d[%u;00]\n", + ext->lce.lc_number, ext->rs_left, + ext->flags, enr, rs_left); + ext->flags = 0; + } + if (ext->rs_failed) { + drbd_warn(device, "Kicking resync_lru element enr=%u " + "out with rs_failed=%d\n", + ext->lce.lc_number, ext->rs_failed); + } + ext->rs_left = rs_left; + ext->rs_failed = (mode == RECORD_RS_FAILED) ? count : 0; + /* we don't keep a persistent log of the resync lru, + * we can commit any change right away. */ + lc_committed(device->resync); + } + if (mode != SET_OUT_OF_SYNC) + lc_put(device->resync, &ext->lce); + /* no race, we are within the al_lock! */ + + if (ext->rs_left <= ext->rs_failed) { + ext->rs_failed = 0; + return true; + } + } else if (mode != SET_OUT_OF_SYNC) { + /* be quiet if lc_find() did not find it. */ + drbd_err(device, "lc_get() failed! locked=%d/%d flags=%lu\n", + device->resync_locked, + device->resync->nr_elements, + device->resync->flags); + } + return false; +} + +void drbd_advance_rs_marks(struct drbd_device *device, unsigned long still_to_go) +{ + unsigned long now = jiffies; + unsigned long last = device->rs_mark_time[device->rs_last_mark]; + int next = (device->rs_last_mark + 1) % DRBD_SYNC_MARKS; + if (time_after_eq(now, last + DRBD_SYNC_MARK_STEP)) { + if (device->rs_mark_left[device->rs_last_mark] != still_to_go && + device->state.conn != C_PAUSED_SYNC_T && + device->state.conn != C_PAUSED_SYNC_S) { + device->rs_mark_time[next] = now; + device->rs_mark_left[next] = still_to_go; + device->rs_last_mark = next; + } + } +} + +/* It is called lazy update, so don't do write-out too often. */ +static bool lazy_bitmap_update_due(struct drbd_device *device) +{ + return time_after(jiffies, device->rs_last_bcast + 2*HZ); +} + +static void maybe_schedule_on_disk_bitmap_update(struct drbd_device *device, bool rs_done) +{ + if (rs_done) + set_bit(RS_DONE, &device->flags); + /* and also set RS_PROGRESS below */ + else if (!lazy_bitmap_update_due(device)) + return; + + drbd_device_post_work(device, RS_PROGRESS); +} + +static int update_sync_bits(struct drbd_device *device, + unsigned long sbnr, unsigned long ebnr, + enum update_sync_bits_mode mode) +{ + /* + * We keep a count of set bits per resync-extent in the ->rs_left + * caching member, so we need to loop and work within the resync extent + * alignment. Typically this loop will execute exactly once. + */ + unsigned long flags; + unsigned long count = 0; + unsigned int cleared = 0; + while (sbnr <= ebnr) { + /* set temporary boundary bit number to last bit number within + * the resync extent of the current start bit number, + * but cap at provided end bit number */ + unsigned long tbnr = min(ebnr, sbnr | BM_BLOCKS_PER_BM_EXT_MASK); + unsigned long c; + + if (mode == RECORD_RS_FAILED) + /* Only called from drbd_rs_failed_io(), bits + * supposedly still set. Recount, maybe some + * of the bits have been successfully cleared + * by application IO meanwhile. 
+ */ + c = drbd_bm_count_bits(device, sbnr, tbnr); + else if (mode == SET_IN_SYNC) + c = drbd_bm_clear_bits(device, sbnr, tbnr); + else /* if (mode == SET_OUT_OF_SYNC) */ + c = drbd_bm_set_bits(device, sbnr, tbnr); + + if (c) { + spin_lock_irqsave(&device->al_lock, flags); + cleared += update_rs_extent(device, BM_BIT_TO_EXT(sbnr), c, mode); + spin_unlock_irqrestore(&device->al_lock, flags); + count += c; + } + sbnr = tbnr + 1; + } + if (count) { + if (mode == SET_IN_SYNC) { + unsigned long still_to_go = drbd_bm_total_weight(device); + bool rs_is_done = (still_to_go <= device->rs_failed); + drbd_advance_rs_marks(device, still_to_go); + if (cleared || rs_is_done) + maybe_schedule_on_disk_bitmap_update(device, rs_is_done); + } else if (mode == RECORD_RS_FAILED) + device->rs_failed += count; + wake_up(&device->al_wait); + } + return count; +} + +/* clear the bit corresponding to the piece of storage in question: + * size byte of data starting from sector. Only clear a bits of the affected + * one ore more _aligned_ BM_BLOCK_SIZE blocks. + * + * called by worker on C_SYNC_TARGET and receiver on SyncSource. + * + */ +int __drbd_change_sync(struct drbd_device *device, sector_t sector, int size, + enum update_sync_bits_mode mode) +{ + /* Is called from worker and receiver context _only_ */ + unsigned long sbnr, ebnr, lbnr; + unsigned long count = 0; + sector_t esector, nr_sectors; + + /* This would be an empty REQ_FLUSH, be silent. */ + if ((mode == SET_OUT_OF_SYNC) && size == 0) + return 0; + + if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_DISCARD_SIZE) { + drbd_err(device, "%s: sector=%llus size=%d nonsense!\n", + drbd_change_sync_fname[mode], + (unsigned long long)sector, size); + return 0; + } + + if (!get_ldev(device)) + return 0; /* no disk, no metadata, no bitmap to manipulate bits in */ + + nr_sectors = drbd_get_capacity(device->this_bdev); + esector = sector + (size >> 9) - 1; + + if (!expect(sector < nr_sectors)) + goto out; + if (!expect(esector < nr_sectors)) + esector = nr_sectors - 1; + + lbnr = BM_SECT_TO_BIT(nr_sectors-1); + + if (mode == SET_IN_SYNC) { + /* Round up start sector, round down end sector. We make sure + * we only clear full, aligned, BM_BLOCK_SIZE blocks. */ + if (unlikely(esector < BM_SECT_PER_BIT-1)) + goto out; + if (unlikely(esector == (nr_sectors-1))) + ebnr = lbnr; + else + ebnr = BM_SECT_TO_BIT(esector - (BM_SECT_PER_BIT-1)); + sbnr = BM_SECT_TO_BIT(sector + BM_SECT_PER_BIT-1); + } else { + /* We set it out of sync, or record resync failure. + * Should not round anything here. */ + sbnr = BM_SECT_TO_BIT(sector); + ebnr = BM_SECT_TO_BIT(esector); + } + + count = update_sync_bits(device, sbnr, ebnr, mode); +out: + put_ldev(device); + return count; +} + +static +struct bm_extent *_bme_get(struct drbd_device *device, unsigned int enr) +{ + struct lc_element *e; + struct bm_extent *bm_ext; + int wakeup = 0; + unsigned long rs_flags; + + spin_lock_irq(&device->al_lock); + if (device->resync_locked > device->resync->nr_elements/2) { + spin_unlock_irq(&device->al_lock); + return NULL; + } + e = lc_get(device->resync, enr); + bm_ext = e ? 
lc_entry(e, struct bm_extent, lce) : NULL; + if (bm_ext) { + if (bm_ext->lce.lc_number != enr) { + bm_ext->rs_left = drbd_bm_e_weight(device, enr); + bm_ext->rs_failed = 0; + lc_committed(device->resync); + wakeup = 1; + } + if (bm_ext->lce.refcnt == 1) + device->resync_locked++; + set_bit(BME_NO_WRITES, &bm_ext->flags); + } + rs_flags = device->resync->flags; + spin_unlock_irq(&device->al_lock); + if (wakeup) + wake_up(&device->al_wait); + + if (!bm_ext) { + if (rs_flags & LC_STARVING) + drbd_warn(device, "Have to wait for element" + " (resync LRU too small?)\n"); + BUG_ON(rs_flags & LC_LOCKED); + } + + return bm_ext; +} + +static int _is_in_al(struct drbd_device *device, unsigned int enr) +{ + int rv; + + spin_lock_irq(&device->al_lock); + rv = lc_is_used(device->act_log, enr); + spin_unlock_irq(&device->al_lock); + + return rv; +} + +/** + * drbd_rs_begin_io() - Gets an extent in the resync LRU cache and sets it to BME_LOCKED + * @device: DRBD device. + * @sector: The sector number. + * + * This functions sleeps on al_wait. Returns 0 on success, -EINTR if interrupted. + */ +int drbd_rs_begin_io(struct drbd_device *device, sector_t sector) +{ + unsigned int enr = BM_SECT_TO_EXT(sector); + struct bm_extent *bm_ext; + int i, sig; + bool sa; + +retry: + sig = wait_event_interruptible(device->al_wait, + (bm_ext = _bme_get(device, enr))); + if (sig) + return -EINTR; + + if (test_bit(BME_LOCKED, &bm_ext->flags)) + return 0; + + /* step aside only while we are above c-min-rate; unless disabled. */ + sa = drbd_rs_c_min_rate_throttle(device); + + for (i = 0; i < AL_EXT_PER_BM_SECT; i++) { + sig = wait_event_interruptible(device->al_wait, + !_is_in_al(device, enr * AL_EXT_PER_BM_SECT + i) || + (sa && test_bit(BME_PRIORITY, &bm_ext->flags))); + + if (sig || (sa && test_bit(BME_PRIORITY, &bm_ext->flags))) { + spin_lock_irq(&device->al_lock); + if (lc_put(device->resync, &bm_ext->lce) == 0) { + bm_ext->flags = 0; /* clears BME_NO_WRITES and eventually BME_PRIORITY */ + device->resync_locked--; + wake_up(&device->al_wait); + } + spin_unlock_irq(&device->al_lock); + if (sig) + return -EINTR; + if (schedule_timeout_interruptible(HZ/10)) + return -EINTR; + goto retry; + } + } + set_bit(BME_LOCKED, &bm_ext->flags); + return 0; +} + +/** + * drbd_try_rs_begin_io() - Gets an extent in the resync LRU cache, does not sleep + * @device: DRBD device. + * @sector: The sector number. + * + * Gets an extent in the resync LRU cache, sets it to BME_NO_WRITES, then + * tries to set it to BME_LOCKED. Returns 0 upon success, and -EAGAIN + * if there is still application IO going on in this area. + */ +int drbd_try_rs_begin_io(struct drbd_device *device, sector_t sector) +{ + unsigned int enr = BM_SECT_TO_EXT(sector); + const unsigned int al_enr = enr*AL_EXT_PER_BM_SECT; + struct lc_element *e; + struct bm_extent *bm_ext; + int i; + bool throttle = drbd_rs_should_slow_down(device, sector, true); + + /* If we need to throttle, a half-locked (only marked BME_NO_WRITES, + * not yet BME_LOCKED) extent needs to be kicked out explicitly if we + * need to throttle. There is at most one such half-locked extent, + * which is remembered in resync_wenr. */ + + if (throttle && device->resync_wenr != enr) + return -EAGAIN; + + spin_lock_irq(&device->al_lock); + if (device->resync_wenr != LC_FREE && device->resync_wenr != enr) { + /* in case you have very heavy scattered io, it may + * stall the syncer undefined if we give up the ref count + * when we try again and requeue. 
+ * + * if we don't give up the refcount, but the next time + * we are scheduled this extent has been "synced" by new + * application writes, we'd miss the lc_put on the + * extent we keep the refcount on. + * so we remembered which extent we had to try again, and + * if the next requested one is something else, we do + * the lc_put here... + * we also have to wake_up + */ + e = lc_find(device->resync, device->resync_wenr); + bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL; + if (bm_ext) { + D_ASSERT(device, !test_bit(BME_LOCKED, &bm_ext->flags)); + D_ASSERT(device, test_bit(BME_NO_WRITES, &bm_ext->flags)); + clear_bit(BME_NO_WRITES, &bm_ext->flags); + device->resync_wenr = LC_FREE; + if (lc_put(device->resync, &bm_ext->lce) == 0) { + bm_ext->flags = 0; + device->resync_locked--; + } + wake_up(&device->al_wait); + } else { + drbd_alert(device, "LOGIC BUG\n"); + } + } + /* TRY. */ + e = lc_try_get(device->resync, enr); + bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL; + if (bm_ext) { + if (test_bit(BME_LOCKED, &bm_ext->flags)) + goto proceed; + if (!test_and_set_bit(BME_NO_WRITES, &bm_ext->flags)) { + device->resync_locked++; + } else { + /* we did set the BME_NO_WRITES, + * but then could not set BME_LOCKED, + * so we tried again. + * drop the extra reference. */ + bm_ext->lce.refcnt--; + D_ASSERT(device, bm_ext->lce.refcnt > 0); + } + goto check_al; + } else { + /* do we rather want to try later? */ + if (device->resync_locked > device->resync->nr_elements-3) + goto try_again; + /* Do or do not. There is no try. -- Yoda */ + e = lc_get(device->resync, enr); + bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL; + if (!bm_ext) { + const unsigned long rs_flags = device->resync->flags; + if (rs_flags & LC_STARVING) + drbd_warn(device, "Have to wait for element" + " (resync LRU too small?)\n"); + BUG_ON(rs_flags & LC_LOCKED); + goto try_again; + } + if (bm_ext->lce.lc_number != enr) { + bm_ext->rs_left = drbd_bm_e_weight(device, enr); + bm_ext->rs_failed = 0; + lc_committed(device->resync); + wake_up(&device->al_wait); + D_ASSERT(device, test_bit(BME_LOCKED, &bm_ext->flags) == 0); + } + set_bit(BME_NO_WRITES, &bm_ext->flags); + D_ASSERT(device, bm_ext->lce.refcnt == 1); + device->resync_locked++; + goto check_al; + } +check_al: + for (i = 0; i < AL_EXT_PER_BM_SECT; i++) { + if (lc_is_used(device->act_log, al_enr+i)) + goto try_again; + } + set_bit(BME_LOCKED, &bm_ext->flags); +proceed: + device->resync_wenr = LC_FREE; + spin_unlock_irq(&device->al_lock); + return 0; + +try_again: + if (bm_ext) { + if (throttle) { + D_ASSERT(device, !test_bit(BME_LOCKED, &bm_ext->flags)); + D_ASSERT(device, test_bit(BME_NO_WRITES, &bm_ext->flags)); + clear_bit(BME_NO_WRITES, &bm_ext->flags); + device->resync_wenr = LC_FREE; + if (lc_put(device->resync, &bm_ext->lce) == 0) { + bm_ext->flags = 0; + device->resync_locked--; + } + wake_up(&device->al_wait); + } else + device->resync_wenr = enr; + } + spin_unlock_irq(&device->al_lock); + return -EAGAIN; +} + +void drbd_rs_complete_io(struct drbd_device *device, sector_t sector) +{ + unsigned int enr = BM_SECT_TO_EXT(sector); + struct lc_element *e; + struct bm_extent *bm_ext; + unsigned long flags; + + spin_lock_irqsave(&device->al_lock, flags); + e = lc_find(device->resync, enr); + bm_ext = e ? 
lc_entry(e, struct bm_extent, lce) : NULL; + if (!bm_ext) { + spin_unlock_irqrestore(&device->al_lock, flags); + if (__ratelimit(&drbd_ratelimit_state)) + drbd_err(device, "drbd_rs_complete_io() called, but extent not found\n"); + return; + } + + if (bm_ext->lce.refcnt == 0) { + spin_unlock_irqrestore(&device->al_lock, flags); + drbd_err(device, "drbd_rs_complete_io(,%llu [=%u]) called, " + "but refcnt is 0!?\n", + (unsigned long long)sector, enr); + return; + } + + if (lc_put(device->resync, &bm_ext->lce) == 0) { + bm_ext->flags = 0; /* clear BME_LOCKED, BME_NO_WRITES and BME_PRIORITY */ + device->resync_locked--; + wake_up(&device->al_wait); + } + + spin_unlock_irqrestore(&device->al_lock, flags); +} + +/** + * drbd_rs_cancel_all() - Removes all extents from the resync LRU (even BME_LOCKED) + * @device: DRBD device. + */ +void drbd_rs_cancel_all(struct drbd_device *device) +{ + spin_lock_irq(&device->al_lock); + + if (get_ldev_if_state(device, D_FAILED)) { /* Makes sure ->resync is there. */ + lc_reset(device->resync); + put_ldev(device); + } + device->resync_locked = 0; + device->resync_wenr = LC_FREE; + spin_unlock_irq(&device->al_lock); + wake_up(&device->al_wait); +} + +/** + * drbd_rs_del_all() - Gracefully remove all extents from the resync LRU + * @device: DRBD device. + * + * Returns 0 upon success, -EAGAIN if at least one reference count was + * not zero. + */ +int drbd_rs_del_all(struct drbd_device *device) +{ + struct lc_element *e; + struct bm_extent *bm_ext; + int i; + + spin_lock_irq(&device->al_lock); + + if (get_ldev_if_state(device, D_FAILED)) { + /* ok, ->resync is there. */ + for (i = 0; i < device->resync->nr_elements; i++) { + e = lc_element_by_index(device->resync, i); + bm_ext = lc_entry(e, struct bm_extent, lce); + if (bm_ext->lce.lc_number == LC_FREE) + continue; + if (bm_ext->lce.lc_number == device->resync_wenr) { + drbd_info(device, "dropping %u in drbd_rs_del_all, apparently" + " got 'synced' by application io\n", + device->resync_wenr); + D_ASSERT(device, !test_bit(BME_LOCKED, &bm_ext->flags)); + D_ASSERT(device, test_bit(BME_NO_WRITES, &bm_ext->flags)); + clear_bit(BME_NO_WRITES, &bm_ext->flags); + device->resync_wenr = LC_FREE; + lc_put(device->resync, &bm_ext->lce); + } + if (bm_ext->lce.refcnt != 0) { + drbd_info(device, "Retrying drbd_rs_del_all() later. " + "refcnt=%d\n", bm_ext->lce.refcnt); + put_ldev(device); + spin_unlock_irq(&device->al_lock); + return -EAGAIN; + } + D_ASSERT(device, !test_bit(BME_LOCKED, &bm_ext->flags)); + D_ASSERT(device, !test_bit(BME_NO_WRITES, &bm_ext->flags)); + lc_del(device->resync, &bm_ext->lce); + } + D_ASSERT(device, device->resync->used == 0); + put_ldev(device); + } + spin_unlock_irq(&device->al_lock); + wake_up(&device->al_wait); + + return 0; +} diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c new file mode 100644 index 000000000..434c77dcc --- /dev/null +++ b/drivers/block/drbd/drbd_bitmap.c @@ -0,0 +1,1648 @@ +/* + drbd_bitmap.c + + This file is part of DRBD by Philipp Reisner and Lars Ellenberg. + + Copyright (C) 2004-2008, LINBIT Information Technologies GmbH. + Copyright (C) 2004-2008, Philipp Reisner <philipp.reisner@linbit.com>. + Copyright (C) 2004-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. + + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. 
+ + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/bitops.h> +#include <linux/vmalloc.h> +#include <linux/string.h> +#include <linux/drbd.h> +#include <linux/slab.h> +#include <asm/kmap_types.h> + +#include "drbd_int.h" + + +/* OPAQUE outside this file! + * interface defined in drbd_int.h + + * convention: + * function name drbd_bm_... => used elsewhere, "public". + * function name bm_... => internal to implementation, "private". + */ + + +/* + * LIMITATIONS: + * We want to support >= peta byte of backend storage, while for now still using + * a granularity of one bit per 4KiB of storage. + * 1 << 50 bytes backend storage (1 PiB) + * 1 << (50 - 12) bits needed + * 38 --> we need u64 to index and count bits + * 1 << (38 - 3) bitmap bytes needed + * 35 --> we still need u64 to index and count bytes + * (that's 32 GiB of bitmap for 1 PiB storage) + * 1 << (35 - 2) 32bit longs needed + * 33 --> we'd even need u64 to index and count 32bit long words. + * 1 << (35 - 3) 64bit longs needed + * 32 --> we could get away with a 32bit unsigned int to index and count + * 64bit long words, but I rather stay with unsigned long for now. + * We probably should neither count nor point to bytes or long words + * directly, but either by bitnumber, or by page index and offset. + * 1 << (35 - 12) + * 22 --> we need that much 4KiB pages of bitmap. + * 1 << (22 + 3) --> on a 64bit arch, + * we need 32 MiB to store the array of page pointers. + * + * Because I'm lazy, and because the resulting patch was too large, too ugly + * and still incomplete, on 32bit we still "only" support 16 TiB (minus some), + * (1 << 32) bits * 4k storage. + * + + * bitmap storage and IO: + * Bitmap is stored little endian on disk, and is kept little endian in + * core memory. Currently we still hold the full bitmap in core as long + * as we are "attached" to a local disk, which at 32 GiB for 1PiB storage + * seems excessive. + * + * We plan to reduce the amount of in-core bitmap pages by paging them in + * and out against their on-disk location as necessary, but need to make + * sure we don't cause too much meta data IO, and must not deadlock in + * tight memory situations. This needs some more work. + */ + +/* + * NOTE + * Access to the *bm_pages is protected by bm_lock. + * It is safe to read the other members within the lock. + * + * drbd_bm_set_bits is called from bio_endio callbacks, + * We may be called with irq already disabled, + * so we need spin_lock_irqsave(). + * And we need the kmap_atomic. + */ +struct drbd_bitmap { + struct page **bm_pages; + spinlock_t bm_lock; + + /* see LIMITATIONS: above */ + + unsigned long bm_set; /* nr of set bits; THINK maybe atomic_t? 
*/ + unsigned long bm_bits; + size_t bm_words; + size_t bm_number_of_pages; + sector_t bm_dev_capacity; + struct mutex bm_change; /* serializes resize operations */ + + wait_queue_head_t bm_io_wait; /* used to serialize IO of single pages */ + + enum bm_flag bm_flags; + + /* debugging aid, in case we are still racy somewhere */ + char *bm_why; + struct task_struct *bm_task; +}; + +#define bm_print_lock_info(m) __bm_print_lock_info(m, __func__) +static void __bm_print_lock_info(struct drbd_device *device, const char *func) +{ + struct drbd_bitmap *b = device->bitmap; + if (!__ratelimit(&drbd_ratelimit_state)) + return; + drbd_err(device, "FIXME %s[%d] in %s, bitmap locked for '%s' by %s[%d]\n", + current->comm, task_pid_nr(current), + func, b->bm_why ?: "?", + b->bm_task->comm, task_pid_nr(b->bm_task)); +} + +void drbd_bm_lock(struct drbd_device *device, char *why, enum bm_flag flags) +{ + struct drbd_bitmap *b = device->bitmap; + int trylock_failed; + + if (!b) { + drbd_err(device, "FIXME no bitmap in drbd_bm_lock!?\n"); + return; + } + + trylock_failed = !mutex_trylock(&b->bm_change); + + if (trylock_failed) { + drbd_warn(device, "%s[%d] going to '%s' but bitmap already locked for '%s' by %s[%d]\n", + current->comm, task_pid_nr(current), + why, b->bm_why ?: "?", + b->bm_task->comm, task_pid_nr(b->bm_task)); + mutex_lock(&b->bm_change); + } + if (BM_LOCKED_MASK & b->bm_flags) + drbd_err(device, "FIXME bitmap already locked in bm_lock\n"); + b->bm_flags |= flags & BM_LOCKED_MASK; + + b->bm_why = why; + b->bm_task = current; +} + +void drbd_bm_unlock(struct drbd_device *device) +{ + struct drbd_bitmap *b = device->bitmap; + if (!b) { + drbd_err(device, "FIXME no bitmap in drbd_bm_unlock!?\n"); + return; + } + + if (!(BM_LOCKED_MASK & device->bitmap->bm_flags)) + drbd_err(device, "FIXME bitmap not locked in bm_unlock\n"); + + b->bm_flags &= ~BM_LOCKED_MASK; + b->bm_why = NULL; + b->bm_task = NULL; + mutex_unlock(&b->bm_change); +} + +/* we store some "meta" info about our pages in page->private */ +/* at a granularity of 4k storage per bitmap bit: + * one peta byte storage: 1<<50 byte, 1<<38 * 4k storage blocks + * 1<<38 bits, + * 1<<23 4k bitmap pages. + * Use 24 bits as page index, covers 2 peta byte storage + * at a granularity of 4k per bit. + * Used to report the failed page idx on io error from the endio handlers. + */ +#define BM_PAGE_IDX_MASK ((1UL<<24)-1) +/* this page is currently read in, or written back */ +#define BM_PAGE_IO_LOCK 31 +/* if there has been an IO error for this page */ +#define BM_PAGE_IO_ERROR 30 +/* this is to be able to intelligently skip disk IO, + * set if bits have been set since last IO. */ +#define BM_PAGE_NEED_WRITEOUT 29 +/* to mark for lazy writeout once syncer cleared all clearable bits, + * we if bits have been cleared since last IO. */ +#define BM_PAGE_LAZY_WRITEOUT 28 +/* pages marked with this "HINT" will be considered for writeout + * on activity log transactions */ +#define BM_PAGE_HINT_WRITEOUT 27 + +/* store_page_idx uses non-atomic assignment. It is only used directly after + * allocating the page. All other bm_set_page_* and bm_clear_page_* need to + * use atomic bit manipulation, as set_out_of_sync (and therefore bitmap + * changes) may happen from various contexts, and wait_on_bit/wake_up_bit + * requires it all to be atomic as well. 
*/ +static void bm_store_page_idx(struct page *page, unsigned long idx) +{ + BUG_ON(0 != (idx & ~BM_PAGE_IDX_MASK)); + set_page_private(page, idx); +} + +static unsigned long bm_page_to_idx(struct page *page) +{ + return page_private(page) & BM_PAGE_IDX_MASK; +} + +/* As is very unlikely that the same page is under IO from more than one + * context, we can get away with a bit per page and one wait queue per bitmap. + */ +static void bm_page_lock_io(struct drbd_device *device, int page_nr) +{ + struct drbd_bitmap *b = device->bitmap; + void *addr = &page_private(b->bm_pages[page_nr]); + wait_event(b->bm_io_wait, !test_and_set_bit(BM_PAGE_IO_LOCK, addr)); +} + +static void bm_page_unlock_io(struct drbd_device *device, int page_nr) +{ + struct drbd_bitmap *b = device->bitmap; + void *addr = &page_private(b->bm_pages[page_nr]); + clear_bit_unlock(BM_PAGE_IO_LOCK, addr); + wake_up(&device->bitmap->bm_io_wait); +} + +/* set _before_ submit_io, so it may be reset due to being changed + * while this page is in flight... will get submitted later again */ +static void bm_set_page_unchanged(struct page *page) +{ + /* use cmpxchg? */ + clear_bit(BM_PAGE_NEED_WRITEOUT, &page_private(page)); + clear_bit(BM_PAGE_LAZY_WRITEOUT, &page_private(page)); +} + +static void bm_set_page_need_writeout(struct page *page) +{ + set_bit(BM_PAGE_NEED_WRITEOUT, &page_private(page)); +} + +/** + * drbd_bm_mark_for_writeout() - mark a page with a "hint" to be considered for writeout + * @device: DRBD device. + * @page_nr: the bitmap page to mark with the "hint" flag + * + * From within an activity log transaction, we mark a few pages with these + * hints, then call drbd_bm_write_hinted(), which will only write out changed + * pages which are flagged with this mark. + */ +void drbd_bm_mark_for_writeout(struct drbd_device *device, int page_nr) +{ + struct page *page; + if (page_nr >= device->bitmap->bm_number_of_pages) { + drbd_warn(device, "BAD: page_nr: %u, number_of_pages: %u\n", + page_nr, (int)device->bitmap->bm_number_of_pages); + return; + } + page = device->bitmap->bm_pages[page_nr]; + set_bit(BM_PAGE_HINT_WRITEOUT, &page_private(page)); +} + +static int bm_test_page_unchanged(struct page *page) +{ + volatile const unsigned long *addr = &page_private(page); + return (*addr & ((1UL<<BM_PAGE_NEED_WRITEOUT)|(1UL<<BM_PAGE_LAZY_WRITEOUT))) == 0; +} + +static void bm_set_page_io_err(struct page *page) +{ + set_bit(BM_PAGE_IO_ERROR, &page_private(page)); +} + +static void bm_clear_page_io_err(struct page *page) +{ + clear_bit(BM_PAGE_IO_ERROR, &page_private(page)); +} + +static void bm_set_page_lazy_writeout(struct page *page) +{ + set_bit(BM_PAGE_LAZY_WRITEOUT, &page_private(page)); +} + +static int bm_test_page_lazy_writeout(struct page *page) +{ + return test_bit(BM_PAGE_LAZY_WRITEOUT, &page_private(page)); +} + +/* on a 32bit box, this would allow for exactly (2<<38) bits. 
*/ +static unsigned int bm_word_to_page_idx(struct drbd_bitmap *b, unsigned long long_nr) +{ + /* page_nr = (word*sizeof(long)) >> PAGE_SHIFT; */ + unsigned int page_nr = long_nr >> (PAGE_SHIFT - LN2_BPL + 3); + BUG_ON(page_nr >= b->bm_number_of_pages); + return page_nr; +} + +static unsigned int bm_bit_to_page_idx(struct drbd_bitmap *b, u64 bitnr) +{ + /* page_nr = (bitnr/8) >> PAGE_SHIFT; */ + unsigned int page_nr = bitnr >> (PAGE_SHIFT + 3); + BUG_ON(page_nr >= b->bm_number_of_pages); + return page_nr; +} + +static unsigned long *__bm_map_pidx(struct drbd_bitmap *b, unsigned int idx) +{ + struct page *page = b->bm_pages[idx]; + return (unsigned long *) kmap_atomic(page); +} + +static unsigned long *bm_map_pidx(struct drbd_bitmap *b, unsigned int idx) +{ + return __bm_map_pidx(b, idx); +} + +static void __bm_unmap(unsigned long *p_addr) +{ + kunmap_atomic(p_addr); +}; + +static void bm_unmap(unsigned long *p_addr) +{ + return __bm_unmap(p_addr); +} + +/* long word offset of _bitmap_ sector */ +#define S2W(s) ((s)<<(BM_EXT_SHIFT-BM_BLOCK_SHIFT-LN2_BPL)) +/* word offset from start of bitmap to word number _in_page_ + * modulo longs per page +#define MLPP(X) ((X) % (PAGE_SIZE/sizeof(long)) + hm, well, Philipp thinks gcc might not optimize the % into & (... - 1) + so do it explicitly: + */ +#define MLPP(X) ((X) & ((PAGE_SIZE/sizeof(long))-1)) + +/* Long words per page */ +#define LWPP (PAGE_SIZE/sizeof(long)) + +/* + * actually most functions herein should take a struct drbd_bitmap*, not a + * struct drbd_device*, but for the debug macros I like to have the device around + * to be able to report device specific. + */ + + +static void bm_free_pages(struct page **pages, unsigned long number) +{ + unsigned long i; + if (!pages) + return; + + for (i = 0; i < number; i++) { + if (!pages[i]) { + pr_alert("bm_free_pages tried to free a NULL pointer; i=%lu n=%lu\n", + i, number); + continue; + } + __free_page(pages[i]); + pages[i] = NULL; + } +} + +static void bm_vk_free(void *ptr, int v) +{ + if (v) + vfree(ptr); + else + kfree(ptr); +} + +/* + * "have" and "want" are NUMBER OF PAGES. + */ +static struct page **bm_realloc_pages(struct drbd_bitmap *b, unsigned long want) +{ + struct page **old_pages = b->bm_pages; + struct page **new_pages, *page; + unsigned int i, bytes, vmalloced = 0; + unsigned long have = b->bm_number_of_pages; + + BUG_ON(have == 0 && old_pages != NULL); + BUG_ON(have != 0 && old_pages == NULL); + + if (have == want) + return old_pages; + + /* Trying kmalloc first, falling back to vmalloc. + * GFP_NOIO, as this is called while drbd IO is "suspended", + * and during resize or attach on diskless Primary, + * we must not block on IO to ourselves. + * Context is receiver thread or dmsetup. */ + bytes = sizeof(struct page *)*want; + new_pages = kzalloc(bytes, GFP_NOIO | __GFP_NOWARN); + if (!new_pages) { + new_pages = __vmalloc(bytes, + GFP_NOIO | __GFP_HIGHMEM | __GFP_ZERO, + PAGE_KERNEL); + if (!new_pages) + return NULL; + vmalloced = 1; + } + + if (want >= have) { + for (i = 0; i < have; i++) + new_pages[i] = old_pages[i]; + for (; i < want; i++) { + page = alloc_page(GFP_NOIO | __GFP_HIGHMEM); + if (!page) { + bm_free_pages(new_pages + have, i - have); + bm_vk_free(new_pages, vmalloced); + return NULL; + } + /* we want to know which page it is + * from the endio handlers */ + bm_store_page_idx(page, i); + new_pages[i] = page; + } + } else { + for (i = 0; i < want; i++) + new_pages[i] = old_pages[i]; + /* NOT HERE, we are outside the spinlock! 
+ bm_free_pages(old_pages + want, have - want); + */ + } + + if (vmalloced) + b->bm_flags |= BM_P_VMALLOCED; + else + b->bm_flags &= ~BM_P_VMALLOCED; + + return new_pages; +} + +/* + * called on driver init only. TODO call when a device is created. + * allocates the drbd_bitmap, and stores it in device->bitmap. + */ +int drbd_bm_init(struct drbd_device *device) +{ + struct drbd_bitmap *b = device->bitmap; + WARN_ON(b != NULL); + b = kzalloc(sizeof(struct drbd_bitmap), GFP_KERNEL); + if (!b) + return -ENOMEM; + spin_lock_init(&b->bm_lock); + mutex_init(&b->bm_change); + init_waitqueue_head(&b->bm_io_wait); + + device->bitmap = b; + + return 0; +} + +sector_t drbd_bm_capacity(struct drbd_device *device) +{ + if (!expect(device->bitmap)) + return 0; + return device->bitmap->bm_dev_capacity; +} + +/* called on driver unload. TODO: call when a device is destroyed. + */ +void drbd_bm_cleanup(struct drbd_device *device) +{ + if (!expect(device->bitmap)) + return; + bm_free_pages(device->bitmap->bm_pages, device->bitmap->bm_number_of_pages); + bm_vk_free(device->bitmap->bm_pages, (BM_P_VMALLOCED & device->bitmap->bm_flags)); + kfree(device->bitmap); + device->bitmap = NULL; +} + +/* + * since (b->bm_bits % BITS_PER_LONG) != 0, + * this masks out the remaining bits. + * Returns the number of bits cleared. + */ +#define BITS_PER_PAGE (1UL << (PAGE_SHIFT + 3)) +#define BITS_PER_PAGE_MASK (BITS_PER_PAGE - 1) +#define BITS_PER_LONG_MASK (BITS_PER_LONG - 1) +static int bm_clear_surplus(struct drbd_bitmap *b) +{ + unsigned long mask; + unsigned long *p_addr, *bm; + int tmp; + int cleared = 0; + + /* number of bits modulo bits per page */ + tmp = (b->bm_bits & BITS_PER_PAGE_MASK); + /* mask the used bits of the word containing the last bit */ + mask = (1UL << (tmp & BITS_PER_LONG_MASK)) -1; + /* bitmap is always stored little endian, + * on disk and in core memory alike */ + mask = cpu_to_lel(mask); + + p_addr = bm_map_pidx(b, b->bm_number_of_pages - 1); + bm = p_addr + (tmp/BITS_PER_LONG); + if (mask) { + /* If mask != 0, we are not exactly aligned, so bm now points + * to the long containing the last bit. + * If mask == 0, bm already points to the word immediately + * after the last (long word aligned) bit. */ + cleared = hweight_long(*bm & ~mask); + *bm &= mask; + bm++; + } + + if (BITS_PER_LONG == 32 && ((bm - p_addr) & 1) == 1) { + /* on a 32bit arch, we may need to zero out + * a padding long to align with a 64bit remote */ + cleared += hweight_long(*bm); + *bm = 0; + } + bm_unmap(p_addr); + return cleared; +} + +static void bm_set_surplus(struct drbd_bitmap *b) +{ + unsigned long mask; + unsigned long *p_addr, *bm; + int tmp; + + /* number of bits modulo bits per page */ + tmp = (b->bm_bits & BITS_PER_PAGE_MASK); + /* mask the used bits of the word containing the last bit */ + mask = (1UL << (tmp & BITS_PER_LONG_MASK)) -1; + /* bitmap is always stored little endian, + * on disk and in core memory alike */ + mask = cpu_to_lel(mask); + + p_addr = bm_map_pidx(b, b->bm_number_of_pages - 1); + bm = p_addr + (tmp/BITS_PER_LONG); + if (mask) { + /* If mask != 0, we are not exactly aligned, so bm now points + * to the long containing the last bit. + * If mask == 0, bm already points to the word immediately + * after the last (long word aligned) bit. 
*/ + *bm |= ~mask; + bm++; + } + + if (BITS_PER_LONG == 32 && ((bm - p_addr) & 1) == 1) { + /* on a 32bit arch, we may need to zero out + * a padding long to align with a 64bit remote */ + *bm = ~0UL; + } + bm_unmap(p_addr); +} + +/* you better not modify the bitmap while this is running, + * or its results will be stale */ +static unsigned long bm_count_bits(struct drbd_bitmap *b) +{ + unsigned long *p_addr; + unsigned long bits = 0; + unsigned long mask = (1UL << (b->bm_bits & BITS_PER_LONG_MASK)) -1; + int idx, i, last_word; + + /* all but last page */ + for (idx = 0; idx < b->bm_number_of_pages - 1; idx++) { + p_addr = __bm_map_pidx(b, idx); + for (i = 0; i < LWPP; i++) + bits += hweight_long(p_addr[i]); + __bm_unmap(p_addr); + cond_resched(); + } + /* last (or only) page */ + last_word = ((b->bm_bits - 1) & BITS_PER_PAGE_MASK) >> LN2_BPL; + p_addr = __bm_map_pidx(b, idx); + for (i = 0; i < last_word; i++) + bits += hweight_long(p_addr[i]); + p_addr[last_word] &= cpu_to_lel(mask); + bits += hweight_long(p_addr[last_word]); + /* 32bit arch, may have an unused padding long */ + if (BITS_PER_LONG == 32 && (last_word & 1) == 0) + p_addr[last_word+1] = 0; + __bm_unmap(p_addr); + return bits; +} + +/* offset and len in long words.*/ +static void bm_memset(struct drbd_bitmap *b, size_t offset, int c, size_t len) +{ + unsigned long *p_addr, *bm; + unsigned int idx; + size_t do_now, end; + + end = offset + len; + + if (end > b->bm_words) { + pr_alert("bm_memset end > bm_words\n"); + return; + } + + while (offset < end) { + do_now = min_t(size_t, ALIGN(offset + 1, LWPP), end) - offset; + idx = bm_word_to_page_idx(b, offset); + p_addr = bm_map_pidx(b, idx); + bm = p_addr + MLPP(offset); + if (bm+do_now > p_addr + LWPP) { + pr_alert("BUG BUG BUG! p_addr:%p bm:%p do_now:%d\n", + p_addr, bm, (int)do_now); + } else + memset(bm, c, do_now * sizeof(long)); + bm_unmap(p_addr); + bm_set_page_need_writeout(b->bm_pages[idx]); + offset += do_now; + } +} + +/* For the layout, see comment above drbd_md_set_sector_offsets(). */ +static u64 drbd_md_on_disk_bits(struct drbd_backing_dev *ldev) +{ + u64 bitmap_sectors; + if (ldev->md.al_offset == 8) + bitmap_sectors = ldev->md.md_size_sect - ldev->md.bm_offset; + else + bitmap_sectors = ldev->md.al_offset - ldev->md.bm_offset; + return bitmap_sectors << (9 + 3); +} + +/* + * make sure the bitmap has enough room for the attached storage, + * if necessary, resize. + * called whenever we may have changed the device size. + * returns -ENOMEM if we could not allocate enough memory, 0 on success. + * In case this is actually a resize, we copy the old bitmap into the new one. + * Otherwise, the bitmap is initialized to all bits set. 
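drbd_md_on_disk_bits() turns a sector count into a bit count with a single shift: a 512-byte sector holds 512 * 8 = 4096 bits, hence the << (9 + 3). A small sketch of that conversion; the sector count and the 4 KiB-per-bitmap-bit granularity used in the comment are illustrative assumptions:

#include <stdio.h>
#include <stdint.h>

/* bits that fit into a given number of 512-byte bitmap sectors */
static uint64_t bitmap_sectors_to_bits(uint64_t bitmap_sectors)
{
	return bitmap_sectors << (9 + 3);   /* * 512 bytes * 8 bits */
}

int main(void)
{
	/* e.g. 32768 bitmap sectors (16 MiB of bitmap) hold 134217728 bits,
	 * which at 4 KiB of data per bit covers 512 GiB of backing storage */
	uint64_t bits = bitmap_sectors_to_bits(32768);
	printf("%llu bits\n", (unsigned long long)bits);
	return 0;
}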
+ */ +int drbd_bm_resize(struct drbd_device *device, sector_t capacity, int set_new_bits) +{ + struct drbd_bitmap *b = device->bitmap; + unsigned long bits, words, owords, obits; + unsigned long want, have, onpages; /* number of pages */ + struct page **npages, **opages = NULL; + int err = 0, growing; + int opages_vmalloced; + + if (!expect(b)) + return -ENOMEM; + + drbd_bm_lock(device, "resize", BM_LOCKED_MASK); + + drbd_info(device, "drbd_bm_resize called with capacity == %llu\n", + (unsigned long long)capacity); + + if (capacity == b->bm_dev_capacity) + goto out; + + opages_vmalloced = (BM_P_VMALLOCED & b->bm_flags); + + if (capacity == 0) { + spin_lock_irq(&b->bm_lock); + opages = b->bm_pages; + onpages = b->bm_number_of_pages; + owords = b->bm_words; + b->bm_pages = NULL; + b->bm_number_of_pages = + b->bm_set = + b->bm_bits = + b->bm_words = + b->bm_dev_capacity = 0; + spin_unlock_irq(&b->bm_lock); + bm_free_pages(opages, onpages); + bm_vk_free(opages, opages_vmalloced); + goto out; + } + bits = BM_SECT_TO_BIT(ALIGN(capacity, BM_SECT_PER_BIT)); + + /* if we would use + words = ALIGN(bits,BITS_PER_LONG) >> LN2_BPL; + a 32bit host could present the wrong number of words + to a 64bit host. + */ + words = ALIGN(bits, 64) >> LN2_BPL; + + if (get_ldev(device)) { + u64 bits_on_disk = drbd_md_on_disk_bits(device->ldev); + put_ldev(device); + if (bits > bits_on_disk) { + drbd_info(device, "bits = %lu\n", bits); + drbd_info(device, "bits_on_disk = %llu\n", bits_on_disk); + err = -ENOSPC; + goto out; + } + } + + want = ALIGN(words*sizeof(long), PAGE_SIZE) >> PAGE_SHIFT; + have = b->bm_number_of_pages; + if (want == have) { + D_ASSERT(device, b->bm_pages != NULL); + npages = b->bm_pages; + } else { + if (drbd_insert_fault(device, DRBD_FAULT_BM_ALLOC)) + npages = NULL; + else + npages = bm_realloc_pages(b, want); + } + + if (!npages) { + err = -ENOMEM; + goto out; + } + + spin_lock_irq(&b->bm_lock); + opages = b->bm_pages; + owords = b->bm_words; + obits = b->bm_bits; + + growing = bits > obits; + if (opages && growing && set_new_bits) + bm_set_surplus(b); + + b->bm_pages = npages; + b->bm_number_of_pages = want; + b->bm_bits = bits; + b->bm_words = words; + b->bm_dev_capacity = capacity; + + if (growing) { + if (set_new_bits) { + bm_memset(b, owords, 0xff, words-owords); + b->bm_set += bits - obits; + } else + bm_memset(b, owords, 0x00, words-owords); + + } + + if (want < have) { + /* implicit: (opages != NULL) && (opages != npages) */ + bm_free_pages(opages + want, have - want); + } + + (void)bm_clear_surplus(b); + + spin_unlock_irq(&b->bm_lock); + if (opages != npages) + bm_vk_free(opages, opages_vmalloced); + if (!growing) + b->bm_set = bm_count_bits(b); + drbd_info(device, "resync bitmap: bits=%lu words=%lu pages=%lu\n", bits, words, want); + + out: + drbd_bm_unlock(device); + return err; +} + +/* inherently racy: + * if not protected by other means, return value may be out of date when + * leaving this function... + * we still need to lock it, since it is important that this returns + * bm_set == 0 precisely. + * + * maybe bm_set should be atomic_t ? 
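The sizing chain in drbd_bm_resize() goes capacity (sectors) -> bits -> words -> pages, and aligns the word count to 64 bits rather than BITS_PER_LONG so a 32-bit and a 64-bit peer agree on the word count. A user-space sketch of that chain for one capacity, assuming the usual constants (8 sectors of data per bitmap bit, 4 KiB pages, 64-bit host):

#include <stdio.h>
#include <stdint.h>

#define SECT_PER_BIT 8UL                 /* assumed: 4 KiB of data per bitmap bit */
#define PAGE_SIZE    4096UL              /* assumed */
#define ALIGN_UP(x, a) (((x) + (a) - 1) / (a) * (a))

int main(void)
{
	uint64_t capacity = 2097152;     /* 1 GiB expressed in 512-byte sectors */
	uint64_t bits  = ALIGN_UP(capacity, SECT_PER_BIT) / SECT_PER_BIT;
	uint64_t words = ALIGN_UP(bits, 64) / 64;        /* 64-bit words, peer-compatible */
	uint64_t pages = ALIGN_UP(words * 8, PAGE_SIZE) / PAGE_SIZE;

	/* 1 GiB -> 262144 bits -> 4096 words -> 8 bitmap pages */
	printf("bits=%llu words=%llu pages=%llu\n",
	       (unsigned long long)bits, (unsigned long long)words,
	       (unsigned long long)pages);
	return 0;
}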
+ */ +unsigned long _drbd_bm_total_weight(struct drbd_device *device) +{ + struct drbd_bitmap *b = device->bitmap; + unsigned long s; + unsigned long flags; + + if (!expect(b)) + return 0; + if (!expect(b->bm_pages)) + return 0; + + spin_lock_irqsave(&b->bm_lock, flags); + s = b->bm_set; + spin_unlock_irqrestore(&b->bm_lock, flags); + + return s; +} + +unsigned long drbd_bm_total_weight(struct drbd_device *device) +{ + unsigned long s; + /* if I don't have a disk, I don't know about out-of-sync status */ + if (!get_ldev_if_state(device, D_NEGOTIATING)) + return 0; + s = _drbd_bm_total_weight(device); + put_ldev(device); + return s; +} + +size_t drbd_bm_words(struct drbd_device *device) +{ + struct drbd_bitmap *b = device->bitmap; + if (!expect(b)) + return 0; + if (!expect(b->bm_pages)) + return 0; + + return b->bm_words; +} + +unsigned long drbd_bm_bits(struct drbd_device *device) +{ + struct drbd_bitmap *b = device->bitmap; + if (!expect(b)) + return 0; + + return b->bm_bits; +} + +/* merge number words from buffer into the bitmap starting at offset. + * buffer[i] is expected to be little endian unsigned long. + * bitmap must be locked by drbd_bm_lock. + * currently only used from receive_bitmap. + */ +void drbd_bm_merge_lel(struct drbd_device *device, size_t offset, size_t number, + unsigned long *buffer) +{ + struct drbd_bitmap *b = device->bitmap; + unsigned long *p_addr, *bm; + unsigned long word, bits; + unsigned int idx; + size_t end, do_now; + + end = offset + number; + + if (!expect(b)) + return; + if (!expect(b->bm_pages)) + return; + if (number == 0) + return; + WARN_ON(offset >= b->bm_words); + WARN_ON(end > b->bm_words); + + spin_lock_irq(&b->bm_lock); + while (offset < end) { + do_now = min_t(size_t, ALIGN(offset+1, LWPP), end) - offset; + idx = bm_word_to_page_idx(b, offset); + p_addr = bm_map_pidx(b, idx); + bm = p_addr + MLPP(offset); + offset += do_now; + while (do_now--) { + bits = hweight_long(*bm); + word = *bm | *buffer++; + *bm++ = word; + b->bm_set += hweight_long(word) - bits; + } + bm_unmap(p_addr); + bm_set_page_need_writeout(b->bm_pages[idx]); + } + /* with 32bit <-> 64bit cross-platform connect + * this is only correct for current usage, + * where we _know_ that we are 64 bit aligned, + * and know that this function is used in this way, too... + */ + if (end == b->bm_words) + b->bm_set -= bm_clear_surplus(b); + spin_unlock_irq(&b->bm_lock); +} + +/* copy number words from the bitmap starting at offset into the buffer. + * buffer[i] will be little endian unsigned long. 
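When drbd_bm_merge_lel() ORs a received word into the local bitmap, it keeps bm_set exact by comparing population counts before and after the merge, so bits already set locally are not counted twice. A tiny user-space model of that accounting, using the compiler builtin __builtin_popcountl in place of hweight_long():

#include <stdio.h>

/* OR the peer's word into ours, return how many bits became newly set */
static int merge_word(unsigned long *local, unsigned long peer)
{
	int before = __builtin_popcountl(*local);
	*local |= peer;
	return __builtin_popcountl(*local) - before;
}

int main(void)
{
	unsigned long local = 0xf0f0UL;
	int newly_set = merge_word(&local, 0xff00UL);
	/* 0xf0f0 | 0xff00 = 0xfff0: 4 new bits, the overlap is not re-counted */
	printf("newly set: %d, word now 0x%lx\n", newly_set, local);
	return 0;
}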
+ */ +void drbd_bm_get_lel(struct drbd_device *device, size_t offset, size_t number, + unsigned long *buffer) +{ + struct drbd_bitmap *b = device->bitmap; + unsigned long *p_addr, *bm; + size_t end, do_now; + + end = offset + number; + + if (!expect(b)) + return; + if (!expect(b->bm_pages)) + return; + + spin_lock_irq(&b->bm_lock); + if ((offset >= b->bm_words) || + (end > b->bm_words) || + (number <= 0)) + drbd_err(device, "offset=%lu number=%lu bm_words=%lu\n", + (unsigned long) offset, + (unsigned long) number, + (unsigned long) b->bm_words); + else { + while (offset < end) { + do_now = min_t(size_t, ALIGN(offset+1, LWPP), end) - offset; + p_addr = bm_map_pidx(b, bm_word_to_page_idx(b, offset)); + bm = p_addr + MLPP(offset); + offset += do_now; + while (do_now--) + *buffer++ = *bm++; + bm_unmap(p_addr); + } + } + spin_unlock_irq(&b->bm_lock); +} + +/* set all bits in the bitmap */ +void drbd_bm_set_all(struct drbd_device *device) +{ + struct drbd_bitmap *b = device->bitmap; + if (!expect(b)) + return; + if (!expect(b->bm_pages)) + return; + + spin_lock_irq(&b->bm_lock); + bm_memset(b, 0, 0xff, b->bm_words); + (void)bm_clear_surplus(b); + b->bm_set = b->bm_bits; + spin_unlock_irq(&b->bm_lock); +} + +/* clear all bits in the bitmap */ +void drbd_bm_clear_all(struct drbd_device *device) +{ + struct drbd_bitmap *b = device->bitmap; + if (!expect(b)) + return; + if (!expect(b->bm_pages)) + return; + + spin_lock_irq(&b->bm_lock); + bm_memset(b, 0, 0, b->bm_words); + b->bm_set = 0; + spin_unlock_irq(&b->bm_lock); +} + +static void drbd_bm_aio_ctx_destroy(struct kref *kref) +{ + struct drbd_bm_aio_ctx *ctx = container_of(kref, struct drbd_bm_aio_ctx, kref); + unsigned long flags; + + spin_lock_irqsave(&ctx->device->resource->req_lock, flags); + list_del(&ctx->list); + spin_unlock_irqrestore(&ctx->device->resource->req_lock, flags); + put_ldev(ctx->device); + kfree(ctx); +} + +/* bv_page may be a copy, or may be the original */ +static void drbd_bm_endio(struct bio *bio, int error) +{ + struct drbd_bm_aio_ctx *ctx = bio->bi_private; + struct drbd_device *device = ctx->device; + struct drbd_bitmap *b = device->bitmap; + unsigned int idx = bm_page_to_idx(bio->bi_io_vec[0].bv_page); + int uptodate = bio_flagged(bio, BIO_UPTODATE); + + + /* strange behavior of some lower level drivers... + * fail the request by clearing the uptodate flag, + * but do not return any error?! + * do we want to WARN() on this? */ + if (!error && !uptodate) + error = -EIO; + + if ((ctx->flags & BM_AIO_COPY_PAGES) == 0 && + !bm_test_page_unchanged(b->bm_pages[idx])) + drbd_warn(device, "bitmap page idx %u changed during IO!\n", idx); + + if (error) { + /* ctx error will hold the completed-last non-zero error code, + * in case error codes differ. */ + ctx->error = error; + bm_set_page_io_err(b->bm_pages[idx]); + /* Not identical to on disk version of it. + * Is BM_PAGE_IO_ERROR enough? 
*/ + if (__ratelimit(&drbd_ratelimit_state)) + drbd_err(device, "IO ERROR %d on bitmap page idx %u\n", + error, idx); + } else { + bm_clear_page_io_err(b->bm_pages[idx]); + dynamic_drbd_dbg(device, "bitmap page idx %u completed\n", idx); + } + + bm_page_unlock_io(device, idx); + + if (ctx->flags & BM_AIO_COPY_PAGES) + mempool_free(bio->bi_io_vec[0].bv_page, drbd_md_io_page_pool); + + bio_put(bio); + + if (atomic_dec_and_test(&ctx->in_flight)) { + ctx->done = 1; + wake_up(&device->misc_wait); + kref_put(&ctx->kref, &drbd_bm_aio_ctx_destroy); + } +} + +static void bm_page_io_async(struct drbd_bm_aio_ctx *ctx, int page_nr) __must_hold(local) +{ + struct bio *bio = bio_alloc_drbd(GFP_NOIO); + struct drbd_device *device = ctx->device; + struct drbd_bitmap *b = device->bitmap; + struct page *page; + unsigned int len; + unsigned int rw = (ctx->flags & BM_AIO_READ) ? READ : WRITE; + + sector_t on_disk_sector = + device->ldev->md.md_offset + device->ldev->md.bm_offset; + on_disk_sector += ((sector_t)page_nr) << (PAGE_SHIFT-9); + + /* this might happen with very small + * flexible external meta data device, + * or with PAGE_SIZE > 4k */ + len = min_t(unsigned int, PAGE_SIZE, + (drbd_md_last_sector(device->ldev) - on_disk_sector + 1)<<9); + + /* serialize IO on this page */ + bm_page_lock_io(device, page_nr); + /* before memcpy and submit, + * so it can be redirtied any time */ + bm_set_page_unchanged(b->bm_pages[page_nr]); + + if (ctx->flags & BM_AIO_COPY_PAGES) { + page = mempool_alloc(drbd_md_io_page_pool, __GFP_HIGHMEM|__GFP_WAIT); + copy_highpage(page, b->bm_pages[page_nr]); + bm_store_page_idx(page, page_nr); + } else + page = b->bm_pages[page_nr]; + bio->bi_bdev = device->ldev->md_bdev; + bio->bi_iter.bi_sector = on_disk_sector; + /* bio_add_page of a single page to an empty bio will always succeed, + * according to api. Do we want to assert that? */ + bio_add_page(bio, page, len, 0); + bio->bi_private = ctx; + bio->bi_end_io = drbd_bm_endio; + + if (drbd_insert_fault(device, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD)) { + bio->bi_rw |= rw; + bio_endio(bio, -EIO); + } else { + submit_bio(rw, bio); + /* this should not count as user activity and cause the + * resync to throttle -- see drbd_rs_should_slow_down(). */ + atomic_add(len >> 9, &device->rs_sect_ev); + } +} + +/* + * bm_rw: read/write the whole bitmap from/to its on disk location. + */ +static int bm_rw(struct drbd_device *device, const unsigned int flags, unsigned lazy_writeout_upper_idx) __must_hold(local) +{ + struct drbd_bm_aio_ctx *ctx; + struct drbd_bitmap *b = device->bitmap; + int num_pages, i, count = 0; + unsigned long now; + char ppb[10]; + int err = 0; + + /* + * We are protected against bitmap disappearing/resizing by holding an + * ldev reference (caller must have called get_ldev()). + * For read/write, we are protected against changes to the bitmap by + * the bitmap lock (see drbd_bitmap_io). + * For lazy writeout, we don't care for ongoing changes to the bitmap, + * as we submit copies of pages anyways. 
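Each bitmap page has a fixed position in the metadata area: the bitmap start (md_offset + bm_offset) plus the page index times the sectors per page, PAGE_SHIFT - 9 (8 sectors for 4 KiB pages). A sketch of that placement; the offsets used below are made-up example values, not the real metadata layout:

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT 12                    /* assumed: 4 KiB pages */

/* sector of bitmap page page_nr, given where the metadata and the bitmap
 * start (md_offset and bm_offset here are purely illustrative numbers) */
static uint64_t bm_page_sector(uint64_t md_offset, int64_t bm_offset,
			       unsigned int page_nr)
{
	return md_offset + bm_offset +
	       ((uint64_t)page_nr << (PAGE_SHIFT - 9));
}

int main(void)
{
	/* page 0 starts at the bitmap offset, page 1 follows 8 sectors later */
	printf("page 0 -> sector %llu\n",
	       (unsigned long long)bm_page_sector(1000, 8, 0));
	printf("page 1 -> sector %llu\n",
	       (unsigned long long)bm_page_sector(1000, 8, 1));
	return 0;
}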
+ */ + + ctx = kmalloc(sizeof(struct drbd_bm_aio_ctx), GFP_NOIO); + if (!ctx) + return -ENOMEM; + + *ctx = (struct drbd_bm_aio_ctx) { + .device = device, + .start_jif = jiffies, + .in_flight = ATOMIC_INIT(1), + .done = 0, + .flags = flags, + .error = 0, + .kref = { ATOMIC_INIT(2) }, + }; + + if (!get_ldev_if_state(device, D_ATTACHING)) { /* put is in drbd_bm_aio_ctx_destroy() */ + drbd_err(device, "ASSERT FAILED: get_ldev_if_state() == 1 in bm_rw()\n"); + kfree(ctx); + return -ENODEV; + } + /* Here D_ATTACHING is sufficient since drbd_bm_read() is called only from + drbd_adm_attach(), after device->ldev was assigned. */ + + if (0 == (ctx->flags & ~BM_AIO_READ)) + WARN_ON(!(BM_LOCKED_MASK & b->bm_flags)); + + spin_lock_irq(&device->resource->req_lock); + list_add_tail(&ctx->list, &device->pending_bitmap_io); + spin_unlock_irq(&device->resource->req_lock); + + num_pages = b->bm_number_of_pages; + + now = jiffies; + + /* let the layers below us try to merge these bios... */ + for (i = 0; i < num_pages; i++) { + /* ignore completely unchanged pages */ + if (lazy_writeout_upper_idx && i == lazy_writeout_upper_idx) + break; + if (!(flags & BM_AIO_READ)) { + if ((flags & BM_AIO_WRITE_HINTED) && + !test_and_clear_bit(BM_PAGE_HINT_WRITEOUT, + &page_private(b->bm_pages[i]))) + continue; + + if (!(flags & BM_AIO_WRITE_ALL_PAGES) && + bm_test_page_unchanged(b->bm_pages[i])) { + dynamic_drbd_dbg(device, "skipped bm write for idx %u\n", i); + continue; + } + /* during lazy writeout, + * ignore those pages not marked for lazy writeout. */ + if (lazy_writeout_upper_idx && + !bm_test_page_lazy_writeout(b->bm_pages[i])) { + dynamic_drbd_dbg(device, "skipped bm lazy write for idx %u\n", i); + continue; + } + } + atomic_inc(&ctx->in_flight); + bm_page_io_async(ctx, i); + ++count; + cond_resched(); + } + + /* + * We initialize ctx->in_flight to one to make sure drbd_bm_endio + * will not set ctx->done early, and decrement / test it here. If there + * are still some bios in flight, we need to wait for them here. + * If all IO is done already (or nothing had been submitted), there is + * no need to wait. Still, we need to put the kref associated with the + * "in_flight reached zero, all done" event. + */ + if (!atomic_dec_and_test(&ctx->in_flight)) + wait_until_done_or_force_detached(device, device->ldev, &ctx->done); + else + kref_put(&ctx->kref, &drbd_bm_aio_ctx_destroy); + + /* summary for global bitmap IO */ + if (flags == 0) + drbd_info(device, "bitmap %s of %u pages took %lu jiffies\n", + (flags & BM_AIO_READ) ? "READ" : "WRITE", + count, jiffies - now); + + if (ctx->error) { + drbd_alert(device, "we had at least one MD IO ERROR during bitmap IO\n"); + drbd_chk_io_error(device, 1, DRBD_META_IO_ERROR); + err = -EIO; /* ctx->error ? */ + } + + if (atomic_read(&ctx->in_flight)) + err = -EIO; /* Disk timeout/force-detach during IO... */ + + now = jiffies; + if (flags & BM_AIO_READ) { + b->bm_set = bm_count_bits(b); + drbd_info(device, "recounting of set bits took additional %lu jiffies\n", + jiffies - now); + } + now = b->bm_set; + + if ((flags & ~BM_AIO_READ) == 0) + drbd_info(device, "%s (%lu bits) marked out-of-sync by on disk bit-map.\n", + ppsize(ppb, now << (BM_BLOCK_SHIFT-10)), now); + + kref_put(&ctx->kref, &drbd_bm_aio_ctx_destroy); + return err; +} + +/** + * drbd_bm_read() - Read the whole bitmap from its on disk location. + * @device: DRBD device. 
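bm_rw() relies on the common "bias the counter by one" trick: in_flight starts at 1 so completions racing with the submit loop can never see it reach zero early; only the submitter's final atomic_dec_and_test() (or the last endio after it) declares the batch done. A stripped-down kernel-style sketch of the pattern, independent of DRBD's structures; submit_one() is a hypothetical asynchronous worker whose completion path calls one_done():

#include <linux/atomic.h>
#include <linux/completion.h>

struct batch {
	atomic_t in_flight;              /* starts at 1: the submitter's bias */
	struct completion done;
};

static void one_done(struct batch *b)    /* called from each completion */
{
	if (atomic_dec_and_test(&b->in_flight))
		complete(&b->done);      /* last one out reports the batch */
}

static void submit_batch(struct batch *b, int n)
{
	int i;

	atomic_set(&b->in_flight, 1);    /* bias, so we cannot hit 0 early */
	init_completion(&b->done);

	for (i = 0; i < n; i++) {
		atomic_inc(&b->in_flight);
		/* submit_one(b, i);  -- async work, calls one_done() later */
	}

	/* drop the bias; wait only if work is still outstanding */
	if (!atomic_dec_and_test(&b->in_flight))
		wait_for_completion(&b->done);
}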
+ */ +int drbd_bm_read(struct drbd_device *device) __must_hold(local) +{ + return bm_rw(device, BM_AIO_READ, 0); +} + +/** + * drbd_bm_write() - Write the whole bitmap to its on disk location. + * @device: DRBD device. + * + * Will only write pages that have changed since last IO. + */ +int drbd_bm_write(struct drbd_device *device) __must_hold(local) +{ + return bm_rw(device, 0, 0); +} + +/** + * drbd_bm_write_all() - Write the whole bitmap to its on disk location. + * @device: DRBD device. + * + * Will write all pages. + */ +int drbd_bm_write_all(struct drbd_device *device) __must_hold(local) +{ + return bm_rw(device, BM_AIO_WRITE_ALL_PAGES, 0); +} + +/** + * drbd_bm_write_lazy() - Write bitmap pages 0 to @upper_idx-1, if they have changed. + * @device: DRBD device. + * @upper_idx: 0: write all changed pages; +ve: page index to stop scanning for changed pages + */ +int drbd_bm_write_lazy(struct drbd_device *device, unsigned upper_idx) __must_hold(local) +{ + return bm_rw(device, BM_AIO_COPY_PAGES, upper_idx); +} + +/** + * drbd_bm_write_copy_pages() - Write the whole bitmap to its on disk location. + * @device: DRBD device. + * + * Will only write pages that have changed since last IO. + * In contrast to drbd_bm_write(), this will copy the bitmap pages + * to temporary writeout pages. It is intended to trigger a full write-out + * while still allowing the bitmap to change, for example if a resync or online + * verify is aborted due to a failed peer disk, while local IO continues, or + * pending resync acks are still being processed. + */ +int drbd_bm_write_copy_pages(struct drbd_device *device) __must_hold(local) +{ + return bm_rw(device, BM_AIO_COPY_PAGES, 0); +} + +/** + * drbd_bm_write_hinted() - Write bitmap pages with "hint" marks, if they have changed. + * @device: DRBD device. + */ +int drbd_bm_write_hinted(struct drbd_device *device) __must_hold(local) +{ + return bm_rw(device, BM_AIO_WRITE_HINTED | BM_AIO_COPY_PAGES, 0); +} + +/* NOTE + * find_first_bit returns int, we return unsigned long. + * For this to work on 32bit arch with bitnumbers > (1<<32), + * we'd need to return u64, and get a whole lot of other places + * fixed where we still use unsigned long. + * + * this returns a bit number, NOT a sector! 
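All of the wrappers above are thin flag selections around bm_rw() and carry __must_hold(local): the caller has to hold a reference on the backing device. A hedged usage sketch of that bracket only; real callers normally go through drbd_bitmap_io(), which additionally takes the bitmap lock as described in bm_rw()'s comment:

/* sketch only: how a caller brackets a bitmap write-out with get_ldev() */
static int flush_bitmap_example(struct drbd_device *device)
{
	int err = -ENODEV;

	if (get_ldev(device)) {                 /* pin the backing device */
		err = drbd_bm_write(device);    /* only pages changed since last IO */
		put_ldev(device);
	}
	return err;
}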
+ */ +static unsigned long __bm_find_next(struct drbd_device *device, unsigned long bm_fo, + const int find_zero_bit) +{ + struct drbd_bitmap *b = device->bitmap; + unsigned long *p_addr; + unsigned long bit_offset; + unsigned i; + + + if (bm_fo > b->bm_bits) { + drbd_err(device, "bm_fo=%lu bm_bits=%lu\n", bm_fo, b->bm_bits); + bm_fo = DRBD_END_OF_BITMAP; + } else { + while (bm_fo < b->bm_bits) { + /* bit offset of the first bit in the page */ + bit_offset = bm_fo & ~BITS_PER_PAGE_MASK; + p_addr = __bm_map_pidx(b, bm_bit_to_page_idx(b, bm_fo)); + + if (find_zero_bit) + i = find_next_zero_bit_le(p_addr, + PAGE_SIZE*8, bm_fo & BITS_PER_PAGE_MASK); + else + i = find_next_bit_le(p_addr, + PAGE_SIZE*8, bm_fo & BITS_PER_PAGE_MASK); + + __bm_unmap(p_addr); + if (i < PAGE_SIZE*8) { + bm_fo = bit_offset + i; + if (bm_fo >= b->bm_bits) + break; + goto found; + } + bm_fo = bit_offset + PAGE_SIZE*8; + } + bm_fo = DRBD_END_OF_BITMAP; + } + found: + return bm_fo; +} + +static unsigned long bm_find_next(struct drbd_device *device, + unsigned long bm_fo, const int find_zero_bit) +{ + struct drbd_bitmap *b = device->bitmap; + unsigned long i = DRBD_END_OF_BITMAP; + + if (!expect(b)) + return i; + if (!expect(b->bm_pages)) + return i; + + spin_lock_irq(&b->bm_lock); + if (BM_DONT_TEST & b->bm_flags) + bm_print_lock_info(device); + + i = __bm_find_next(device, bm_fo, find_zero_bit); + + spin_unlock_irq(&b->bm_lock); + return i; +} + +unsigned long drbd_bm_find_next(struct drbd_device *device, unsigned long bm_fo) +{ + return bm_find_next(device, bm_fo, 0); +} + +#if 0 +/* not yet needed for anything. */ +unsigned long drbd_bm_find_next_zero(struct drbd_device *device, unsigned long bm_fo) +{ + return bm_find_next(device, bm_fo, 1); +} +#endif + +/* does not spin_lock_irqsave. + * you must take drbd_bm_lock() first */ +unsigned long _drbd_bm_find_next(struct drbd_device *device, unsigned long bm_fo) +{ + /* WARN_ON(!(BM_DONT_SET & device->b->bm_flags)); */ + return __bm_find_next(device, bm_fo, 0); +} + +unsigned long _drbd_bm_find_next_zero(struct drbd_device *device, unsigned long bm_fo) +{ + /* WARN_ON(!(BM_DONT_SET & device->b->bm_flags)); */ + return __bm_find_next(device, bm_fo, 1); +} + +/* returns number of bits actually changed. + * for val != 0, we change 0 -> 1, return code positive + * for val == 0, we change 1 -> 0, return code negative + * wants bitnr, not sector. + * expected to be called for only a few bits (e - s about BITS_PER_LONG). + * Must hold bitmap lock already. */ +static int __bm_change_bits_to(struct drbd_device *device, const unsigned long s, + unsigned long e, int val) +{ + struct drbd_bitmap *b = device->bitmap; + unsigned long *p_addr = NULL; + unsigned long bitnr; + unsigned int last_page_nr = -1U; + int c = 0; + int changed_total = 0; + + if (e >= b->bm_bits) { + drbd_err(device, "ASSERT FAILED: bit_s=%lu bit_e=%lu bm_bits=%lu\n", + s, e, b->bm_bits); + e = b->bm_bits ? 
b->bm_bits -1 : 0; + } + for (bitnr = s; bitnr <= e; bitnr++) { + unsigned int page_nr = bm_bit_to_page_idx(b, bitnr); + if (page_nr != last_page_nr) { + if (p_addr) + __bm_unmap(p_addr); + if (c < 0) + bm_set_page_lazy_writeout(b->bm_pages[last_page_nr]); + else if (c > 0) + bm_set_page_need_writeout(b->bm_pages[last_page_nr]); + changed_total += c; + c = 0; + p_addr = __bm_map_pidx(b, page_nr); + last_page_nr = page_nr; + } + if (val) + c += (0 == __test_and_set_bit_le(bitnr & BITS_PER_PAGE_MASK, p_addr)); + else + c -= (0 != __test_and_clear_bit_le(bitnr & BITS_PER_PAGE_MASK, p_addr)); + } + if (p_addr) + __bm_unmap(p_addr); + if (c < 0) + bm_set_page_lazy_writeout(b->bm_pages[last_page_nr]); + else if (c > 0) + bm_set_page_need_writeout(b->bm_pages[last_page_nr]); + changed_total += c; + b->bm_set += changed_total; + return changed_total; +} + +/* returns number of bits actually changed. + * for val != 0, we change 0 -> 1, return code positive + * for val == 0, we change 1 -> 0, return code negative + * wants bitnr, not sector */ +static int bm_change_bits_to(struct drbd_device *device, const unsigned long s, + const unsigned long e, int val) +{ + unsigned long flags; + struct drbd_bitmap *b = device->bitmap; + int c = 0; + + if (!expect(b)) + return 1; + if (!expect(b->bm_pages)) + return 0; + + spin_lock_irqsave(&b->bm_lock, flags); + if ((val ? BM_DONT_SET : BM_DONT_CLEAR) & b->bm_flags) + bm_print_lock_info(device); + + c = __bm_change_bits_to(device, s, e, val); + + spin_unlock_irqrestore(&b->bm_lock, flags); + return c; +} + +/* returns number of bits changed 0 -> 1 */ +int drbd_bm_set_bits(struct drbd_device *device, const unsigned long s, const unsigned long e) +{ + return bm_change_bits_to(device, s, e, 1); +} + +/* returns number of bits changed 1 -> 0 */ +int drbd_bm_clear_bits(struct drbd_device *device, const unsigned long s, const unsigned long e) +{ + return -bm_change_bits_to(device, s, e, 0); +} + +/* sets all bits in full words, + * from first_word up to, but not including, last_word */ +static inline void bm_set_full_words_within_one_page(struct drbd_bitmap *b, + int page_nr, int first_word, int last_word) +{ + int i; + int bits; + int changed = 0; + unsigned long *paddr = kmap_atomic(b->bm_pages[page_nr]); + for (i = first_word; i < last_word; i++) { + bits = hweight_long(paddr[i]); + paddr[i] = ~0UL; + changed += BITS_PER_LONG - bits; + } + kunmap_atomic(paddr); + if (changed) { + /* We only need lazy writeout, the information is still in the + * remote bitmap as well, and is reconstructed during the next + * bitmap exchange, if lost locally due to a crash. */ + bm_set_page_lazy_writeout(b->bm_pages[page_nr]); + b->bm_set += changed; + } +} + +/* Same thing as drbd_bm_set_bits, + * but more efficient for a large bit range. + * You must first drbd_bm_lock(). + * Can be called to set the whole bitmap in one go. + * Sets bits from s to e _inclusive_. */ +void _drbd_bm_set_bits(struct drbd_device *device, const unsigned long s, const unsigned long e) +{ + /* First set_bit from the first bit (s) + * up to the next long boundary (sl), + * then assign full words up to the last long boundary (el), + * then set_bit up to and including the last bit (e). + * + * Do not use memset, because we must account for changes, + * so we need to loop over the words with hweight() anyways. 
+ */ + struct drbd_bitmap *b = device->bitmap; + unsigned long sl = ALIGN(s,BITS_PER_LONG); + unsigned long el = (e+1) & ~((unsigned long)BITS_PER_LONG-1); + int first_page; + int last_page; + int page_nr; + int first_word; + int last_word; + + if (e - s <= 3*BITS_PER_LONG) { + /* don't bother; el and sl may even be wrong. */ + spin_lock_irq(&b->bm_lock); + __bm_change_bits_to(device, s, e, 1); + spin_unlock_irq(&b->bm_lock); + return; + } + + /* difference is large enough that we can trust sl and el */ + + spin_lock_irq(&b->bm_lock); + + /* bits filling the current long */ + if (sl) + __bm_change_bits_to(device, s, sl-1, 1); + + first_page = sl >> (3 + PAGE_SHIFT); + last_page = el >> (3 + PAGE_SHIFT); + + /* MLPP: modulo longs per page */ + /* LWPP: long words per page */ + first_word = MLPP(sl >> LN2_BPL); + last_word = LWPP; + + /* first and full pages, unless first page == last page */ + for (page_nr = first_page; page_nr < last_page; page_nr++) { + bm_set_full_words_within_one_page(device->bitmap, page_nr, first_word, last_word); + spin_unlock_irq(&b->bm_lock); + cond_resched(); + first_word = 0; + spin_lock_irq(&b->bm_lock); + } + /* last page (respectively only page, for first page == last page) */ + last_word = MLPP(el >> LN2_BPL); + + /* consider bitmap->bm_bits = 32768, bitmap->bm_number_of_pages = 1. (or multiples). + * ==> e = 32767, el = 32768, last_page = 2, + * and now last_word = 0. + * We do not want to touch last_page in this case, + * as we did not allocate it, it is not present in bitmap->bm_pages. + */ + if (last_word) + bm_set_full_words_within_one_page(device->bitmap, last_page, first_word, last_word); + + /* possibly trailing bits. + * example: (e & 63) == 63, el will be e+1. + * if that even was the very last bit, + * it would trigger an assert in __bm_change_bits_to() + */ + if (el <= e) + __bm_change_bits_to(device, el, e, 1); + spin_unlock_irq(&b->bm_lock); +} + +/* returns bit state + * wants bitnr, NOT sector. + * inherently racy... area needs to be locked by means of {al,rs}_lru + * 1 ... bit set + * 0 ... bit not set + * -1 ... first out of bounds access, stop testing for bits! + */ +int drbd_bm_test_bit(struct drbd_device *device, const unsigned long bitnr) +{ + unsigned long flags; + struct drbd_bitmap *b = device->bitmap; + unsigned long *p_addr; + int i; + + if (!expect(b)) + return 0; + if (!expect(b->bm_pages)) + return 0; + + spin_lock_irqsave(&b->bm_lock, flags); + if (BM_DONT_TEST & b->bm_flags) + bm_print_lock_info(device); + if (bitnr < b->bm_bits) { + p_addr = bm_map_pidx(b, bm_bit_to_page_idx(b, bitnr)); + i = test_bit_le(bitnr & BITS_PER_PAGE_MASK, p_addr) ? 1 : 0; + bm_unmap(p_addr); + } else if (bitnr == b->bm_bits) { + i = -1; + } else { /* (bitnr > b->bm_bits) */ + drbd_err(device, "bitnr=%lu > bm_bits=%lu\n", bitnr, b->bm_bits); + i = 0; + } + + spin_unlock_irqrestore(&b->bm_lock, flags); + return i; +} + +/* returns number of bits set in the range [s, e] */ +int drbd_bm_count_bits(struct drbd_device *device, const unsigned long s, const unsigned long e) +{ + unsigned long flags; + struct drbd_bitmap *b = device->bitmap; + unsigned long *p_addr = NULL; + unsigned long bitnr; + unsigned int page_nr = -1U; + int c = 0; + + /* If this is called without a bitmap, that is a bug. 
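The fast path of _drbd_bm_set_bits() splits [s, e] into a leading partial word up to sl, full long words up to el, and a trailing partial word; sl and el are the enclosing long-word boundaries. A user-space check of that split for one concrete range (64-bit longs assumed):

#include <stdio.h>

#define BITS_PER_LONG 64UL               /* assumed */

int main(void)
{
	unsigned long s = 70, e = 300;   /* bits 70..300 inclusive */
	unsigned long sl = (s + BITS_PER_LONG - 1) & ~(BITS_PER_LONG - 1);
	unsigned long el = (e + 1) & ~(BITS_PER_LONG - 1);

	/* bits 70..127 are set individually, the words covering 128..255
	 * are assigned whole, bits 256..300 are set individually again */
	printf("head: bits %lu..%lu\n", s, sl - 1);
	printf("full words: bits %lu..%lu\n", sl, el - 1);
	printf("tail: bits %lu..%lu\n", el, e);
	return 0;
}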
But just to be + * robust in case we screwed up elsewhere, in that case pretend there + * was one dirty bit in the requested area, so we won't try to do a + * local read there (no bitmap probably implies no disk) */ + if (!expect(b)) + return 1; + if (!expect(b->bm_pages)) + return 1; + + spin_lock_irqsave(&b->bm_lock, flags); + if (BM_DONT_TEST & b->bm_flags) + bm_print_lock_info(device); + for (bitnr = s; bitnr <= e; bitnr++) { + unsigned int idx = bm_bit_to_page_idx(b, bitnr); + if (page_nr != idx) { + page_nr = idx; + if (p_addr) + bm_unmap(p_addr); + p_addr = bm_map_pidx(b, idx); + } + if (expect(bitnr < b->bm_bits)) + c += (0 != test_bit_le(bitnr - (page_nr << (PAGE_SHIFT+3)), p_addr)); + else + drbd_err(device, "bitnr=%lu bm_bits=%lu\n", bitnr, b->bm_bits); + } + if (p_addr) + bm_unmap(p_addr); + spin_unlock_irqrestore(&b->bm_lock, flags); + return c; +} + + +/* inherently racy... + * return value may be already out-of-date when this function returns. + * but the general usage is that this is only use during a cstate when bits are + * only cleared, not set, and typically only care for the case when the return + * value is zero, or we already "locked" this "bitmap extent" by other means. + * + * enr is bm-extent number, since we chose to name one sector (512 bytes) + * worth of the bitmap a "bitmap extent". + * + * TODO + * I think since we use it like a reference count, we should use the real + * reference count of some bitmap extent element from some lru instead... + * + */ +int drbd_bm_e_weight(struct drbd_device *device, unsigned long enr) +{ + struct drbd_bitmap *b = device->bitmap; + int count, s, e; + unsigned long flags; + unsigned long *p_addr, *bm; + + if (!expect(b)) + return 0; + if (!expect(b->bm_pages)) + return 0; + + spin_lock_irqsave(&b->bm_lock, flags); + if (BM_DONT_TEST & b->bm_flags) + bm_print_lock_info(device); + + s = S2W(enr); + e = min((size_t)S2W(enr+1), b->bm_words); + count = 0; + if (s < b->bm_words) { + int n = e-s; + p_addr = bm_map_pidx(b, bm_word_to_page_idx(b, s)); + bm = p_addr + MLPP(s); + while (n--) + count += hweight_long(*bm++); + bm_unmap(p_addr); + } else { + drbd_err(device, "start offset (%d) too large in drbd_bm_e_weight\n", s); + } + spin_unlock_irqrestore(&b->bm_lock, flags); + return count; +} diff --git a/drivers/block/drbd/drbd_debugfs.c b/drivers/block/drbd/drbd_debugfs.c new file mode 100644 index 000000000..a6ee3d750 --- /dev/null +++ b/drivers/block/drbd/drbd_debugfs.c @@ -0,0 +1,958 @@ +#define pr_fmt(fmt) "drbd debugfs: " fmt +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/debugfs.h> +#include <linux/seq_file.h> +#include <linux/stat.h> +#include <linux/jiffies.h> +#include <linux/list.h> + +#include "drbd_int.h" +#include "drbd_req.h" +#include "drbd_debugfs.h" + + +/********************************************************************** + * Whenever you change the file format, remember to bump the version. 
* + **********************************************************************/ + +static struct dentry *drbd_debugfs_root; +static struct dentry *drbd_debugfs_version; +static struct dentry *drbd_debugfs_resources; +static struct dentry *drbd_debugfs_minors; + +static void seq_print_age_or_dash(struct seq_file *m, bool valid, unsigned long dt) +{ + if (valid) + seq_printf(m, "\t%d", jiffies_to_msecs(dt)); + else + seq_printf(m, "\t-"); +} + +static void __seq_print_rq_state_bit(struct seq_file *m, + bool is_set, char *sep, const char *set_name, const char *unset_name) +{ + if (is_set && set_name) { + seq_putc(m, *sep); + seq_puts(m, set_name); + *sep = '|'; + } else if (!is_set && unset_name) { + seq_putc(m, *sep); + seq_puts(m, unset_name); + *sep = '|'; + } +} + +static void seq_print_rq_state_bit(struct seq_file *m, + bool is_set, char *sep, const char *set_name) +{ + __seq_print_rq_state_bit(m, is_set, sep, set_name, NULL); +} + +/* pretty print enum drbd_req_state_bits req->rq_state */ +static void seq_print_request_state(struct seq_file *m, struct drbd_request *req) +{ + unsigned int s = req->rq_state; + char sep = ' '; + seq_printf(m, "\t0x%08x", s); + seq_printf(m, "\tmaster: %s", req->master_bio ? "pending" : "completed"); + + /* RQ_WRITE ignored, already reported */ + seq_puts(m, "\tlocal:"); + seq_print_rq_state_bit(m, s & RQ_IN_ACT_LOG, &sep, "in-AL"); + seq_print_rq_state_bit(m, s & RQ_POSTPONED, &sep, "postponed"); + seq_print_rq_state_bit(m, s & RQ_COMPLETION_SUSP, &sep, "suspended"); + sep = ' '; + seq_print_rq_state_bit(m, s & RQ_LOCAL_PENDING, &sep, "pending"); + seq_print_rq_state_bit(m, s & RQ_LOCAL_COMPLETED, &sep, "completed"); + seq_print_rq_state_bit(m, s & RQ_LOCAL_ABORTED, &sep, "aborted"); + seq_print_rq_state_bit(m, s & RQ_LOCAL_OK, &sep, "ok"); + if (sep == ' ') + seq_puts(m, " -"); + + /* for_each_connection ... */ + seq_printf(m, "\tnet:"); + sep = ' '; + seq_print_rq_state_bit(m, s & RQ_NET_PENDING, &sep, "pending"); + seq_print_rq_state_bit(m, s & RQ_NET_QUEUED, &sep, "queued"); + seq_print_rq_state_bit(m, s & RQ_NET_SENT, &sep, "sent"); + seq_print_rq_state_bit(m, s & RQ_NET_DONE, &sep, "done"); + seq_print_rq_state_bit(m, s & RQ_NET_SIS, &sep, "sis"); + seq_print_rq_state_bit(m, s & RQ_NET_OK, &sep, "ok"); + if (sep == ' ') + seq_puts(m, " -"); + + seq_printf(m, " :"); + sep = ' '; + seq_print_rq_state_bit(m, s & RQ_EXP_RECEIVE_ACK, &sep, "B"); + seq_print_rq_state_bit(m, s & RQ_EXP_WRITE_ACK, &sep, "C"); + seq_print_rq_state_bit(m, s & RQ_EXP_BARR_ACK, &sep, "barr"); + if (sep == ' ') + seq_puts(m, " -"); + seq_printf(m, "\n"); +} + +static void seq_print_one_request(struct seq_file *m, struct drbd_request *req, unsigned long now) +{ + /* change anything here, fixup header below! */ + unsigned int s = req->rq_state; + +#define RQ_HDR_1 "epoch\tsector\tsize\trw" + seq_printf(m, "0x%x\t%llu\t%u\t%s", + req->epoch, + (unsigned long long)req->i.sector, req->i.size >> 9, + (s & RQ_WRITE) ? 
"W" : "R"); + +#define RQ_HDR_2 "\tstart\tin AL\tsubmit" + seq_printf(m, "\t%d", jiffies_to_msecs(now - req->start_jif)); + seq_print_age_or_dash(m, s & RQ_IN_ACT_LOG, now - req->in_actlog_jif); + seq_print_age_or_dash(m, s & RQ_LOCAL_PENDING, now - req->pre_submit_jif); + +#define RQ_HDR_3 "\tsent\tacked\tdone" + seq_print_age_or_dash(m, s & RQ_NET_SENT, now - req->pre_send_jif); + seq_print_age_or_dash(m, (s & RQ_NET_SENT) && !(s & RQ_NET_PENDING), now - req->acked_jif); + seq_print_age_or_dash(m, s & RQ_NET_DONE, now - req->net_done_jif); + +#define RQ_HDR_4 "\tstate\n" + seq_print_request_state(m, req); +} +#define RQ_HDR RQ_HDR_1 RQ_HDR_2 RQ_HDR_3 RQ_HDR_4 + +static void seq_print_minor_vnr_req(struct seq_file *m, struct drbd_request *req, unsigned long now) +{ + seq_printf(m, "%u\t%u\t", req->device->minor, req->device->vnr); + seq_print_one_request(m, req, now); +} + +static void seq_print_resource_pending_meta_io(struct seq_file *m, struct drbd_resource *resource, unsigned long now) +{ + struct drbd_device *device; + unsigned int i; + + seq_puts(m, "minor\tvnr\tstart\tsubmit\tintent\n"); + rcu_read_lock(); + idr_for_each_entry(&resource->devices, device, i) { + struct drbd_md_io tmp; + /* In theory this is racy, + * in the sense that there could have been a + * drbd_md_put_buffer(); drbd_md_get_buffer(); + * between accessing these members here. */ + tmp = device->md_io; + if (atomic_read(&tmp.in_use)) { + seq_printf(m, "%u\t%u\t%d\t", + device->minor, device->vnr, + jiffies_to_msecs(now - tmp.start_jif)); + if (time_before(tmp.submit_jif, tmp.start_jif)) + seq_puts(m, "-\t"); + else + seq_printf(m, "%d\t", jiffies_to_msecs(now - tmp.submit_jif)); + seq_printf(m, "%s\n", tmp.current_use); + } + } + rcu_read_unlock(); +} + +static void seq_print_waiting_for_AL(struct seq_file *m, struct drbd_resource *resource, unsigned long now) +{ + struct drbd_device *device; + unsigned int i; + + seq_puts(m, "minor\tvnr\tage\t#waiting\n"); + rcu_read_lock(); + idr_for_each_entry(&resource->devices, device, i) { + unsigned long jif; + struct drbd_request *req; + int n = atomic_read(&device->ap_actlog_cnt); + if (n) { + spin_lock_irq(&device->resource->req_lock); + req = list_first_entry_or_null(&device->pending_master_completion[1], + struct drbd_request, req_pending_master_completion); + /* if the oldest request does not wait for the activity log + * it is not interesting for us here */ + if (req && !(req->rq_state & RQ_IN_ACT_LOG)) + jif = req->start_jif; + else + req = NULL; + spin_unlock_irq(&device->resource->req_lock); + } + if (n) { + seq_printf(m, "%u\t%u\t", device->minor, device->vnr); + if (req) + seq_printf(m, "%u\t", jiffies_to_msecs(now - jif)); + else + seq_puts(m, "-\t"); + seq_printf(m, "%u\n", n); + } + } + rcu_read_unlock(); +} + +static void seq_print_device_bitmap_io(struct seq_file *m, struct drbd_device *device, unsigned long now) +{ + struct drbd_bm_aio_ctx *ctx; + unsigned long start_jif; + unsigned int in_flight; + unsigned int flags; + spin_lock_irq(&device->resource->req_lock); + ctx = list_first_entry_or_null(&device->pending_bitmap_io, struct drbd_bm_aio_ctx, list); + if (ctx && ctx->done) + ctx = NULL; + if (ctx) { + start_jif = ctx->start_jif; + in_flight = atomic_read(&ctx->in_flight); + flags = ctx->flags; + } + spin_unlock_irq(&device->resource->req_lock); + if (ctx) { + seq_printf(m, "%u\t%u\t%c\t%u\t%u\n", + device->minor, device->vnr, + (flags & BM_AIO_READ) ? 
'R' : 'W', + jiffies_to_msecs(now - start_jif), + in_flight); + } +} + +static void seq_print_resource_pending_bitmap_io(struct seq_file *m, struct drbd_resource *resource, unsigned long now) +{ + struct drbd_device *device; + unsigned int i; + + seq_puts(m, "minor\tvnr\trw\tage\t#in-flight\n"); + rcu_read_lock(); + idr_for_each_entry(&resource->devices, device, i) { + seq_print_device_bitmap_io(m, device, now); + } + rcu_read_unlock(); +} + +/* pretty print enum peer_req->flags */ +static void seq_print_peer_request_flags(struct seq_file *m, struct drbd_peer_request *peer_req) +{ + unsigned long f = peer_req->flags; + char sep = ' '; + + __seq_print_rq_state_bit(m, f & EE_SUBMITTED, &sep, "submitted", "preparing"); + __seq_print_rq_state_bit(m, f & EE_APPLICATION, &sep, "application", "internal"); + seq_print_rq_state_bit(m, f & EE_CALL_AL_COMPLETE_IO, &sep, "in-AL"); + seq_print_rq_state_bit(m, f & EE_SEND_WRITE_ACK, &sep, "C"); + seq_print_rq_state_bit(m, f & EE_MAY_SET_IN_SYNC, &sep, "set-in-sync"); + + if (f & EE_IS_TRIM) { + seq_putc(m, sep); + sep = '|'; + if (f & EE_IS_TRIM_USE_ZEROOUT) + seq_puts(m, "zero-out"); + else + seq_puts(m, "trim"); + } + seq_putc(m, '\n'); +} + +static void seq_print_peer_request(struct seq_file *m, + struct drbd_device *device, struct list_head *lh, + unsigned long now) +{ + bool reported_preparing = false; + struct drbd_peer_request *peer_req; + list_for_each_entry(peer_req, lh, w.list) { + if (reported_preparing && !(peer_req->flags & EE_SUBMITTED)) + continue; + + if (device) + seq_printf(m, "%u\t%u\t", device->minor, device->vnr); + + seq_printf(m, "%llu\t%u\t%c\t%u\t", + (unsigned long long)peer_req->i.sector, peer_req->i.size >> 9, + (peer_req->flags & EE_WRITE) ? 'W' : 'R', + jiffies_to_msecs(now - peer_req->submit_jif)); + seq_print_peer_request_flags(m, peer_req); + if (peer_req->flags & EE_SUBMITTED) + break; + else + reported_preparing = true; + } +} + +static void seq_print_device_peer_requests(struct seq_file *m, + struct drbd_device *device, unsigned long now) +{ + seq_puts(m, "minor\tvnr\tsector\tsize\trw\tage\tflags\n"); + spin_lock_irq(&device->resource->req_lock); + seq_print_peer_request(m, device, &device->active_ee, now); + seq_print_peer_request(m, device, &device->read_ee, now); + seq_print_peer_request(m, device, &device->sync_ee, now); + spin_unlock_irq(&device->resource->req_lock); + if (test_bit(FLUSH_PENDING, &device->flags)) { + seq_printf(m, "%u\t%u\t-\t-\tF\t%u\tflush\n", + device->minor, device->vnr, + jiffies_to_msecs(now - device->flush_jif)); + } +} + +static void seq_print_resource_pending_peer_requests(struct seq_file *m, + struct drbd_resource *resource, unsigned long now) +{ + struct drbd_device *device; + unsigned int i; + + rcu_read_lock(); + idr_for_each_entry(&resource->devices, device, i) { + seq_print_device_peer_requests(m, device, now); + } + rcu_read_unlock(); +} + +static void seq_print_resource_transfer_log_summary(struct seq_file *m, + struct drbd_resource *resource, + struct drbd_connection *connection, + unsigned long now) +{ + struct drbd_request *req; + unsigned int count = 0; + unsigned int show_state = 0; + + seq_puts(m, "n\tdevice\tvnr\t" RQ_HDR); + spin_lock_irq(&resource->req_lock); + list_for_each_entry(req, &connection->transfer_log, tl_requests) { + unsigned int tmp = 0; + unsigned int s; + ++count; + + /* don't disable irq "forever" */ + if (!(count & 0x1ff)) { + struct drbd_request *req_next; + kref_get(&req->kref); + spin_unlock_irq(&resource->req_lock); + cond_resched(); + 
spin_lock_irq(&resource->req_lock); + req_next = list_next_entry(req, tl_requests); + if (kref_put(&req->kref, drbd_req_destroy)) + req = req_next; + if (&req->tl_requests == &connection->transfer_log) + break; + } + + s = req->rq_state; + + /* This is meant to summarize timing issues, to be able to tell + * local disk problems from network problems. + * Skip requests, if we have shown an even older request with + * similar aspects already. */ + if (req->master_bio == NULL) + tmp |= 1; + if ((s & RQ_LOCAL_MASK) && (s & RQ_LOCAL_PENDING)) + tmp |= 2; + if (s & RQ_NET_MASK) { + if (!(s & RQ_NET_SENT)) + tmp |= 4; + if (s & RQ_NET_PENDING) + tmp |= 8; + if (!(s & RQ_NET_DONE)) + tmp |= 16; + } + if ((tmp & show_state) == tmp) + continue; + show_state |= tmp; + seq_printf(m, "%u\t", count); + seq_print_minor_vnr_req(m, req, now); + if (show_state == 0x1f) + break; + } + spin_unlock_irq(&resource->req_lock); +} + +/* TODO: transfer_log and friends should be moved to resource */ +static int in_flight_summary_show(struct seq_file *m, void *pos) +{ + struct drbd_resource *resource = m->private; + struct drbd_connection *connection; + unsigned long jif = jiffies; + + connection = first_connection(resource); + /* This does not happen, actually. + * But be robust and prepare for future code changes. */ + if (!connection || !kref_get_unless_zero(&connection->kref)) + return -ESTALE; + + /* BUMP me if you change the file format/content/presentation */ + seq_printf(m, "v: %u\n\n", 0); + + seq_puts(m, "oldest bitmap IO\n"); + seq_print_resource_pending_bitmap_io(m, resource, jif); + seq_putc(m, '\n'); + + seq_puts(m, "meta data IO\n"); + seq_print_resource_pending_meta_io(m, resource, jif); + seq_putc(m, '\n'); + + seq_puts(m, "socket buffer stats\n"); + /* for each connection ... once we have more than one */ + rcu_read_lock(); + if (connection->data.socket) { + /* open coded SIOCINQ, the "relevant" part */ + struct tcp_sock *tp = tcp_sk(connection->data.socket->sk); + int answ = tp->rcv_nxt - tp->copied_seq; + seq_printf(m, "unread receive buffer: %u Byte\n", answ); + /* open coded SIOCOUTQ, the "relevant" part */ + answ = tp->write_seq - tp->snd_una; + seq_printf(m, "unacked send buffer: %u Byte\n", answ); + } + rcu_read_unlock(); + seq_putc(m, '\n'); + + seq_puts(m, "oldest peer requests\n"); + seq_print_resource_pending_peer_requests(m, resource, jif); + seq_putc(m, '\n'); + + seq_puts(m, "application requests waiting for activity log\n"); + seq_print_waiting_for_AL(m, resource, jif); + seq_putc(m, '\n'); + + seq_puts(m, "oldest application requests\n"); + seq_print_resource_transfer_log_summary(m, resource, connection, jif); + seq_putc(m, '\n'); + + jif = jiffies - jif; + if (jif) + seq_printf(m, "generated in %d ms\n", jiffies_to_msecs(jif)); + kref_put(&connection->kref, drbd_destroy_connection); + return 0; +} + +/* simple_positive(file->f_path.dentry) respectively debugfs_positive(), + * but neither is "reachable" from here. + * So we have our own inline version of it above. :-( */ +static inline int debugfs_positive(struct dentry *dentry) +{ + return d_really_is_positive(dentry) && !d_unhashed(dentry); +} + +/* make sure at *open* time that the respective object won't go away. */ +static int drbd_single_open(struct file *file, int (*show)(struct seq_file *, void *), + void *data, struct kref *kref, + void (*release)(struct kref *)) +{ + struct dentry *parent; + int ret = -ESTALE; + + /* Are we still linked, + * or has debugfs_remove() already been called? 
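drbd_single_open() pins the object behind a debugfs file for as long as the file is open: kref_get_unless_zero() at open time (under the parent directory's lock in the real code), and the matching kref_put() in each *_release() handler. A minimal sketch of the same idea for a hypothetical refcounted object "struct foo"; the parent-directory locking is left out here:

#include <linux/fs.h>
#include <linux/kref.h>
#include <linux/seq_file.h>

struct foo { struct kref kref; };
static void foo_free(struct kref *kref) { /* free the object here */ }
static int foo_show(struct seq_file *m, void *v) { return 0; }

static int foo_open(struct inode *inode, struct file *file)
{
	struct foo *foo = inode->i_private;
	int ret;

	if (!kref_get_unless_zero(&foo->kref))   /* object already dying? */
		return -ESTALE;
	ret = single_open(file, foo_show, foo);
	if (ret)
		kref_put(&foo->kref, foo_free);   /* undo the pin on failure */
	return ret;
}

static int foo_fops_release(struct inode *inode, struct file *file)
{
	struct foo *foo = inode->i_private;

	kref_put(&foo->kref, foo_free);           /* drop the open-time pin */
	return single_release(inode, file);
}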
*/ + parent = file->f_path.dentry->d_parent; + /* not sure if this can happen: */ + if (!parent || d_really_is_negative(parent)) + goto out; + /* serialize with d_delete() */ + mutex_lock(&d_inode(parent)->i_mutex); + /* Make sure the object is still alive */ + if (debugfs_positive(file->f_path.dentry) + && kref_get_unless_zero(kref)) + ret = 0; + mutex_unlock(&d_inode(parent)->i_mutex); + if (!ret) { + ret = single_open(file, show, data); + if (ret) + kref_put(kref, release); + } +out: + return ret; +} + +static int in_flight_summary_open(struct inode *inode, struct file *file) +{ + struct drbd_resource *resource = inode->i_private; + return drbd_single_open(file, in_flight_summary_show, resource, + &resource->kref, drbd_destroy_resource); +} + +static int in_flight_summary_release(struct inode *inode, struct file *file) +{ + struct drbd_resource *resource = inode->i_private; + kref_put(&resource->kref, drbd_destroy_resource); + return single_release(inode, file); +} + +static const struct file_operations in_flight_summary_fops = { + .owner = THIS_MODULE, + .open = in_flight_summary_open, + .read = seq_read, + .llseek = seq_lseek, + .release = in_flight_summary_release, +}; + +void drbd_debugfs_resource_add(struct drbd_resource *resource) +{ + struct dentry *dentry; + if (!drbd_debugfs_resources) + return; + + dentry = debugfs_create_dir(resource->name, drbd_debugfs_resources); + if (IS_ERR_OR_NULL(dentry)) + goto fail; + resource->debugfs_res = dentry; + + dentry = debugfs_create_dir("volumes", resource->debugfs_res); + if (IS_ERR_OR_NULL(dentry)) + goto fail; + resource->debugfs_res_volumes = dentry; + + dentry = debugfs_create_dir("connections", resource->debugfs_res); + if (IS_ERR_OR_NULL(dentry)) + goto fail; + resource->debugfs_res_connections = dentry; + + dentry = debugfs_create_file("in_flight_summary", S_IRUSR|S_IRGRP, + resource->debugfs_res, resource, + &in_flight_summary_fops); + if (IS_ERR_OR_NULL(dentry)) + goto fail; + resource->debugfs_res_in_flight_summary = dentry; + return; + +fail: + drbd_debugfs_resource_cleanup(resource); + drbd_err(resource, "failed to create debugfs dentry\n"); +} + +static void drbd_debugfs_remove(struct dentry **dp) +{ + debugfs_remove(*dp); + *dp = NULL; +} + +void drbd_debugfs_resource_cleanup(struct drbd_resource *resource) +{ + /* it is ok to call debugfs_remove(NULL) */ + drbd_debugfs_remove(&resource->debugfs_res_in_flight_summary); + drbd_debugfs_remove(&resource->debugfs_res_connections); + drbd_debugfs_remove(&resource->debugfs_res_volumes); + drbd_debugfs_remove(&resource->debugfs_res); +} + +static void seq_print_one_timing_detail(struct seq_file *m, + const struct drbd_thread_timing_details *tdp, + unsigned long now) +{ + struct drbd_thread_timing_details td; + /* No locking... + * use temporary assignment to get at consistent data. */ + do { + td = *tdp; + } while (td.cb_nr != tdp->cb_nr); + if (!td.cb_addr) + return; + seq_printf(m, "%u\t%d\t%s:%u\t%ps\n", + td.cb_nr, + jiffies_to_msecs(now - td.start_jif), + td.caller_fn, td.line, + td.cb_addr); +} + +static void seq_print_timing_details(struct seq_file *m, + const char *title, + unsigned int cb_nr, struct drbd_thread_timing_details *tdp, unsigned long now) +{ + unsigned int start_idx; + unsigned int i; + + seq_printf(m, "%s\n", title); + /* If not much is going on, this will result in natural ordering. + * If it is very busy, we will possibly skip events, or even see wrap + * arounds, which could only be avoided with locking. 
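With a free-running event counter cb_nr and a ring of DRBD_THREAD_DETAILS_HIST slots, slot cb_nr % HIST is the next one to be overwritten, i.e. the oldest entry; printing from there to the end of the array and then wrapping around to the start, as the loop below does, yields oldest-first order. A small user-space model of that dump order (ring size 8 is just an example):

#include <stdio.h>

#define HIST 8                           /* stands in for DRBD_THREAD_DETAILS_HIST */

int main(void)
{
	int ring[HIST];
	unsigned int cb_nr, i, start;

	/* record 11 events into the 8-slot ring; the first 3 get overwritten */
	for (cb_nr = 0; cb_nr < 11; cb_nr++)
		ring[cb_nr % HIST] = cb_nr;

	start = cb_nr % HIST;            /* oldest surviving entry lives here */
	for (i = start; i < HIST; i++)
		printf("%d ", ring[i]);
	for (i = 0; i < start; i++)
		printf("%d ", ring[i]);
	printf("\n");                    /* prints: 3 4 5 6 7 8 9 10 */
	return 0;
}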
+ */ + start_idx = cb_nr % DRBD_THREAD_DETAILS_HIST; + for (i = start_idx; i < DRBD_THREAD_DETAILS_HIST; i++) + seq_print_one_timing_detail(m, tdp+i, now); + for (i = 0; i < start_idx; i++) + seq_print_one_timing_detail(m, tdp+i, now); +} + +static int callback_history_show(struct seq_file *m, void *ignored) +{ + struct drbd_connection *connection = m->private; + unsigned long jif = jiffies; + + /* BUMP me if you change the file format/content/presentation */ + seq_printf(m, "v: %u\n\n", 0); + + seq_puts(m, "n\tage\tcallsite\tfn\n"); + seq_print_timing_details(m, "worker", connection->w_cb_nr, connection->w_timing_details, jif); + seq_print_timing_details(m, "receiver", connection->r_cb_nr, connection->r_timing_details, jif); + return 0; +} + +static int callback_history_open(struct inode *inode, struct file *file) +{ + struct drbd_connection *connection = inode->i_private; + return drbd_single_open(file, callback_history_show, connection, + &connection->kref, drbd_destroy_connection); +} + +static int callback_history_release(struct inode *inode, struct file *file) +{ + struct drbd_connection *connection = inode->i_private; + kref_put(&connection->kref, drbd_destroy_connection); + return single_release(inode, file); +} + +static const struct file_operations connection_callback_history_fops = { + .owner = THIS_MODULE, + .open = callback_history_open, + .read = seq_read, + .llseek = seq_lseek, + .release = callback_history_release, +}; + +static int connection_oldest_requests_show(struct seq_file *m, void *ignored) +{ + struct drbd_connection *connection = m->private; + unsigned long now = jiffies; + struct drbd_request *r1, *r2; + + /* BUMP me if you change the file format/content/presentation */ + seq_printf(m, "v: %u\n\n", 0); + + spin_lock_irq(&connection->resource->req_lock); + r1 = connection->req_next; + if (r1) + seq_print_minor_vnr_req(m, r1, now); + r2 = connection->req_ack_pending; + if (r2 && r2 != r1) { + r1 = r2; + seq_print_minor_vnr_req(m, r1, now); + } + r2 = connection->req_not_net_done; + if (r2 && r2 != r1) + seq_print_minor_vnr_req(m, r2, now); + spin_unlock_irq(&connection->resource->req_lock); + return 0; +} + +static int connection_oldest_requests_open(struct inode *inode, struct file *file) +{ + struct drbd_connection *connection = inode->i_private; + return drbd_single_open(file, connection_oldest_requests_show, connection, + &connection->kref, drbd_destroy_connection); +} + +static int connection_oldest_requests_release(struct inode *inode, struct file *file) +{ + struct drbd_connection *connection = inode->i_private; + kref_put(&connection->kref, drbd_destroy_connection); + return single_release(inode, file); +} + +static const struct file_operations connection_oldest_requests_fops = { + .owner = THIS_MODULE, + .open = connection_oldest_requests_open, + .read = seq_read, + .llseek = seq_lseek, + .release = connection_oldest_requests_release, +}; + +void drbd_debugfs_connection_add(struct drbd_connection *connection) +{ + struct dentry *conns_dir = connection->resource->debugfs_res_connections; + struct dentry *dentry; + if (!conns_dir) + return; + + /* Once we enable mutliple peers, + * these connections will have descriptive names. + * For now, it is just the one connection to the (only) "peer". 
*/ + dentry = debugfs_create_dir("peer", conns_dir); + if (IS_ERR_OR_NULL(dentry)) + goto fail; + connection->debugfs_conn = dentry; + + dentry = debugfs_create_file("callback_history", S_IRUSR|S_IRGRP, + connection->debugfs_conn, connection, + &connection_callback_history_fops); + if (IS_ERR_OR_NULL(dentry)) + goto fail; + connection->debugfs_conn_callback_history = dentry; + + dentry = debugfs_create_file("oldest_requests", S_IRUSR|S_IRGRP, + connection->debugfs_conn, connection, + &connection_oldest_requests_fops); + if (IS_ERR_OR_NULL(dentry)) + goto fail; + connection->debugfs_conn_oldest_requests = dentry; + return; + +fail: + drbd_debugfs_connection_cleanup(connection); + drbd_err(connection, "failed to create debugfs dentry\n"); +} + +void drbd_debugfs_connection_cleanup(struct drbd_connection *connection) +{ + drbd_debugfs_remove(&connection->debugfs_conn_callback_history); + drbd_debugfs_remove(&connection->debugfs_conn_oldest_requests); + drbd_debugfs_remove(&connection->debugfs_conn); +} + +static void resync_dump_detail(struct seq_file *m, struct lc_element *e) +{ + struct bm_extent *bme = lc_entry(e, struct bm_extent, lce); + + seq_printf(m, "%5d %s %s %s", bme->rs_left, + test_bit(BME_NO_WRITES, &bme->flags) ? "NO_WRITES" : "---------", + test_bit(BME_LOCKED, &bme->flags) ? "LOCKED" : "------", + test_bit(BME_PRIORITY, &bme->flags) ? "PRIORITY" : "--------" + ); +} + +static int device_resync_extents_show(struct seq_file *m, void *ignored) +{ + struct drbd_device *device = m->private; + + /* BUMP me if you change the file format/content/presentation */ + seq_printf(m, "v: %u\n\n", 0); + + if (get_ldev_if_state(device, D_FAILED)) { + lc_seq_printf_stats(m, device->resync); + lc_seq_dump_details(m, device->resync, "rs_left flags", resync_dump_detail); + put_ldev(device); + } + return 0; +} + +static int device_act_log_extents_show(struct seq_file *m, void *ignored) +{ + struct drbd_device *device = m->private; + + /* BUMP me if you change the file format/content/presentation */ + seq_printf(m, "v: %u\n\n", 0); + + if (get_ldev_if_state(device, D_FAILED)) { + lc_seq_printf_stats(m, device->act_log); + lc_seq_dump_details(m, device->act_log, "", NULL); + put_ldev(device); + } + return 0; +} + +static int device_oldest_requests_show(struct seq_file *m, void *ignored) +{ + struct drbd_device *device = m->private; + struct drbd_resource *resource = device->resource; + unsigned long now = jiffies; + struct drbd_request *r1, *r2; + int i; + + /* BUMP me if you change the file format/content/presentation */ + seq_printf(m, "v: %u\n\n", 0); + + seq_puts(m, RQ_HDR); + spin_lock_irq(&resource->req_lock); + /* WRITE, then READ */ + for (i = 1; i >= 0; --i) { + r1 = list_first_entry_or_null(&device->pending_master_completion[i], + struct drbd_request, req_pending_master_completion); + r2 = list_first_entry_or_null(&device->pending_completion[i], + struct drbd_request, req_pending_local); + if (r1) + seq_print_one_request(m, r1, now); + if (r2 && r2 != r1) + seq_print_one_request(m, r2, now); + } + spin_unlock_irq(&resource->req_lock); + return 0; +} + +static int device_data_gen_id_show(struct seq_file *m, void *ignored) +{ + struct drbd_device *device = m->private; + struct drbd_md *md; + enum drbd_uuid_index idx; + + if (!get_ldev_if_state(device, D_FAILED)) + return -ENODEV; + + md = &device->ldev->md; + spin_lock_irq(&md->uuid_lock); + for (idx = UI_CURRENT; idx <= UI_HISTORY_END; idx++) { + seq_printf(m, "0x%016llX\n", md->uuid[idx]); + } + spin_unlock_irq(&md->uuid_lock); + 
put_ldev(device); + return 0; +} + +#define drbd_debugfs_device_attr(name) \ +static int device_ ## name ## _open(struct inode *inode, struct file *file) \ +{ \ + struct drbd_device *device = inode->i_private; \ + return drbd_single_open(file, device_ ## name ## _show, device, \ + &device->kref, drbd_destroy_device); \ +} \ +static int device_ ## name ## _release(struct inode *inode, struct file *file) \ +{ \ + struct drbd_device *device = inode->i_private; \ + kref_put(&device->kref, drbd_destroy_device); \ + return single_release(inode, file); \ +} \ +static const struct file_operations device_ ## name ## _fops = { \ + .owner = THIS_MODULE, \ + .open = device_ ## name ## _open, \ + .read = seq_read, \ + .llseek = seq_lseek, \ + .release = device_ ## name ## _release, \ +}; + +drbd_debugfs_device_attr(oldest_requests) +drbd_debugfs_device_attr(act_log_extents) +drbd_debugfs_device_attr(resync_extents) +drbd_debugfs_device_attr(data_gen_id) + +void drbd_debugfs_device_add(struct drbd_device *device) +{ + struct dentry *vols_dir = device->resource->debugfs_res_volumes; + char minor_buf[8]; /* MINORMASK, MINORBITS == 20; */ + char vnr_buf[8]; /* volume number vnr is even 16 bit only; */ + char *slink_name = NULL; + + struct dentry *dentry; + if (!vols_dir || !drbd_debugfs_minors) + return; + + snprintf(vnr_buf, sizeof(vnr_buf), "%u", device->vnr); + dentry = debugfs_create_dir(vnr_buf, vols_dir); + if (IS_ERR_OR_NULL(dentry)) + goto fail; + device->debugfs_vol = dentry; + + snprintf(minor_buf, sizeof(minor_buf), "%u", device->minor); + slink_name = kasprintf(GFP_KERNEL, "../resources/%s/volumes/%u", + device->resource->name, device->vnr); + if (!slink_name) + goto fail; + dentry = debugfs_create_symlink(minor_buf, drbd_debugfs_minors, slink_name); + kfree(slink_name); + slink_name = NULL; + if (IS_ERR_OR_NULL(dentry)) + goto fail; + device->debugfs_minor = dentry; + +#define DCF(name) do { \ + dentry = debugfs_create_file(#name, S_IRUSR|S_IRGRP, \ + device->debugfs_vol, device, \ + &device_ ## name ## _fops); \ + if (IS_ERR_OR_NULL(dentry)) \ + goto fail; \ + device->debugfs_vol_ ## name = dentry; \ + } while (0) + + DCF(oldest_requests); + DCF(act_log_extents); + DCF(resync_extents); + DCF(data_gen_id); +#undef DCF + return; + +fail: + drbd_debugfs_device_cleanup(device); + drbd_err(device, "failed to create debugfs entries\n"); +} + +void drbd_debugfs_device_cleanup(struct drbd_device *device) +{ + drbd_debugfs_remove(&device->debugfs_minor); + drbd_debugfs_remove(&device->debugfs_vol_oldest_requests); + drbd_debugfs_remove(&device->debugfs_vol_act_log_extents); + drbd_debugfs_remove(&device->debugfs_vol_resync_extents); + drbd_debugfs_remove(&device->debugfs_vol_data_gen_id); + drbd_debugfs_remove(&device->debugfs_vol); +} + +void drbd_debugfs_peer_device_add(struct drbd_peer_device *peer_device) +{ + struct dentry *conn_dir = peer_device->connection->debugfs_conn; + struct dentry *dentry; + char vnr_buf[8]; + + if (!conn_dir) + return; + + snprintf(vnr_buf, sizeof(vnr_buf), "%u", peer_device->device->vnr); + dentry = debugfs_create_dir(vnr_buf, conn_dir); + if (IS_ERR_OR_NULL(dentry)) + goto fail; + peer_device->debugfs_peer_dev = dentry; + return; + +fail: + drbd_debugfs_peer_device_cleanup(peer_device); + drbd_err(peer_device, "failed to create debugfs entries\n"); +} + +void drbd_debugfs_peer_device_cleanup(struct drbd_peer_device *peer_device) +{ + drbd_debugfs_remove(&peer_device->debugfs_peer_dev); +} + +static int drbd_version_show(struct seq_file *m, void *ignored) +{ + 
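For reference, hand-expanding drbd_debugfs_device_attr(oldest_requests) from the macro above yields exactly the open/release/file_operations triple below; DCF(oldest_requests) then only wires device_oldest_requests_fops to a debugfs file of the same name under the volume directory:

static int device_oldest_requests_open(struct inode *inode, struct file *file)
{
	struct drbd_device *device = inode->i_private;
	return drbd_single_open(file, device_oldest_requests_show, device,
				&device->kref, drbd_destroy_device);
}
static int device_oldest_requests_release(struct inode *inode, struct file *file)
{
	struct drbd_device *device = inode->i_private;
	kref_put(&device->kref, drbd_destroy_device);
	return single_release(inode, file);
}
static const struct file_operations device_oldest_requests_fops = {
	.owner   = THIS_MODULE,
	.open    = device_oldest_requests_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = device_oldest_requests_release,
};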
seq_printf(m, "# %s\n", drbd_buildtag()); + seq_printf(m, "VERSION=%s\n", REL_VERSION); + seq_printf(m, "API_VERSION=%u\n", API_VERSION); + seq_printf(m, "PRO_VERSION_MIN=%u\n", PRO_VERSION_MIN); + seq_printf(m, "PRO_VERSION_MAX=%u\n", PRO_VERSION_MAX); + return 0; +} + +static int drbd_version_open(struct inode *inode, struct file *file) +{ + return single_open(file, drbd_version_show, NULL); +} + +static struct file_operations drbd_version_fops = { + .owner = THIS_MODULE, + .open = drbd_version_open, + .llseek = seq_lseek, + .read = seq_read, + .release = single_release, +}; + +/* not __exit, may be indirectly called + * from the module-load-failure path as well. */ +void drbd_debugfs_cleanup(void) +{ + drbd_debugfs_remove(&drbd_debugfs_resources); + drbd_debugfs_remove(&drbd_debugfs_minors); + drbd_debugfs_remove(&drbd_debugfs_version); + drbd_debugfs_remove(&drbd_debugfs_root); +} + +int __init drbd_debugfs_init(void) +{ + struct dentry *dentry; + + dentry = debugfs_create_dir("drbd", NULL); + if (IS_ERR_OR_NULL(dentry)) + goto fail; + drbd_debugfs_root = dentry; + + dentry = debugfs_create_file("version", 0444, drbd_debugfs_root, NULL, &drbd_version_fops); + if (IS_ERR_OR_NULL(dentry)) + goto fail; + drbd_debugfs_version = dentry; + + dentry = debugfs_create_dir("resources", drbd_debugfs_root); + if (IS_ERR_OR_NULL(dentry)) + goto fail; + drbd_debugfs_resources = dentry; + + dentry = debugfs_create_dir("minors", drbd_debugfs_root); + if (IS_ERR_OR_NULL(dentry)) + goto fail; + drbd_debugfs_minors = dentry; + return 0; + +fail: + drbd_debugfs_cleanup(); + if (dentry) + return PTR_ERR(dentry); + else + return -EINVAL; +} diff --git a/drivers/block/drbd/drbd_debugfs.h b/drivers/block/drbd/drbd_debugfs.h new file mode 100644 index 000000000..8bee21340 --- /dev/null +++ b/drivers/block/drbd/drbd_debugfs.h @@ -0,0 +1,39 @@ +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/debugfs.h> + +#include "drbd_int.h" + +#ifdef CONFIG_DEBUG_FS +int __init drbd_debugfs_init(void); +void drbd_debugfs_cleanup(void); + +void drbd_debugfs_resource_add(struct drbd_resource *resource); +void drbd_debugfs_resource_cleanup(struct drbd_resource *resource); + +void drbd_debugfs_connection_add(struct drbd_connection *connection); +void drbd_debugfs_connection_cleanup(struct drbd_connection *connection); + +void drbd_debugfs_device_add(struct drbd_device *device); +void drbd_debugfs_device_cleanup(struct drbd_device *device); + +void drbd_debugfs_peer_device_add(struct drbd_peer_device *peer_device); +void drbd_debugfs_peer_device_cleanup(struct drbd_peer_device *peer_device); +#else + +static inline int __init drbd_debugfs_init(void) { return -ENODEV; } +static inline void drbd_debugfs_cleanup(void) { } + +static inline void drbd_debugfs_resource_add(struct drbd_resource *resource) { } +static inline void drbd_debugfs_resource_cleanup(struct drbd_resource *resource) { } + +static inline void drbd_debugfs_connection_add(struct drbd_connection *connection) { } +static inline void drbd_debugfs_connection_cleanup(struct drbd_connection *connection) { } + +static inline void drbd_debugfs_device_add(struct drbd_device *device) { } +static inline void drbd_debugfs_device_cleanup(struct drbd_device *device) { } + +static inline void drbd_debugfs_peer_device_add(struct drbd_peer_device *peer_device) { } +static inline void drbd_debugfs_peer_device_cleanup(struct drbd_peer_device *peer_device) { } + +#endif diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h new file mode 100644 
index 000000000..b905e9888 --- /dev/null +++ b/drivers/block/drbd/drbd_int.h @@ -0,0 +1,2328 @@ +/* + drbd_int.h + + This file is part of DRBD by Philipp Reisner and Lars Ellenberg. + + Copyright (C) 2001-2008, LINBIT Information Technologies GmbH. + Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>. + Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. + + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + +*/ + +#ifndef _DRBD_INT_H +#define _DRBD_INT_H + +#include <linux/compiler.h> +#include <linux/types.h> +#include <linux/list.h> +#include <linux/sched.h> +#include <linux/bitops.h> +#include <linux/slab.h> +#include <linux/crypto.h> +#include <linux/ratelimit.h> +#include <linux/tcp.h> +#include <linux/mutex.h> +#include <linux/major.h> +#include <linux/blkdev.h> +#include <linux/genhd.h> +#include <linux/idr.h> +#include <net/tcp.h> +#include <linux/lru_cache.h> +#include <linux/prefetch.h> +#include <linux/drbd_genl_api.h> +#include <linux/drbd.h> +#include "drbd_strings.h" +#include "drbd_state.h" +#include "drbd_protocol.h" + +#ifdef __CHECKER__ +# define __protected_by(x) __attribute__((require_context(x,1,999,"rdwr"))) +# define __protected_read_by(x) __attribute__((require_context(x,1,999,"read"))) +# define __protected_write_by(x) __attribute__((require_context(x,1,999,"write"))) +# define __must_hold(x) __attribute__((context(x,1,1), require_context(x,1,999,"call"))) +#else +# define __protected_by(x) +# define __protected_read_by(x) +# define __protected_write_by(x) +# define __must_hold(x) +#endif + +/* module parameter, defined in drbd_main.c */ +extern unsigned int minor_count; +extern bool disable_sendpage; +extern bool allow_oos; +void tl_abort_disk_io(struct drbd_device *device); + +#ifdef CONFIG_DRBD_FAULT_INJECTION +extern int enable_faults; +extern int fault_rate; +extern int fault_devs; +#endif + +extern char usermode_helper[]; + + +/* I don't remember why XCPU ... + * This is used to wake the asender, + * and to interrupt sending the sending task + * on disconnect. + */ +#define DRBD_SIG SIGXCPU + +/* This is used to stop/restart our threads. + * Cannot use SIGTERM nor SIGKILL, since these + * are sent out by init on runlevel changes + * I choose SIGHUP for now. + */ +#define DRBD_SIGKILL SIGHUP + +#define ID_IN_SYNC (4711ULL) +#define ID_OUT_OF_SYNC (4712ULL) +#define ID_SYNCER (-1ULL) + +#define UUID_NEW_BM_OFFSET ((u64)0x0001000000000000ULL) + +struct drbd_device; +struct drbd_connection; + +#define __drbd_printk_device(level, device, fmt, args...) \ + dev_printk(level, disk_to_dev((device)->vdisk), fmt, ## args) +#define __drbd_printk_peer_device(level, peer_device, fmt, args...) \ + dev_printk(level, disk_to_dev((peer_device)->device->vdisk), fmt, ## args) +#define __drbd_printk_resource(level, resource, fmt, args...) 
\ + printk(level "drbd %s: " fmt, (resource)->name, ## args) +#define __drbd_printk_connection(level, connection, fmt, args...) \ + printk(level "drbd %s: " fmt, (connection)->resource->name, ## args) + +void drbd_printk_with_wrong_object_type(void); + +#define __drbd_printk_if_same_type(obj, type, func, level, fmt, args...) \ + (__builtin_types_compatible_p(typeof(obj), type) || \ + __builtin_types_compatible_p(typeof(obj), const type)), \ + func(level, (const type)(obj), fmt, ## args) + +#define drbd_printk(level, obj, fmt, args...) \ + __builtin_choose_expr( \ + __drbd_printk_if_same_type(obj, struct drbd_device *, \ + __drbd_printk_device, level, fmt, ## args), \ + __builtin_choose_expr( \ + __drbd_printk_if_same_type(obj, struct drbd_resource *, \ + __drbd_printk_resource, level, fmt, ## args), \ + __builtin_choose_expr( \ + __drbd_printk_if_same_type(obj, struct drbd_connection *, \ + __drbd_printk_connection, level, fmt, ## args), \ + __builtin_choose_expr( \ + __drbd_printk_if_same_type(obj, struct drbd_peer_device *, \ + __drbd_printk_peer_device, level, fmt, ## args), \ + drbd_printk_with_wrong_object_type())))) + +#define drbd_dbg(obj, fmt, args...) \ + drbd_printk(KERN_DEBUG, obj, fmt, ## args) +#define drbd_alert(obj, fmt, args...) \ + drbd_printk(KERN_ALERT, obj, fmt, ## args) +#define drbd_err(obj, fmt, args...) \ + drbd_printk(KERN_ERR, obj, fmt, ## args) +#define drbd_warn(obj, fmt, args...) \ + drbd_printk(KERN_WARNING, obj, fmt, ## args) +#define drbd_info(obj, fmt, args...) \ + drbd_printk(KERN_INFO, obj, fmt, ## args) +#define drbd_emerg(obj, fmt, args...) \ + drbd_printk(KERN_EMERG, obj, fmt, ## args) + +#define dynamic_drbd_dbg(device, fmt, args...) \ + dynamic_dev_dbg(disk_to_dev(device->vdisk), fmt, ## args) + +#define D_ASSERT(device, exp) do { \ + if (!(exp)) \ + drbd_err(device, "ASSERT( " #exp " ) in %s:%d\n", __FILE__, __LINE__); \ + } while (0) + +/** + * expect - Make an assertion + * + * Unlike the assert macro, this macro returns a boolean result. + */ +#define expect(exp) ({ \ + bool _bool = (exp); \ + if (!_bool) \ + drbd_err(device, "ASSERTION %s FAILED in %s\n", \ + #exp, __func__); \ + _bool; \ + }) + +/* Defines to control fault insertion */ +enum { + DRBD_FAULT_MD_WR = 0, /* meta data write */ + DRBD_FAULT_MD_RD = 1, /* read */ + DRBD_FAULT_RS_WR = 2, /* resync */ + DRBD_FAULT_RS_RD = 3, + DRBD_FAULT_DT_WR = 4, /* data */ + DRBD_FAULT_DT_RD = 5, + DRBD_FAULT_DT_RA = 6, /* data read ahead */ + DRBD_FAULT_BM_ALLOC = 7, /* bitmap allocation */ + DRBD_FAULT_AL_EE = 8, /* alloc ee */ + DRBD_FAULT_RECEIVE = 9, /* Changes some bytes upon receiving a [rs]data block */ + + DRBD_FAULT_MAX, +}; + +extern unsigned int +_drbd_insert_fault(struct drbd_device *device, unsigned int type); + +static inline int +drbd_insert_fault(struct drbd_device *device, unsigned int type) { +#ifdef CONFIG_DRBD_FAULT_INJECTION + return fault_rate && + (enable_faults & (1<<type)) && + _drbd_insert_fault(device, type); +#else + return 0; +#endif +} + +/* integer division, round _UP_ to the next integer */ +#define div_ceil(A, B) ((A)/(B) + ((A)%(B) ? 
1 : 0)) +/* usual integer division */ +#define div_floor(A, B) ((A)/(B)) + +extern struct ratelimit_state drbd_ratelimit_state; +extern struct idr drbd_devices; /* RCU, updates: genl_lock() */ +extern struct list_head drbd_resources; /* RCU, updates: genl_lock() */ + +extern const char *cmdname(enum drbd_packet cmd); + +/* for sending/receiving the bitmap, + * possibly in some encoding scheme */ +struct bm_xfer_ctx { + /* "const" + * stores total bits and long words + * of the bitmap, so we don't need to + * call the accessor functions over and again. */ + unsigned long bm_bits; + unsigned long bm_words; + /* during xfer, current position within the bitmap */ + unsigned long bit_offset; + unsigned long word_offset; + + /* statistics; index: (h->command == P_BITMAP) */ + unsigned packets[2]; + unsigned bytes[2]; +}; + +extern void INFO_bm_xfer_stats(struct drbd_device *device, + const char *direction, struct bm_xfer_ctx *c); + +static inline void bm_xfer_ctx_bit_to_word_offset(struct bm_xfer_ctx *c) +{ + /* word_offset counts "native long words" (32 or 64 bit), + * aligned at 64 bit. + * Encoded packet may end at an unaligned bit offset. + * In case a fallback clear text packet is transmitted in + * between, we adjust this offset back to the last 64bit + * aligned "native long word", which makes coding and decoding + * the plain text bitmap much more convenient. */ +#if BITS_PER_LONG == 64 + c->word_offset = c->bit_offset >> 6; +#elif BITS_PER_LONG == 32 + c->word_offset = c->bit_offset >> 5; + c->word_offset &= ~(1UL); +#else +# error "unsupported BITS_PER_LONG" +#endif +} + +extern unsigned int drbd_header_size(struct drbd_connection *connection); + +/**********************************************************************/ +enum drbd_thread_state { + NONE, + RUNNING, + EXITING, + RESTARTING +}; + +struct drbd_thread { + spinlock_t t_lock; + struct task_struct *task; + struct completion stop; + enum drbd_thread_state t_state; + int (*function) (struct drbd_thread *); + struct drbd_resource *resource; + struct drbd_connection *connection; + int reset_cpu_mask; + const char *name; +}; + +static inline enum drbd_thread_state get_t_state(struct drbd_thread *thi) +{ + /* THINK testing the t_state seems to be uncritical in all cases + * (but thread_{start,stop}), so we can read it *without* the lock. + * --lge */ + + smp_rmb(); + return thi->t_state; +} + +struct drbd_work { + struct list_head list; + int (*cb)(struct drbd_work *, int cancel); +}; + +struct drbd_device_work { + struct drbd_work w; + struct drbd_device *device; +}; + +#include "drbd_interval.h" + +extern int drbd_wait_misc(struct drbd_device *, struct drbd_interval *); + +struct drbd_request { + struct drbd_work w; + struct drbd_device *device; + + /* if local IO is not allowed, will be NULL. + * if local IO _is_ allowed, holds the locally submitted bio clone, + * or, after local IO completion, the ERR_PTR(error). + * see drbd_request_endio(). */ + struct bio *private_bio; + + struct drbd_interval i; + + /* epoch: used to check on "completion" whether this req was in + * the current epoch, and we therefore have to close it, + * causing a p_barrier packet to be send, starting a new epoch. + * + * This corresponds to "barrier" in struct p_barrier[_ack], + * and to "barrier_nr" in struct drbd_epoch (and various + * comments/function parameters/local variable names). 
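+	 *
+	 * Rough, illustrative example: while connection->current_tle_nr is 7,
+	 * each new write request records epoch == 7; once that epoch is
+	 * closed, a P_BARRIER with barrier_nr 7 is sent, and the matching
+	 * P_BARRIER_ACK lets tl_release() retire those requests from the
+	 * transfer log.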
+	 */
+	unsigned int epoch;
+
+	struct list_head tl_requests; /* ring list in the transfer log */
+	struct bio *master_bio;       /* master bio pointer */
+
+	/* see struct drbd_device */
+	struct list_head req_pending_master_completion;
+	struct list_head req_pending_local;
+
+	/* for generic IO accounting */
+	unsigned long start_jif;
+
+	/* for DRBD internal statistics */
+
+	/* Minimal set of time stamps to determine if we wait for activity log
+	 * transactions, local disk or peer.  32 bit "jiffies" are good enough,
+	 * we don't expect a DRBD request to be stalled for several months.
+	 */
+
+	/* before actual request processing */
+	unsigned long in_actlog_jif;
+
+	/* local disk */
+	unsigned long pre_submit_jif;
+
+	/* per connection */
+	unsigned long pre_send_jif;
+	unsigned long acked_jif;
+	unsigned long net_done_jif;
+
+	/* Possibly even more detail to track each phase:
+	 *  master_completion_jif
+	 *      how long did it take to complete the master bio
+	 *      (application visible latency)
+	 *  allocated_jif
+	 *      how long the master bio was blocked until we finally allocated
+	 *      a tracking struct
+	 *  in_actlog_jif
+	 *      how long did we wait for activity log transactions
+	 *
+	 *  net_queued_jif
+	 *      when did we finally queue it for sending
+	 *  pre_send_jif
+	 *      when did we start sending it
+	 *  post_send_jif
+	 *      how long did we block in the network stack trying to send it
+	 *  acked_jif
+	 *      when did we receive (or fake, in protocol A) a remote ACK
+	 *  net_done_jif
+	 *      when did we receive final acknowledgement (P_BARRIER_ACK),
+	 *      or decide, e.g. on connection loss, that we no longer expect
+	 *      anything from this peer for this request.
+	 *
+	 *  pre_submit_jif
+	 *  post_sub_jif
+	 *      when did we start submitting to the lower level device,
+	 *      and how long did we block in that submit function
+	 *  local_completion_jif
+	 *      how long did it take the lower level device to complete this request
+	 */
+
+
+	/* once it hits 0, we may complete the master_bio */
+	atomic_t completion_ref;
+	/* once it hits 0, we may destroy this drbd_request object */
+	struct kref kref;
+
+	unsigned rq_state; /* see comments above _req_mod() */
+};
+
+struct drbd_epoch {
+	struct drbd_connection *connection;
+	struct list_head list;
+	unsigned int barrier_nr;
+	atomic_t epoch_size; /* increased on every request added. */
+	atomic_t active;     /* increased on every req. added, and dec on every finished. */
+	unsigned long flags;
+};
+
+/* Prototype declaration of functions defined in drbd_receiver.c */
+int drbdd_init(struct drbd_thread *);
+int drbd_asender(struct drbd_thread *);
+
+/* drbd_epoch flag bits */
+enum {
+	DE_HAVE_BARRIER_NUMBER,
+};
+
+enum epoch_event {
+	EV_PUT,
+	EV_GOT_BARRIER_NR,
+	EV_BECAME_LAST,
+	EV_CLEANUP = 32, /* used as flag */
+};
+
+struct digest_info {
+	int digest_size;
+	void *digest;
+};
+
+struct drbd_peer_request {
+	struct drbd_work w;
+	struct drbd_peer_device *peer_device;
+	struct drbd_epoch *epoch; /* for writes */
+	struct page *pages;
+	atomic_t pending_bios;
+	struct drbd_interval i;
+	/* see comments on ee flag bits below */
+	unsigned long flags;
+	unsigned long submit_jif;
+	union {
+		u64 block_id;
+		struct digest_info *digest;
+	};
+};
+
+/* ee flag bits.
+ * While corresponding bios are in flight, the only modification will be
+ * set_bit WAS_ERROR, which has to be atomic.
+ * If no bios are in flight yet, or all have been completed,
+ * non-atomic modification to ee->flags is ok.
+ */
+enum {
+	__EE_CALL_AL_COMPLETE_IO,
+	__EE_MAY_SET_IN_SYNC,
+
+	/* is this a TRIM aka REQ_DISCARD?
*/ + __EE_IS_TRIM, + /* our lower level cannot handle trim, + * and we want to fall back to zeroout instead */ + __EE_IS_TRIM_USE_ZEROOUT, + + /* In case a barrier failed, + * we need to resubmit without the barrier flag. */ + __EE_RESUBMITTED, + + /* we may have several bios per peer request. + * if any of those fail, we set this flag atomically + * from the endio callback */ + __EE_WAS_ERROR, + + /* This ee has a pointer to a digest instead of a block id */ + __EE_HAS_DIGEST, + + /* Conflicting local requests need to be restarted after this request */ + __EE_RESTART_REQUESTS, + + /* The peer wants a write ACK for this (wire proto C) */ + __EE_SEND_WRITE_ACK, + + /* Is set when net_conf had two_primaries set while creating this peer_req */ + __EE_IN_INTERVAL_TREE, + + /* for debugfs: */ + /* has this been submitted, or does it still wait for something else? */ + __EE_SUBMITTED, + + /* this is/was a write request */ + __EE_WRITE, + + /* this originates from application on peer + * (not some resync or verify or other DRBD internal request) */ + __EE_APPLICATION, +}; +#define EE_CALL_AL_COMPLETE_IO (1<<__EE_CALL_AL_COMPLETE_IO) +#define EE_MAY_SET_IN_SYNC (1<<__EE_MAY_SET_IN_SYNC) +#define EE_IS_TRIM (1<<__EE_IS_TRIM) +#define EE_IS_TRIM_USE_ZEROOUT (1<<__EE_IS_TRIM_USE_ZEROOUT) +#define EE_RESUBMITTED (1<<__EE_RESUBMITTED) +#define EE_WAS_ERROR (1<<__EE_WAS_ERROR) +#define EE_HAS_DIGEST (1<<__EE_HAS_DIGEST) +#define EE_RESTART_REQUESTS (1<<__EE_RESTART_REQUESTS) +#define EE_SEND_WRITE_ACK (1<<__EE_SEND_WRITE_ACK) +#define EE_IN_INTERVAL_TREE (1<<__EE_IN_INTERVAL_TREE) +#define EE_SUBMITTED (1<<__EE_SUBMITTED) +#define EE_WRITE (1<<__EE_WRITE) +#define EE_APPLICATION (1<<__EE_APPLICATION) + +/* flag bits per device */ +enum { + UNPLUG_REMOTE, /* sending a "UnplugRemote" could help */ + MD_DIRTY, /* current uuids and flags not yet on disk */ + USE_DEGR_WFC_T, /* degr-wfc-timeout instead of wfc-timeout. */ + CL_ST_CHG_SUCCESS, + CL_ST_CHG_FAIL, + CRASHED_PRIMARY, /* This node was a crashed primary. + * Gets cleared when the state.conn + * goes into C_CONNECTED state. */ + CONSIDER_RESYNC, + + MD_NO_FUA, /* Users wants us to not use FUA/FLUSH on meta data dev */ + + SUSPEND_IO, /* suspend application io */ + BITMAP_IO, /* suspend application io; + once no more io in flight, start bitmap io */ + BITMAP_IO_QUEUED, /* Started bitmap IO */ + WAS_IO_ERROR, /* Local disk failed, returned IO error */ + WAS_READ_ERROR, /* Local disk READ failed (set additionally to the above) */ + FORCE_DETACH, /* Force-detach from local disk, aborting any pending local IO */ + RESYNC_AFTER_NEG, /* Resync after online grow after the attach&negotiate finished. */ + RESIZE_PENDING, /* Size change detected locally, waiting for the response from + * the peer, if it changed there as well. */ + NEW_CUR_UUID, /* Create new current UUID when thawing IO */ + AL_SUSPENDED, /* Activity logging is currently suspended. */ + AHEAD_TO_SYNC_SOURCE, /* Ahead -> SyncSource queued */ + B_RS_H_DONE, /* Before resync handler done (already executed) */ + DISCARD_MY_DATA, /* discard_my_data flag per volume */ + READ_BALANCE_RR, + + FLUSH_PENDING, /* if set, device->flush_jif is when we submitted that flush + * from drbd_flush_after_epoch() */ + + /* cleared only after backing device related structures have been destroyed. */ + GOING_DISKLESS, /* Disk is being detached, because of io-error, or admin request. 
*/ + + /* to be used in drbd_device_post_work() */ + GO_DISKLESS, /* tell worker to schedule cleanup before detach */ + DESTROY_DISK, /* tell worker to close backing devices and destroy related structures. */ + MD_SYNC, /* tell worker to call drbd_md_sync() */ + RS_START, /* tell worker to start resync/OV */ + RS_PROGRESS, /* tell worker that resync made significant progress */ + RS_DONE, /* tell worker that resync is done */ +}; + +struct drbd_bitmap; /* opaque for drbd_device */ + +/* definition of bits in bm_flags to be used in drbd_bm_lock + * and drbd_bitmap_io and friends. */ +enum bm_flag { + /* do we need to kfree, or vfree bm_pages? */ + BM_P_VMALLOCED = 0x10000, /* internal use only, will be masked out */ + + /* currently locked for bulk operation */ + BM_LOCKED_MASK = 0xf, + + /* in detail, that is: */ + BM_DONT_CLEAR = 0x1, + BM_DONT_SET = 0x2, + BM_DONT_TEST = 0x4, + + /* so we can mark it locked for bulk operation, + * and still allow all non-bulk operations */ + BM_IS_LOCKED = 0x8, + + /* (test bit, count bit) allowed (common case) */ + BM_LOCKED_TEST_ALLOWED = BM_DONT_CLEAR | BM_DONT_SET | BM_IS_LOCKED, + + /* testing bits, as well as setting new bits allowed, but clearing bits + * would be unexpected. Used during bitmap receive. Setting new bits + * requires sending of "out-of-sync" information, though. */ + BM_LOCKED_SET_ALLOWED = BM_DONT_CLEAR | BM_IS_LOCKED, + + /* for drbd_bm_write_copy_pages, everything is allowed, + * only concurrent bulk operations are locked out. */ + BM_LOCKED_CHANGE_ALLOWED = BM_IS_LOCKED, +}; + +struct drbd_work_queue { + struct list_head q; + spinlock_t q_lock; /* to protect the list. */ + wait_queue_head_t q_wait; +}; + +struct drbd_socket { + struct mutex mutex; + struct socket *socket; + /* this way we get our + * send/receive buffers off the stack */ + void *sbuf; + void *rbuf; +}; + +struct drbd_md { + u64 md_offset; /* sector offset to 'super' block */ + + u64 la_size_sect; /* last agreed size, unit sectors */ + spinlock_t uuid_lock; + u64 uuid[UI_SIZE]; + u64 device_uuid; + u32 flags; + u32 md_size_sect; + + s32 al_offset; /* signed relative sector offset to activity log */ + s32 bm_offset; /* signed relative sector offset to bitmap */ + + /* cached value of bdev->disk_conf->meta_dev_idx (see below) */ + s32 meta_dev_idx; + + /* see al_tr_number_to_on_disk_sector() */ + u32 al_stripes; + u32 al_stripe_size_4k; + u32 al_size_4k; /* cached product of the above */ +}; + +struct drbd_backing_dev { + struct block_device *backing_bdev; + struct block_device *md_bdev; + struct drbd_md md; + struct disk_conf *disk_conf; /* RCU, for updates: resource->conf_update */ + sector_t known_size; /* last known size of that backing device */ +}; + +struct drbd_md_io { + struct page *page; + unsigned long start_jif; /* last call to drbd_md_get_buffer */ + unsigned long submit_jif; /* last _drbd_md_sync_page_io() submit */ + const char *current_use; + atomic_t in_use; + unsigned int done; + int error; +}; + +struct bm_io_work { + struct drbd_work w; + char *why; + enum bm_flag flags; + int (*io_fn)(struct drbd_device *device); + void (*done)(struct drbd_device *device, int rv); +}; + +enum write_ordering_e { + WO_none, + WO_drain_io, + WO_bdev_flush, +}; + +struct fifo_buffer { + unsigned int head_index; + unsigned int size; + int total; /* sum of all values */ + int values[0]; +}; +extern struct fifo_buffer *fifo_alloc(int fifo_size); + +/* flag bits per connection */ +enum { + NET_CONGESTED, /* The data socket is congested */ + RESOLVE_CONFLICTS, /* Set 
on one node, cleared on the peer! */ + SEND_PING, /* whether asender should send a ping asap */ + SIGNAL_ASENDER, /* whether asender wants to be interrupted */ + GOT_PING_ACK, /* set when we receive a ping_ack packet, ping_wait gets woken */ + CONN_WD_ST_CHG_REQ, /* A cluster wide state change on the connection is active */ + CONN_WD_ST_CHG_OKAY, + CONN_WD_ST_CHG_FAIL, + CONN_DRY_RUN, /* Expect disconnect after resync handshake. */ + CREATE_BARRIER, /* next P_DATA is preceded by a P_BARRIER */ + STATE_SENT, /* Do not change state/UUIDs while this is set */ + CALLBACK_PENDING, /* Whether we have a call_usermodehelper(, UMH_WAIT_PROC) + * pending, from drbd worker context. + * If set, bdi_write_congested() returns true, + * so shrink_page_list() would not recurse into, + * and potentially deadlock on, this drbd worker. + */ + DISCONNECT_SENT, + + DEVICE_WORK_PENDING, /* tell worker that some device has pending work */ +}; + +struct drbd_resource { + char *name; +#ifdef CONFIG_DEBUG_FS + struct dentry *debugfs_res; + struct dentry *debugfs_res_volumes; + struct dentry *debugfs_res_connections; + struct dentry *debugfs_res_in_flight_summary; +#endif + struct kref kref; + struct idr devices; /* volume number to device mapping */ + struct list_head connections; + struct list_head resources; + struct res_opts res_opts; + struct mutex conf_update; /* mutex for ready-copy-update of net_conf and disk_conf */ + struct mutex adm_mutex; /* mutex to serialize administrative requests */ + spinlock_t req_lock; + + unsigned susp:1; /* IO suspended by user */ + unsigned susp_nod:1; /* IO suspended because no data */ + unsigned susp_fen:1; /* IO suspended because fence peer handler runs */ + + enum write_ordering_e write_ordering; + + cpumask_var_t cpu_mask; +}; + +struct drbd_thread_timing_details +{ + unsigned long start_jif; + void *cb_addr; + const char *caller_fn; + unsigned int line; + unsigned int cb_nr; +}; + +struct drbd_connection { + struct list_head connections; + struct drbd_resource *resource; +#ifdef CONFIG_DEBUG_FS + struct dentry *debugfs_conn; + struct dentry *debugfs_conn_callback_history; + struct dentry *debugfs_conn_oldest_requests; +#endif + struct kref kref; + struct idr peer_devices; /* volume number to peer device mapping */ + enum drbd_conns cstate; /* Only C_STANDALONE to C_WF_REPORT_PARAMS */ + struct mutex cstate_mutex; /* Protects graceful disconnects */ + unsigned int connect_cnt; /* Inc each time a connection is established */ + + unsigned long flags; + struct net_conf *net_conf; /* content protected by rcu */ + wait_queue_head_t ping_wait; /* Woken upon reception of a ping, and a state change */ + + struct sockaddr_storage my_addr; + int my_addr_len; + struct sockaddr_storage peer_addr; + int peer_addr_len; + + struct drbd_socket data; /* data/barrier/cstate/parameter packets */ + struct drbd_socket meta; /* ping/ack (metadata) packets */ + int agreed_pro_version; /* actually used protocol version */ + u32 agreed_features; + unsigned long last_received; /* in jiffies, either socket */ + unsigned int ko_count; + + struct list_head transfer_log; /* all requests not yet fully processed */ + + struct crypto_hash *cram_hmac_tfm; + struct crypto_hash *integrity_tfm; /* checksums we compute, updates protected by connection->data->mutex */ + struct crypto_hash *peer_integrity_tfm; /* checksums we verify, only accessed from receiver thread */ + struct crypto_hash *csums_tfm; + struct crypto_hash *verify_tfm; + void *int_dig_in; + void *int_dig_vv; + + /* receiver side */ + struct 
drbd_epoch *current_epoch; + spinlock_t epoch_lock; + unsigned int epochs; + atomic_t current_tle_nr; /* transfer log epoch number */ + unsigned current_tle_writes; /* writes seen within this tl epoch */ + + unsigned long last_reconnect_jif; + struct drbd_thread receiver; + struct drbd_thread worker; + struct drbd_thread asender; + + /* cached pointers, + * so we can look up the oldest pending requests more quickly. + * protected by resource->req_lock */ + struct drbd_request *req_next; /* DRBD 9: todo.req_next */ + struct drbd_request *req_ack_pending; + struct drbd_request *req_not_net_done; + + /* sender side */ + struct drbd_work_queue sender_work; + +#define DRBD_THREAD_DETAILS_HIST 16 + unsigned int w_cb_nr; /* keeps counting up */ + unsigned int r_cb_nr; /* keeps counting up */ + struct drbd_thread_timing_details w_timing_details[DRBD_THREAD_DETAILS_HIST]; + struct drbd_thread_timing_details r_timing_details[DRBD_THREAD_DETAILS_HIST]; + + struct { + /* whether this sender thread + * has processed a single write yet. */ + bool seen_any_write_yet; + + /* Which barrier number to send with the next P_BARRIER */ + int current_epoch_nr; + + /* how many write requests have been sent + * with req->epoch == current_epoch_nr. + * If none, no P_BARRIER will be sent. */ + unsigned current_epoch_writes; + } send; +}; + +void __update_timing_details( + struct drbd_thread_timing_details *tdp, + unsigned int *cb_nr, + void *cb, + const char *fn, const unsigned int line); + +#define update_worker_timing_details(c, cb) \ + __update_timing_details(c->w_timing_details, &c->w_cb_nr, cb, __func__ , __LINE__ ) +#define update_receiver_timing_details(c, cb) \ + __update_timing_details(c->r_timing_details, &c->r_cb_nr, cb, __func__ , __LINE__ ) + +struct submit_worker { + struct workqueue_struct *wq; + struct work_struct worker; + + /* protected by ..->resource->req_lock */ + struct list_head writes; +}; + +struct drbd_peer_device { + struct list_head peer_devices; + struct drbd_device *device; + struct drbd_connection *connection; +#ifdef CONFIG_DEBUG_FS + struct dentry *debugfs_peer_dev; +#endif +}; + +struct drbd_device { + struct drbd_resource *resource; + struct list_head peer_devices; + struct list_head pending_bitmap_io; + + unsigned long flush_jif; +#ifdef CONFIG_DEBUG_FS + struct dentry *debugfs_minor; + struct dentry *debugfs_vol; + struct dentry *debugfs_vol_oldest_requests; + struct dentry *debugfs_vol_act_log_extents; + struct dentry *debugfs_vol_resync_extents; + struct dentry *debugfs_vol_data_gen_id; +#endif + + unsigned int vnr; /* volume number within the connection */ + unsigned int minor; /* device minor number */ + + struct kref kref; + + /* things that are stored as / read from meta data on disk */ + unsigned long flags; + + /* configured by drbdsetup */ + struct drbd_backing_dev *ldev __protected_by(local); + + sector_t p_size; /* partner's disk size */ + struct request_queue *rq_queue; + struct block_device *this_bdev; + struct gendisk *vdisk; + + unsigned long last_reattach_jif; + struct drbd_work resync_work; + struct drbd_work unplug_work; + struct timer_list resync_timer; + struct timer_list md_sync_timer; + struct timer_list start_resync_timer; + struct timer_list request_timer; + + /* Used after attach while negotiating new disk state. */ + union drbd_state new_state_tmp; + + union drbd_dev_state state; + wait_queue_head_t misc_wait; + wait_queue_head_t state_wait; /* upon each state change. 
*/ + unsigned int send_cnt; + unsigned int recv_cnt; + unsigned int read_cnt; + unsigned int writ_cnt; + unsigned int al_writ_cnt; + unsigned int bm_writ_cnt; + atomic_t ap_bio_cnt; /* Requests we need to complete */ + atomic_t ap_actlog_cnt; /* Requests waiting for activity log */ + atomic_t ap_pending_cnt; /* AP data packets on the wire, ack expected */ + atomic_t rs_pending_cnt; /* RS request/data packets on the wire */ + atomic_t unacked_cnt; /* Need to send replies for */ + atomic_t local_cnt; /* Waiting for local completion */ + + /* Interval tree of pending local requests */ + struct rb_root read_requests; + struct rb_root write_requests; + + /* for statistics and timeouts */ + /* [0] read, [1] write */ + struct list_head pending_master_completion[2]; + struct list_head pending_completion[2]; + + /* use checksums for *this* resync */ + bool use_csums; + /* blocks to resync in this run [unit BM_BLOCK_SIZE] */ + unsigned long rs_total; + /* number of resync blocks that failed in this run */ + unsigned long rs_failed; + /* Syncer's start time [unit jiffies] */ + unsigned long rs_start; + /* cumulated time in PausedSyncX state [unit jiffies] */ + unsigned long rs_paused; + /* skipped because csum was equal [unit BM_BLOCK_SIZE] */ + unsigned long rs_same_csum; +#define DRBD_SYNC_MARKS 8 +#define DRBD_SYNC_MARK_STEP (3*HZ) + /* block not up-to-date at mark [unit BM_BLOCK_SIZE] */ + unsigned long rs_mark_left[DRBD_SYNC_MARKS]; + /* marks's time [unit jiffies] */ + unsigned long rs_mark_time[DRBD_SYNC_MARKS]; + /* current index into rs_mark_{left,time} */ + int rs_last_mark; + unsigned long rs_last_bcast; /* [unit jiffies] */ + + /* where does the admin want us to start? (sector) */ + sector_t ov_start_sector; + sector_t ov_stop_sector; + /* where are we now? (sector) */ + sector_t ov_position; + /* Start sector of out of sync range (to merge printk reporting). */ + sector_t ov_last_oos_start; + /* size of out-of-sync range in sectors. */ + sector_t ov_last_oos_size; + unsigned long ov_left; /* in bits */ + + struct drbd_bitmap *bitmap; + unsigned long bm_resync_fo; /* bit offset for drbd_bm_find_next */ + + /* Used to track operations of resync... */ + struct lru_cache *resync; + /* Number of locked elements in resync LRU */ + unsigned int resync_locked; + /* resync extent number waiting for application requests */ + unsigned int resync_wenr; + + int open_cnt; + u64 *p_uuid; + + struct list_head active_ee; /* IO in progress (P_DATA gets written to disk) */ + struct list_head sync_ee; /* IO in progress (P_RS_DATA_REPLY gets written to disk) */ + struct list_head done_ee; /* need to send P_WRITE_ACK */ + struct list_head read_ee; /* [RS]P_DATA_REQUEST being read */ + struct list_head net_ee; /* zero-copy network send in progress */ + + int next_barrier_nr; + struct list_head resync_reads; + atomic_t pp_in_use; /* allocated from page pool */ + atomic_t pp_in_use_by_net; /* sendpage()d, still referenced by tcp */ + wait_queue_head_t ee_wait; + struct drbd_md_io md_io; + spinlock_t al_lock; + wait_queue_head_t al_wait; + struct lru_cache *act_log; /* activity log */ + unsigned int al_tr_number; + int al_tr_cycle; + wait_queue_head_t seq_wait; + atomic_t packet_seq; + unsigned int peer_seq; + spinlock_t peer_seq_lock; + unsigned long comm_bm_set; /* communicated number of set bits. 
 */
+	struct bm_io_work bm_io_work;
+	u64 ed_uuid; /* UUID of the exposed data */
+	struct mutex own_state_mutex;
+	struct mutex *state_mutex; /* either own_state_mutex or first_peer_device(device)->connection->cstate_mutex */
+	char congestion_reason;  /* Why we were congested... */
+	atomic_t rs_sect_in; /* for incoming resync data rate, SyncTarget */
+	atomic_t rs_sect_ev; /* for submitted resync data rate, both */
+	int rs_last_sect_ev; /* counter to compare with */
+	int rs_last_events;  /* counter of read or write "events" (unit sectors)
+			      * on the lower level device when we last looked. */
+	int c_sync_rate; /* current resync rate after syncer throttle magic */
+	struct fifo_buffer *rs_plan_s; /* correction values of resync planner (RCU, resource->conf_update) */
+	int rs_in_flight; /* resync sectors in flight (to proxy, in proxy and from proxy) */
+	atomic_t ap_in_flight; /* App sectors in flight (waiting for ack) */
+	unsigned int peer_max_bio_size;
+	unsigned int local_max_bio_size;
+
+	/* any requests that would block in drbd_make_request()
+	 * are deferred to this single-threaded work queue */
+	struct submit_worker submit;
+};
+
+struct drbd_bm_aio_ctx {
+	struct drbd_device *device;
+	struct list_head list; /* on device->pending_bitmap_io */
+	unsigned long start_jif;
+	atomic_t in_flight;
+	unsigned int done;
+	unsigned flags;
+#define BM_AIO_COPY_PAGES	1
+#define BM_AIO_WRITE_HINTED	2
+#define BM_AIO_WRITE_ALL_PAGES	4
+#define BM_AIO_READ		8
+	int error;
+	struct kref kref;
+};
+
+struct drbd_config_context {
+	/* assigned from drbd_genlmsghdr */
+	unsigned int minor;
+	/* assigned from request attributes, if present */
+	unsigned int volume;
+#define VOLUME_UNSPECIFIED	(-1U)
+	/* pointer into the request skb,
+	 * limited lifetime! */
+	char *resource_name;
+	struct nlattr *my_addr;
+	struct nlattr *peer_addr;
+
+	/* reply buffer */
+	struct sk_buff *reply_skb;
+	/* pointer into reply buffer */
+	struct drbd_genlmsghdr *reply_dh;
+	/* resolved from attributes, if possible */
+	struct drbd_device *device;
+	struct drbd_resource *resource;
+	struct drbd_connection *connection;
+};
+
+static inline struct drbd_device *minor_to_device(unsigned int minor)
+{
+	return (struct drbd_device *)idr_find(&drbd_devices, minor);
+}
+
+static inline struct drbd_peer_device *first_peer_device(struct drbd_device *device)
+{
+	return list_first_entry_or_null(&device->peer_devices, struct drbd_peer_device, peer_devices);
+}
+
+#define for_each_resource(resource, _resources) \
+	list_for_each_entry(resource, _resources, resources)
+
+#define for_each_resource_rcu(resource, _resources) \
+	list_for_each_entry_rcu(resource, _resources, resources)
+
+#define for_each_resource_safe(resource, tmp, _resources) \
+	list_for_each_entry_safe(resource, tmp, _resources, resources)
+
+#define for_each_connection(connection, resource) \
+	list_for_each_entry(connection, &resource->connections, connections)
+
+#define for_each_connection_rcu(connection, resource) \
+	list_for_each_entry_rcu(connection, &resource->connections, connections)
+
+#define for_each_connection_safe(connection, tmp, resource) \
+	list_for_each_entry_safe(connection, tmp, &resource->connections, connections)
+
+#define for_each_peer_device(peer_device, device) \
+	list_for_each_entry(peer_device, &device->peer_devices, peer_devices)
+
+#define for_each_peer_device_rcu(peer_device, device) \
+	list_for_each_entry_rcu(peer_device, &device->peer_devices, peer_devices)
+
+#define for_each_peer_device_safe(peer_device, tmp, device) \ +
list_for_each_entry_safe(peer_device, tmp, &device->peer_devices, peer_devices) + +static inline unsigned int device_to_minor(struct drbd_device *device) +{ + return device->minor; +} + +/* + * function declarations + *************************/ + +/* drbd_main.c */ + +enum dds_flags { + DDSF_FORCED = 1, + DDSF_NO_RESYNC = 2, /* Do not run a resync for the new space */ +}; + +extern void drbd_init_set_defaults(struct drbd_device *device); +extern int drbd_thread_start(struct drbd_thread *thi); +extern void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait); +#ifdef CONFIG_SMP +extern void drbd_thread_current_set_cpu(struct drbd_thread *thi); +#else +#define drbd_thread_current_set_cpu(A) ({}) +#endif +extern void tl_release(struct drbd_connection *, unsigned int barrier_nr, + unsigned int set_size); +extern void tl_clear(struct drbd_connection *); +extern void drbd_free_sock(struct drbd_connection *connection); +extern int drbd_send(struct drbd_connection *connection, struct socket *sock, + void *buf, size_t size, unsigned msg_flags); +extern int drbd_send_all(struct drbd_connection *, struct socket *, void *, size_t, + unsigned); + +extern int __drbd_send_protocol(struct drbd_connection *connection, enum drbd_packet cmd); +extern int drbd_send_protocol(struct drbd_connection *connection); +extern int drbd_send_uuids(struct drbd_peer_device *); +extern int drbd_send_uuids_skip_initial_sync(struct drbd_peer_device *); +extern void drbd_gen_and_send_sync_uuid(struct drbd_peer_device *); +extern int drbd_send_sizes(struct drbd_peer_device *, int trigger_reply, enum dds_flags flags); +extern int drbd_send_state(struct drbd_peer_device *, union drbd_state s); +extern int drbd_send_current_state(struct drbd_peer_device *); +extern int drbd_send_sync_param(struct drbd_peer_device *); +extern void drbd_send_b_ack(struct drbd_connection *connection, u32 barrier_nr, + u32 set_size); +extern int drbd_send_ack(struct drbd_peer_device *, enum drbd_packet, + struct drbd_peer_request *); +extern void drbd_send_ack_rp(struct drbd_peer_device *, enum drbd_packet, + struct p_block_req *rp); +extern void drbd_send_ack_dp(struct drbd_peer_device *, enum drbd_packet, + struct p_data *dp, int data_size); +extern int drbd_send_ack_ex(struct drbd_peer_device *, enum drbd_packet, + sector_t sector, int blksize, u64 block_id); +extern int drbd_send_out_of_sync(struct drbd_peer_device *, struct drbd_request *); +extern int drbd_send_block(struct drbd_peer_device *, enum drbd_packet, + struct drbd_peer_request *); +extern int drbd_send_dblock(struct drbd_peer_device *, struct drbd_request *req); +extern int drbd_send_drequest(struct drbd_peer_device *, int cmd, + sector_t sector, int size, u64 block_id); +extern int drbd_send_drequest_csum(struct drbd_peer_device *, sector_t sector, + int size, void *digest, int digest_size, + enum drbd_packet cmd); +extern int drbd_send_ov_request(struct drbd_peer_device *, sector_t sector, int size); + +extern int drbd_send_bitmap(struct drbd_device *device); +extern void drbd_send_sr_reply(struct drbd_peer_device *, enum drbd_state_rv retcode); +extern void conn_send_sr_reply(struct drbd_connection *connection, enum drbd_state_rv retcode); +extern void drbd_free_ldev(struct drbd_backing_dev *ldev); +extern void drbd_device_cleanup(struct drbd_device *device); +void drbd_print_uuids(struct drbd_device *device, const char *text); + +extern void conn_md_sync(struct drbd_connection *connection); +extern void drbd_md_write(struct drbd_device *device, void *buffer); 
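+/* Illustrative, hypothetical use of the iteration macros and drbd_printk()
+ * helpers declared above (kept as a comment, this is not a real caller);
+ * "resource" is assumed to be a valid struct drbd_resource, and the RCU
+ * read lock protects the connection list:
+ *
+ *	struct drbd_connection *connection;
+ *	struct drbd_peer_device *peer_device;
+ *	int vnr;
+ *
+ *	rcu_read_lock();
+ *	for_each_connection_rcu(connection, resource)
+ *		idr_for_each_entry(&connection->peer_devices, peer_device, vnr)
+ *			drbd_info(peer_device, "volume %d known to peer\n", vnr);
+ *	rcu_read_unlock();
+ */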
+extern void drbd_md_sync(struct drbd_device *device);
+extern int drbd_md_read(struct drbd_device *device, struct drbd_backing_dev *bdev);
+extern void drbd_uuid_set(struct drbd_device *device, int idx, u64 val) __must_hold(local);
+extern void _drbd_uuid_set(struct drbd_device *device, int idx, u64 val) __must_hold(local);
+extern void drbd_uuid_new_current(struct drbd_device *device) __must_hold(local);
+extern void drbd_uuid_set_bm(struct drbd_device *device, u64 val) __must_hold(local);
+extern void drbd_uuid_move_history(struct drbd_device *device) __must_hold(local);
+extern void __drbd_uuid_set(struct drbd_device *device, int idx, u64 val) __must_hold(local);
+extern void drbd_md_set_flag(struct drbd_device *device, int flags) __must_hold(local);
+extern void drbd_md_clear_flag(struct drbd_device *device, int flags) __must_hold(local);
+extern int drbd_md_test_flag(struct drbd_backing_dev *, int);
+extern void drbd_md_mark_dirty(struct drbd_device *device);
+extern void drbd_queue_bitmap_io(struct drbd_device *device,
+				 int (*io_fn)(struct drbd_device *),
+				 void (*done)(struct drbd_device *, int),
+				 char *why, enum bm_flag flags);
+extern int drbd_bitmap_io(struct drbd_device *device,
+		int (*io_fn)(struct drbd_device *),
+		char *why, enum bm_flag flags);
+extern int drbd_bitmap_io_from_worker(struct drbd_device *device,
+		int (*io_fn)(struct drbd_device *),
+		char *why, enum bm_flag flags);
+extern int drbd_bmio_set_n_write(struct drbd_device *device) __must_hold(local);
+extern int drbd_bmio_clear_n_write(struct drbd_device *device) __must_hold(local);
+
+/* Meta data layout
+ *
+ * We currently have two possible layouts.
+ * Offsets in (512 byte) sectors.
+ * external:
+ *   |----------- md_size_sect ------------------|
+ *   [ 4k superblock ][ activity log ][  Bitmap  ]
+ *   | al_offset == 8 |
+ *   | bm_offset = al_offset + X      |
+ *  ==> bitmap sectors = md_size_sect - bm_offset
+ *
+ * Variants:
+ * old, indexed fixed size meta data:
+ *
+ * internal:
+ *            |----------- md_size_sect ------------------|
+ * [data.....][  Bitmap  ][ activity log ][ 4k superblock ][padding*]
+ *                        | al_offset < 0 |
+ *            | bm_offset = al_offset - Y |
+ *  ==> bitmap sectors = Y = al_offset - bm_offset
+ *
+ *  [padding*] are zero or up to 7 unused 512 Byte sectors to the
+ *  end of the device, so that the [4k superblock] will be 4k aligned.
+ *
+ *  The activity log consists of 4k transaction blocks,
+ *  which are written in a ring-buffer, or striped ring-buffer like fashion.
+ *  The size used to be fixed at 32kB,
+ *  but is about to become configurable.
+ */
+
+/* Our old fixed size meta data layout
+ * allows up to about 3.8TB, so if you want more,
+ * you need to use the "flexible" meta data format. */
+#define MD_128MB_SECT (128LLU << 11)  /* 128 MB, unit sectors */
+#define MD_4kB_SECT	 8
+#define MD_32kB_SECT	64
+
+/* One activity log extent represents 4M of storage */
+#define AL_EXTENT_SHIFT 22
+#define AL_EXTENT_SIZE (1<<AL_EXTENT_SHIFT)
+
+/* We could make these currently hardcoded constants configurable
+ * variables at create-md time (or even re-configurable at runtime?).
+ * Which will require some more changes to the DRBD "super block"
+ * and attach code.
+ *
+ * updates per transaction:
+ *	This many changes to the active set can be logged with one transaction.
+ *	This number is arbitrary.
+ * context per transaction:
+ *	This many context extent numbers are logged with each transaction.
+ * This number is resulting from the transaction block size (4k), the layout + * of the transaction header, and the number of updates per transaction. + * See drbd_actlog.c:struct al_transaction_on_disk + * */ +#define AL_UPDATES_PER_TRANSACTION 64 // arbitrary +#define AL_CONTEXT_PER_TRANSACTION 919 // (4096 - 36 - 6*64)/4 + +#if BITS_PER_LONG == 32 +#define LN2_BPL 5 +#define cpu_to_lel(A) cpu_to_le32(A) +#define lel_to_cpu(A) le32_to_cpu(A) +#elif BITS_PER_LONG == 64 +#define LN2_BPL 6 +#define cpu_to_lel(A) cpu_to_le64(A) +#define lel_to_cpu(A) le64_to_cpu(A) +#else +#error "LN2 of BITS_PER_LONG unknown!" +#endif + +/* resync bitmap */ +/* 16MB sized 'bitmap extent' to track syncer usage */ +struct bm_extent { + int rs_left; /* number of bits set (out of sync) in this extent. */ + int rs_failed; /* number of failed resync requests in this extent. */ + unsigned long flags; + struct lc_element lce; +}; + +#define BME_NO_WRITES 0 /* bm_extent.flags: no more requests on this one! */ +#define BME_LOCKED 1 /* bm_extent.flags: syncer active on this one. */ +#define BME_PRIORITY 2 /* finish resync IO on this extent ASAP! App IO waiting! */ + +/* drbd_bitmap.c */ +/* + * We need to store one bit for a block. + * Example: 1GB disk @ 4096 byte blocks ==> we need 32 KB bitmap. + * Bit 0 ==> local node thinks this block is binary identical on both nodes + * Bit 1 ==> local node thinks this block needs to be synced. + */ + +#define SLEEP_TIME (HZ/10) + +/* We do bitmap IO in units of 4k blocks. + * We also still have a hardcoded 4k per bit relation. */ +#define BM_BLOCK_SHIFT 12 /* 4k per bit */ +#define BM_BLOCK_SIZE (1<<BM_BLOCK_SHIFT) +/* mostly arbitrarily set the represented size of one bitmap extent, + * aka resync extent, to 16 MiB (which is also 512 Byte worth of bitmap + * at 4k per bit resolution) */ +#define BM_EXT_SHIFT 24 /* 16 MiB per resync extent */ +#define BM_EXT_SIZE (1<<BM_EXT_SHIFT) + +#if (BM_EXT_SHIFT != 24) || (BM_BLOCK_SHIFT != 12) +#error "HAVE YOU FIXED drbdmeta AS WELL??" +#endif + +/* thus many _storage_ sectors are described by one bit */ +#define BM_SECT_TO_BIT(x) ((x)>>(BM_BLOCK_SHIFT-9)) +#define BM_BIT_TO_SECT(x) ((sector_t)(x)<<(BM_BLOCK_SHIFT-9)) +#define BM_SECT_PER_BIT BM_BIT_TO_SECT(1) + +/* bit to represented kilo byte conversion */ +#define Bit2KB(bits) ((bits)<<(BM_BLOCK_SHIFT-10)) + +/* in which _bitmap_ extent (resp. sector) the bit for a certain + * _storage_ sector is located in */ +#define BM_SECT_TO_EXT(x) ((x)>>(BM_EXT_SHIFT-9)) +#define BM_BIT_TO_EXT(x) ((x) >> (BM_EXT_SHIFT - BM_BLOCK_SHIFT)) + +/* first storage sector a bitmap extent corresponds to */ +#define BM_EXT_TO_SECT(x) ((sector_t)(x) << (BM_EXT_SHIFT-9)) +/* how much _storage_ sectors we have per bitmap extent */ +#define BM_SECT_PER_EXT BM_EXT_TO_SECT(1) +/* how many bits are covered by one bitmap extent (resync extent) */ +#define BM_BITS_PER_EXT (1UL << (BM_EXT_SHIFT - BM_BLOCK_SHIFT)) + +#define BM_BLOCKS_PER_BM_EXT_MASK (BM_BITS_PER_EXT - 1) + + +/* in one sector of the bitmap, we have this many activity_log extents. */ +#define AL_EXT_PER_BM_SECT (1 << (BM_EXT_SHIFT - AL_EXTENT_SHIFT)) + +/* the extent in "PER_EXTENT" below is an activity log extent + * we need that many (long words/bytes) to store the bitmap + * of one AL_EXTENT_SIZE chunk of storage. + * we can store the bitmap for that many AL_EXTENTS within + * one sector of the _on_disk_ bitmap: + * bit 0 bit 37 bit 38 bit (512*8)-1 + * ...|........|........|.. // ..|........| + * sect. 
0 `296 `304 ^(512*8*8)-1 + * +#define BM_WORDS_PER_EXT ( (AL_EXT_SIZE/BM_BLOCK_SIZE) / BITS_PER_LONG ) +#define BM_BYTES_PER_EXT ( (AL_EXT_SIZE/BM_BLOCK_SIZE) / 8 ) // 128 +#define BM_EXT_PER_SECT ( 512 / BM_BYTES_PER_EXTENT ) // 4 + */ + +#define DRBD_MAX_SECTORS_32 (0xffffffffLU) +/* we have a certain meta data variant that has a fixed on-disk size of 128 + * MiB, of which 4k are our "superblock", and 32k are the fixed size activity + * log, leaving this many sectors for the bitmap. + */ + +#define DRBD_MAX_SECTORS_FIXED_BM \ + ((MD_128MB_SECT - MD_32kB_SECT - MD_4kB_SECT) * (1LL<<(BM_EXT_SHIFT-9))) +#if !defined(CONFIG_LBDAF) && BITS_PER_LONG == 32 +#define DRBD_MAX_SECTORS DRBD_MAX_SECTORS_32 +#define DRBD_MAX_SECTORS_FLEX DRBD_MAX_SECTORS_32 +#else +#define DRBD_MAX_SECTORS DRBD_MAX_SECTORS_FIXED_BM +/* 16 TB in units of sectors */ +#if BITS_PER_LONG == 32 +/* adjust by one page worth of bitmap, + * so we won't wrap around in drbd_bm_find_next_bit. + * you should use 64bit OS for that much storage, anyways. */ +#define DRBD_MAX_SECTORS_FLEX BM_BIT_TO_SECT(0xffff7fff) +#else +/* we allow up to 1 PiB now on 64bit architecture with "flexible" meta data */ +#define DRBD_MAX_SECTORS_FLEX (1UL << 51) +/* corresponds to (1UL << 38) bits right now. */ +#endif +#endif + +/* BIO_MAX_SIZE is 256 * PAGE_CACHE_SIZE, + * so for typical PAGE_CACHE_SIZE of 4k, that is (1<<20) Byte. + * Since we may live in a mixed-platform cluster, + * we limit us to a platform agnostic constant here for now. + * A followup commit may allow even bigger BIO sizes, + * once we thought that through. */ +#define DRBD_MAX_BIO_SIZE (1U << 20) +#if DRBD_MAX_BIO_SIZE > BIO_MAX_SIZE +#error Architecture not supported: DRBD_MAX_BIO_SIZE > BIO_MAX_SIZE +#endif +#define DRBD_MAX_BIO_SIZE_SAFE (1U << 12) /* Works always = 4k */ + +#define DRBD_MAX_SIZE_H80_PACKET (1U << 15) /* Header 80 only allows packets up to 32KiB data */ +#define DRBD_MAX_BIO_SIZE_P95 (1U << 17) /* Protocol 95 to 99 allows bios up to 128KiB */ + +/* For now, don't allow more than one activity log extent worth of data + * to be discarded in one go. 
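+ * (With AL_EXTENT_SHIFT == 22 that is 4 MiB, so DRBD_MAX_DISCARD_SECTORS
+ * works out to 8192 sectors of 512 bytes.)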
We may need to rework drbd_al_begin_io() + * to allow for even larger discard ranges */ +#define DRBD_MAX_DISCARD_SIZE AL_EXTENT_SIZE +#define DRBD_MAX_DISCARD_SECTORS (DRBD_MAX_DISCARD_SIZE >> 9) + +extern int drbd_bm_init(struct drbd_device *device); +extern int drbd_bm_resize(struct drbd_device *device, sector_t sectors, int set_new_bits); +extern void drbd_bm_cleanup(struct drbd_device *device); +extern void drbd_bm_set_all(struct drbd_device *device); +extern void drbd_bm_clear_all(struct drbd_device *device); +/* set/clear/test only a few bits at a time */ +extern int drbd_bm_set_bits( + struct drbd_device *device, unsigned long s, unsigned long e); +extern int drbd_bm_clear_bits( + struct drbd_device *device, unsigned long s, unsigned long e); +extern int drbd_bm_count_bits( + struct drbd_device *device, const unsigned long s, const unsigned long e); +/* bm_set_bits variant for use while holding drbd_bm_lock, + * may process the whole bitmap in one go */ +extern void _drbd_bm_set_bits(struct drbd_device *device, + const unsigned long s, const unsigned long e); +extern int drbd_bm_test_bit(struct drbd_device *device, unsigned long bitnr); +extern int drbd_bm_e_weight(struct drbd_device *device, unsigned long enr); +extern int drbd_bm_read(struct drbd_device *device) __must_hold(local); +extern void drbd_bm_mark_for_writeout(struct drbd_device *device, int page_nr); +extern int drbd_bm_write(struct drbd_device *device) __must_hold(local); +extern int drbd_bm_write_hinted(struct drbd_device *device) __must_hold(local); +extern int drbd_bm_write_lazy(struct drbd_device *device, unsigned upper_idx) __must_hold(local); +extern int drbd_bm_write_all(struct drbd_device *device) __must_hold(local); +extern int drbd_bm_write_copy_pages(struct drbd_device *device) __must_hold(local); +extern size_t drbd_bm_words(struct drbd_device *device); +extern unsigned long drbd_bm_bits(struct drbd_device *device); +extern sector_t drbd_bm_capacity(struct drbd_device *device); + +#define DRBD_END_OF_BITMAP (~(unsigned long)0) +extern unsigned long drbd_bm_find_next(struct drbd_device *device, unsigned long bm_fo); +/* bm_find_next variants for use while you hold drbd_bm_lock() */ +extern unsigned long _drbd_bm_find_next(struct drbd_device *device, unsigned long bm_fo); +extern unsigned long _drbd_bm_find_next_zero(struct drbd_device *device, unsigned long bm_fo); +extern unsigned long _drbd_bm_total_weight(struct drbd_device *device); +extern unsigned long drbd_bm_total_weight(struct drbd_device *device); +/* for receive_bitmap */ +extern void drbd_bm_merge_lel(struct drbd_device *device, size_t offset, + size_t number, unsigned long *buffer); +/* for _drbd_send_bitmap */ +extern void drbd_bm_get_lel(struct drbd_device *device, size_t offset, + size_t number, unsigned long *buffer); + +extern void drbd_bm_lock(struct drbd_device *device, char *why, enum bm_flag flags); +extern void drbd_bm_unlock(struct drbd_device *device); +/* drbd_main.c */ + +extern struct kmem_cache *drbd_request_cache; +extern struct kmem_cache *drbd_ee_cache; /* peer requests */ +extern struct kmem_cache *drbd_bm_ext_cache; /* bitmap extents */ +extern struct kmem_cache *drbd_al_ext_cache; /* activity log extents */ +extern mempool_t *drbd_request_mempool; +extern mempool_t *drbd_ee_mempool; + +/* drbd's page pool, used to buffer data received from the peer, + * or data requested by the peer. + * + * This does not have an emergency reserve. + * + * When allocating from this pool, it first takes pages from the pool. 
+ * Only if the pool is depleted will try to allocate from the system. + * + * The assumption is that pages taken from this pool will be processed, + * and given back, "quickly", and then can be recycled, so we can avoid + * frequent calls to alloc_page(), and still will be able to make progress even + * under memory pressure. + */ +extern struct page *drbd_pp_pool; +extern spinlock_t drbd_pp_lock; +extern int drbd_pp_vacant; +extern wait_queue_head_t drbd_pp_wait; + +/* We also need a standard (emergency-reserve backed) page pool + * for meta data IO (activity log, bitmap). + * We can keep it global, as long as it is used as "N pages at a time". + * 128 should be plenty, currently we probably can get away with as few as 1. + */ +#define DRBD_MIN_POOL_PAGES 128 +extern mempool_t *drbd_md_io_page_pool; + +/* We also need to make sure we get a bio + * when we need it for housekeeping purposes */ +extern struct bio_set *drbd_md_io_bio_set; +/* to allocate from that set */ +extern struct bio *bio_alloc_drbd(gfp_t gfp_mask); + +extern rwlock_t global_state_lock; + +extern int conn_lowest_minor(struct drbd_connection *connection); +extern enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsigned int minor); +extern void drbd_destroy_device(struct kref *kref); +extern void drbd_delete_device(struct drbd_device *device); + +extern struct drbd_resource *drbd_create_resource(const char *name); +extern void drbd_free_resource(struct drbd_resource *resource); + +extern int set_resource_options(struct drbd_resource *resource, struct res_opts *res_opts); +extern struct drbd_connection *conn_create(const char *name, struct res_opts *res_opts); +extern void drbd_destroy_connection(struct kref *kref); +extern struct drbd_connection *conn_get_by_addrs(void *my_addr, int my_addr_len, + void *peer_addr, int peer_addr_len); +extern struct drbd_resource *drbd_find_resource(const char *name); +extern void drbd_destroy_resource(struct kref *kref); +extern void conn_free_crypto(struct drbd_connection *connection); + +extern int proc_details; + +/* drbd_req */ +extern void do_submit(struct work_struct *ws); +extern void __drbd_make_request(struct drbd_device *, struct bio *, unsigned long); +extern void drbd_make_request(struct request_queue *q, struct bio *bio); +extern int drbd_read_remote(struct drbd_device *device, struct drbd_request *req); +extern int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct bio_vec *bvec); +extern int is_valid_ar_handle(struct drbd_request *, sector_t); + + +/* drbd_nl.c */ +extern void drbd_suspend_io(struct drbd_device *device); +extern void drbd_resume_io(struct drbd_device *device); +extern char *ppsize(char *buf, unsigned long long size); +extern sector_t drbd_new_dev_size(struct drbd_device *, struct drbd_backing_dev *, sector_t, int); +enum determine_dev_size { + DS_ERROR_SHRINK = -3, + DS_ERROR_SPACE_MD = -2, + DS_ERROR = -1, + DS_UNCHANGED = 0, + DS_SHRUNK = 1, + DS_GREW = 2, + DS_GREW_FROM_ZERO = 3, +}; +extern enum determine_dev_size +drbd_determine_dev_size(struct drbd_device *, enum dds_flags, struct resize_parms *) __must_hold(local); +extern void resync_after_online_grow(struct drbd_device *); +extern void drbd_reconsider_max_bio_size(struct drbd_device *device, struct drbd_backing_dev *bdev); +extern enum drbd_state_rv drbd_set_role(struct drbd_device *device, + enum drbd_role new_role, + int force); +extern bool conn_try_outdate_peer(struct drbd_connection *connection); +extern void 
conn_try_outdate_peer_async(struct drbd_connection *connection); +extern int drbd_khelper(struct drbd_device *device, char *cmd); + +/* drbd_worker.c */ +/* bi_end_io handlers */ +extern void drbd_md_endio(struct bio *bio, int error); +extern void drbd_peer_request_endio(struct bio *bio, int error); +extern void drbd_request_endio(struct bio *bio, int error); +extern int drbd_worker(struct drbd_thread *thi); +enum drbd_ret_code drbd_resync_after_valid(struct drbd_device *device, int o_minor); +void drbd_resync_after_changed(struct drbd_device *device); +extern void drbd_start_resync(struct drbd_device *device, enum drbd_conns side); +extern void resume_next_sg(struct drbd_device *device); +extern void suspend_other_sg(struct drbd_device *device); +extern int drbd_resync_finished(struct drbd_device *device); +/* maybe rather drbd_main.c ? */ +extern void *drbd_md_get_buffer(struct drbd_device *device, const char *intent); +extern void drbd_md_put_buffer(struct drbd_device *device); +extern int drbd_md_sync_page_io(struct drbd_device *device, + struct drbd_backing_dev *bdev, sector_t sector, int rw); +extern void drbd_ov_out_of_sync_found(struct drbd_device *, sector_t, int); +extern void wait_until_done_or_force_detached(struct drbd_device *device, + struct drbd_backing_dev *bdev, unsigned int *done); +extern void drbd_rs_controller_reset(struct drbd_device *device); + +static inline void ov_out_of_sync_print(struct drbd_device *device) +{ + if (device->ov_last_oos_size) { + drbd_err(device, "Out of sync: start=%llu, size=%lu (sectors)\n", + (unsigned long long)device->ov_last_oos_start, + (unsigned long)device->ov_last_oos_size); + } + device->ov_last_oos_size = 0; +} + + +extern void drbd_csum_bio(struct crypto_hash *, struct bio *, void *); +extern void drbd_csum_ee(struct crypto_hash *, struct drbd_peer_request *, void *); +/* worker callbacks */ +extern int w_e_end_data_req(struct drbd_work *, int); +extern int w_e_end_rsdata_req(struct drbd_work *, int); +extern int w_e_end_csum_rs_req(struct drbd_work *, int); +extern int w_e_end_ov_reply(struct drbd_work *, int); +extern int w_e_end_ov_req(struct drbd_work *, int); +extern int w_ov_finished(struct drbd_work *, int); +extern int w_resync_timer(struct drbd_work *, int); +extern int w_send_write_hint(struct drbd_work *, int); +extern int w_send_dblock(struct drbd_work *, int); +extern int w_send_read_req(struct drbd_work *, int); +extern int w_e_reissue(struct drbd_work *, int); +extern int w_restart_disk_io(struct drbd_work *, int); +extern int w_send_out_of_sync(struct drbd_work *, int); +extern int w_start_resync(struct drbd_work *, int); + +extern void resync_timer_fn(unsigned long data); +extern void start_resync_timer_fn(unsigned long data); + +extern void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req); + +/* drbd_receiver.c */ +extern int drbd_receiver(struct drbd_thread *thi); +extern int drbd_asender(struct drbd_thread *thi); +extern bool drbd_rs_c_min_rate_throttle(struct drbd_device *device); +extern bool drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector, + bool throttle_if_app_is_waiting); +extern int drbd_submit_peer_request(struct drbd_device *, + struct drbd_peer_request *, const unsigned, + const int); +extern int drbd_free_peer_reqs(struct drbd_device *, struct list_head *); +extern struct drbd_peer_request *drbd_alloc_peer_req(struct drbd_peer_device *, u64, + sector_t, unsigned int, + bool, + gfp_t) __must_hold(local); +extern void __drbd_free_peer_req(struct drbd_device *, struct 
drbd_peer_request *, + int); +#define drbd_free_peer_req(m,e) __drbd_free_peer_req(m, e, 0) +#define drbd_free_net_peer_req(m,e) __drbd_free_peer_req(m, e, 1) +extern struct page *drbd_alloc_pages(struct drbd_peer_device *, unsigned int, bool); +extern void drbd_set_recv_tcq(struct drbd_device *device, int tcq_enabled); +extern void _drbd_clear_done_ee(struct drbd_device *device, struct list_head *to_be_freed); +extern int drbd_connected(struct drbd_peer_device *); + +static inline void drbd_tcp_cork(struct socket *sock) +{ + int val = 1; + (void) kernel_setsockopt(sock, SOL_TCP, TCP_CORK, + (char*)&val, sizeof(val)); +} + +static inline void drbd_tcp_uncork(struct socket *sock) +{ + int val = 0; + (void) kernel_setsockopt(sock, SOL_TCP, TCP_CORK, + (char*)&val, sizeof(val)); +} + +static inline void drbd_tcp_nodelay(struct socket *sock) +{ + int val = 1; + (void) kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, + (char*)&val, sizeof(val)); +} + +static inline void drbd_tcp_quickack(struct socket *sock) +{ + int val = 2; + (void) kernel_setsockopt(sock, SOL_TCP, TCP_QUICKACK, + (char*)&val, sizeof(val)); +} + +/* sets the number of 512 byte sectors of our virtual device */ +static inline void drbd_set_my_capacity(struct drbd_device *device, + sector_t size) +{ + /* set_capacity(device->this_bdev->bd_disk, size); */ + set_capacity(device->vdisk, size); + device->this_bdev->bd_inode->i_size = (loff_t)size << 9; +} + +/* + * used to submit our private bio + */ +static inline void drbd_generic_make_request(struct drbd_device *device, + int fault_type, struct bio *bio) +{ + __release(local); + if (!bio->bi_bdev) { + drbd_err(device, "drbd_generic_make_request: bio->bi_bdev == NULL\n"); + bio_endio(bio, -ENODEV); + return; + } + + if (drbd_insert_fault(device, fault_type)) + bio_endio(bio, -EIO); + else + generic_make_request(bio); +} + +void drbd_bump_write_ordering(struct drbd_resource *resource, struct drbd_backing_dev *bdev, + enum write_ordering_e wo); + +/* drbd_proc.c */ +extern struct proc_dir_entry *drbd_proc; +extern const struct file_operations drbd_proc_fops; +extern const char *drbd_conn_str(enum drbd_conns s); +extern const char *drbd_role_str(enum drbd_role s); + +/* drbd_actlog.c */ +extern bool drbd_al_begin_io_prepare(struct drbd_device *device, struct drbd_interval *i); +extern int drbd_al_begin_io_nonblock(struct drbd_device *device, struct drbd_interval *i); +extern void drbd_al_begin_io_commit(struct drbd_device *device); +extern bool drbd_al_begin_io_fastpath(struct drbd_device *device, struct drbd_interval *i); +extern void drbd_al_begin_io(struct drbd_device *device, struct drbd_interval *i); +extern void drbd_al_complete_io(struct drbd_device *device, struct drbd_interval *i); +extern void drbd_rs_complete_io(struct drbd_device *device, sector_t sector); +extern int drbd_rs_begin_io(struct drbd_device *device, sector_t sector); +extern int drbd_try_rs_begin_io(struct drbd_device *device, sector_t sector); +extern void drbd_rs_cancel_all(struct drbd_device *device); +extern int drbd_rs_del_all(struct drbd_device *device); +extern void drbd_rs_failed_io(struct drbd_device *device, + sector_t sector, int size); +extern void drbd_advance_rs_marks(struct drbd_device *device, unsigned long still_to_go); + +enum update_sync_bits_mode { RECORD_RS_FAILED, SET_OUT_OF_SYNC, SET_IN_SYNC }; +extern int __drbd_change_sync(struct drbd_device *device, sector_t sector, int size, + enum update_sync_bits_mode mode); +#define drbd_set_in_sync(device, sector, size) \ + 
__drbd_change_sync(device, sector, size, SET_IN_SYNC) +#define drbd_set_out_of_sync(device, sector, size) \ + __drbd_change_sync(device, sector, size, SET_OUT_OF_SYNC) +#define drbd_rs_failed_io(device, sector, size) \ + __drbd_change_sync(device, sector, size, RECORD_RS_FAILED) +extern void drbd_al_shrink(struct drbd_device *device); +extern int drbd_initialize_al(struct drbd_device *, void *); + +/* drbd_nl.c */ +/* state info broadcast */ +struct sib_info { + enum drbd_state_info_bcast_reason sib_reason; + union { + struct { + char *helper_name; + unsigned helper_exit_code; + }; + struct { + union drbd_state os; + union drbd_state ns; + }; + }; +}; +void drbd_bcast_event(struct drbd_device *device, const struct sib_info *sib); + +/* + * inline helper functions + *************************/ + +/* see also page_chain_add and friends in drbd_receiver.c */ +static inline struct page *page_chain_next(struct page *page) +{ + return (struct page *)page_private(page); +} +#define page_chain_for_each(page) \ + for (; page && ({ prefetch(page_chain_next(page)); 1; }); \ + page = page_chain_next(page)) +#define page_chain_for_each_safe(page, n) \ + for (; page && ({ n = page_chain_next(page); 1; }); page = n) + + +static inline int drbd_peer_req_has_active_page(struct drbd_peer_request *peer_req) +{ + struct page *page = peer_req->pages; + page_chain_for_each(page) { + if (page_count(page) > 1) + return 1; + } + return 0; +} + +static inline enum drbd_state_rv +_drbd_set_state(struct drbd_device *device, union drbd_state ns, + enum chg_state_flags flags, struct completion *done) +{ + enum drbd_state_rv rv; + + read_lock(&global_state_lock); + rv = __drbd_set_state(device, ns, flags, done); + read_unlock(&global_state_lock); + + return rv; +} + +static inline union drbd_state drbd_read_state(struct drbd_device *device) +{ + struct drbd_resource *resource = device->resource; + union drbd_state rv; + + rv.i = device->state.i; + rv.susp = resource->susp; + rv.susp_nod = resource->susp_nod; + rv.susp_fen = resource->susp_fen; + + return rv; +} + +enum drbd_force_detach_flags { + DRBD_READ_ERROR, + DRBD_WRITE_ERROR, + DRBD_META_IO_ERROR, + DRBD_FORCE_DETACH, +}; + +#define __drbd_chk_io_error(m,f) __drbd_chk_io_error_(m,f, __func__) +static inline void __drbd_chk_io_error_(struct drbd_device *device, + enum drbd_force_detach_flags df, + const char *where) +{ + enum drbd_io_error_p ep; + + rcu_read_lock(); + ep = rcu_dereference(device->ldev->disk_conf)->on_io_error; + rcu_read_unlock(); + switch (ep) { + case EP_PASS_ON: /* FIXME would this be better named "Ignore"? */ + if (df == DRBD_READ_ERROR || df == DRBD_WRITE_ERROR) { + if (__ratelimit(&drbd_ratelimit_state)) + drbd_err(device, "Local IO failed in %s.\n", where); + if (device->state.disk > D_INCONSISTENT) + _drbd_set_state(_NS(device, disk, D_INCONSISTENT), CS_HARD, NULL); + break; + } + /* NOTE fall through for DRBD_META_IO_ERROR or DRBD_FORCE_DETACH */ + case EP_DETACH: + case EP_CALL_HELPER: + /* Remember whether we saw a READ or WRITE error. + * + * Recovery of the affected area for WRITE failure is covered + * by the activity log. + * READ errors may fall outside that area though. Certain READ + * errors can be "healed" by writing good data to the affected + * blocks, which triggers block re-allocation in lower layers. + * + * If we can not write the bitmap after a READ error, + * we may need to trigger a full sync (see w_go_diskless()). 
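+ *
+ * (Editorial usage sketch, not lifted verbatim from the driver: IO
+ *  completion paths report errors through the wrapper defined below, e.g.
+ *
+ *	if (error)
+ *		drbd_chk_io_error(device, error, DRBD_WRITE_ERROR);
+ *
+ *  where "error" is the value handed to the bi_end_io callback; the
+ *  wrapper takes req_lock and dispatches into this switch.)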
+ * + * Force-detach is not really an IO error, but rather a + * desperate measure to try to deal with a completely + * unresponsive lower level IO stack. + * Still it should be treated as a WRITE error. + * + * Meta IO error is always WRITE error: + * we read meta data only once during attach, + * which will fail in case of errors. + */ + set_bit(WAS_IO_ERROR, &device->flags); + if (df == DRBD_READ_ERROR) + set_bit(WAS_READ_ERROR, &device->flags); + if (df == DRBD_FORCE_DETACH) + set_bit(FORCE_DETACH, &device->flags); + if (device->state.disk > D_FAILED) { + _drbd_set_state(_NS(device, disk, D_FAILED), CS_HARD, NULL); + drbd_err(device, + "Local IO failed in %s. Detaching...\n", where); + } + break; + } +} + +/** + * drbd_chk_io_error: Handle the on_io_error setting, should be called from all io completion handlers + * @device: DRBD device. + * @error: Error code passed to the IO completion callback + * @forcedetach: Force detach. I.e. the error happened while accessing the meta data + * + * See also drbd_main.c:after_state_ch() if (os.disk > D_FAILED && ns.disk == D_FAILED) + */ +#define drbd_chk_io_error(m,e,f) drbd_chk_io_error_(m,e,f, __func__) +static inline void drbd_chk_io_error_(struct drbd_device *device, + int error, enum drbd_force_detach_flags forcedetach, const char *where) +{ + if (error) { + unsigned long flags; + spin_lock_irqsave(&device->resource->req_lock, flags); + __drbd_chk_io_error_(device, forcedetach, where); + spin_unlock_irqrestore(&device->resource->req_lock, flags); + } +} + + +/** + * drbd_md_first_sector() - Returns the first sector number of the meta data area + * @bdev: Meta data block device. + * + * BTW, for internal meta data, this happens to be the maximum capacity + * we could agree upon with our peer node. + */ +static inline sector_t drbd_md_first_sector(struct drbd_backing_dev *bdev) +{ + switch (bdev->md.meta_dev_idx) { + case DRBD_MD_INDEX_INTERNAL: + case DRBD_MD_INDEX_FLEX_INT: + return bdev->md.md_offset + bdev->md.bm_offset; + case DRBD_MD_INDEX_FLEX_EXT: + default: + return bdev->md.md_offset; + } +} + +/** + * drbd_md_last_sector() - Return the last sector number of the meta data area + * @bdev: Meta data block device. + */ +static inline sector_t drbd_md_last_sector(struct drbd_backing_dev *bdev) +{ + switch (bdev->md.meta_dev_idx) { + case DRBD_MD_INDEX_INTERNAL: + case DRBD_MD_INDEX_FLEX_INT: + return bdev->md.md_offset + MD_4kB_SECT -1; + case DRBD_MD_INDEX_FLEX_EXT: + default: + return bdev->md.md_offset + bdev->md.md_size_sect -1; + } +} + +/* Returns the number of 512 byte sectors of the device */ +static inline sector_t drbd_get_capacity(struct block_device *bdev) +{ + /* return bdev ? get_capacity(bdev->bd_disk) : 0; */ + return bdev ? i_size_read(bdev->bd_inode) >> 9 : 0; +} + +/** + * drbd_get_max_capacity() - Returns the capacity we announce to out peer + * @bdev: Meta data block device. + * + * returns the capacity we announce to out peer. we clip ourselves at the + * various MAX_SECTORS, because if we don't, current implementation will + * oops sooner or later + */ +static inline sector_t drbd_get_max_capacity(struct drbd_backing_dev *bdev) +{ + sector_t s; + + switch (bdev->md.meta_dev_idx) { + case DRBD_MD_INDEX_INTERNAL: + case DRBD_MD_INDEX_FLEX_INT: + s = drbd_get_capacity(bdev->backing_bdev) + ? 
min_t(sector_t, DRBD_MAX_SECTORS_FLEX, + drbd_md_first_sector(bdev)) + : 0; + break; + case DRBD_MD_INDEX_FLEX_EXT: + s = min_t(sector_t, DRBD_MAX_SECTORS_FLEX, + drbd_get_capacity(bdev->backing_bdev)); + /* clip at maximum size the meta device can support */ + s = min_t(sector_t, s, + BM_EXT_TO_SECT(bdev->md.md_size_sect + - bdev->md.bm_offset)); + break; + default: + s = min_t(sector_t, DRBD_MAX_SECTORS, + drbd_get_capacity(bdev->backing_bdev)); + } + return s; +} + +/** + * drbd_md_ss() - Return the sector number of our meta data super block + * @bdev: Meta data block device. + */ +static inline sector_t drbd_md_ss(struct drbd_backing_dev *bdev) +{ + const int meta_dev_idx = bdev->md.meta_dev_idx; + + if (meta_dev_idx == DRBD_MD_INDEX_FLEX_EXT) + return 0; + + /* Since drbd08, internal meta data is always "flexible". + * position: last 4k aligned block of 4k size */ + if (meta_dev_idx == DRBD_MD_INDEX_INTERNAL || + meta_dev_idx == DRBD_MD_INDEX_FLEX_INT) + return (drbd_get_capacity(bdev->backing_bdev) & ~7ULL) - 8; + + /* external, some index; this is the old fixed size layout */ + return MD_128MB_SECT * bdev->md.meta_dev_idx; +} + +static inline void +drbd_queue_work(struct drbd_work_queue *q, struct drbd_work *w) +{ + unsigned long flags; + spin_lock_irqsave(&q->q_lock, flags); + list_add_tail(&w->list, &q->q); + spin_unlock_irqrestore(&q->q_lock, flags); + wake_up(&q->q_wait); +} + +static inline void +drbd_queue_work_if_unqueued(struct drbd_work_queue *q, struct drbd_work *w) +{ + unsigned long flags; + spin_lock_irqsave(&q->q_lock, flags); + if (list_empty_careful(&w->list)) + list_add_tail(&w->list, &q->q); + spin_unlock_irqrestore(&q->q_lock, flags); + wake_up(&q->q_wait); +} + +static inline void +drbd_device_post_work(struct drbd_device *device, int work_bit) +{ + if (!test_and_set_bit(work_bit, &device->flags)) { + struct drbd_connection *connection = + first_peer_device(device)->connection; + struct drbd_work_queue *q = &connection->sender_work; + if (!test_and_set_bit(DEVICE_WORK_PENDING, &connection->flags)) + wake_up(&q->q_wait); + } +} + +extern void drbd_flush_workqueue(struct drbd_work_queue *work_queue); + +static inline void wake_asender(struct drbd_connection *connection) +{ + if (test_bit(SIGNAL_ASENDER, &connection->flags)) + force_sig(DRBD_SIG, connection->asender.task); +} + +static inline void request_ping(struct drbd_connection *connection) +{ + set_bit(SEND_PING, &connection->flags); + wake_asender(connection); +} + +extern void *conn_prepare_command(struct drbd_connection *, struct drbd_socket *); +extern void *drbd_prepare_command(struct drbd_peer_device *, struct drbd_socket *); +extern int conn_send_command(struct drbd_connection *, struct drbd_socket *, + enum drbd_packet, unsigned int, void *, + unsigned int); +extern int drbd_send_command(struct drbd_peer_device *, struct drbd_socket *, + enum drbd_packet, unsigned int, void *, + unsigned int); + +extern int drbd_send_ping(struct drbd_connection *connection); +extern int drbd_send_ping_ack(struct drbd_connection *connection); +extern int drbd_send_state_req(struct drbd_peer_device *, union drbd_state, union drbd_state); +extern int conn_send_state_req(struct drbd_connection *, union drbd_state, union drbd_state); + +static inline void drbd_thread_stop(struct drbd_thread *thi) +{ + _drbd_thread_stop(thi, false, true); +} + +static inline void drbd_thread_stop_nowait(struct drbd_thread *thi) +{ + _drbd_thread_stop(thi, false, false); +} + +static inline void drbd_thread_restart_nowait(struct drbd_thread 
*thi) +{ + _drbd_thread_stop(thi, true, false); +} + +/* counts how many answer packets packets we expect from our peer, + * for either explicit application requests, + * or implicit barrier packets as necessary. + * increased: + * w_send_barrier + * _req_mod(req, QUEUE_FOR_NET_WRITE or QUEUE_FOR_NET_READ); + * it is much easier and equally valid to count what we queue for the + * worker, even before it actually was queued or send. + * (drbd_make_request_common; recovery path on read io-error) + * decreased: + * got_BarrierAck (respective tl_clear, tl_clear_barrier) + * _req_mod(req, DATA_RECEIVED) + * [from receive_DataReply] + * _req_mod(req, WRITE_ACKED_BY_PEER or RECV_ACKED_BY_PEER or NEG_ACKED) + * [from got_BlockAck (P_WRITE_ACK, P_RECV_ACK)] + * for some reason it is NOT decreased in got_NegAck, + * but in the resulting cleanup code from report_params. + * we should try to remember the reason for that... + * _req_mod(req, SEND_FAILED or SEND_CANCELED) + * _req_mod(req, CONNECTION_LOST_WHILE_PENDING) + * [from tl_clear_barrier] + */ +static inline void inc_ap_pending(struct drbd_device *device) +{ + atomic_inc(&device->ap_pending_cnt); +} + +#define ERR_IF_CNT_IS_NEGATIVE(which, func, line) \ + if (atomic_read(&device->which) < 0) \ + drbd_err(device, "in %s:%d: " #which " = %d < 0 !\n", \ + func, line, \ + atomic_read(&device->which)) + +#define dec_ap_pending(device) _dec_ap_pending(device, __func__, __LINE__) +static inline void _dec_ap_pending(struct drbd_device *device, const char *func, int line) +{ + if (atomic_dec_and_test(&device->ap_pending_cnt)) + wake_up(&device->misc_wait); + ERR_IF_CNT_IS_NEGATIVE(ap_pending_cnt, func, line); +} + +/* counts how many resync-related answers we still expect from the peer + * increase decrease + * C_SYNC_TARGET sends P_RS_DATA_REQUEST (and expects P_RS_DATA_REPLY) + * C_SYNC_SOURCE sends P_RS_DATA_REPLY (and expects P_WRITE_ACK with ID_SYNCER) + * (or P_NEG_ACK with ID_SYNCER) + */ +static inline void inc_rs_pending(struct drbd_device *device) +{ + atomic_inc(&device->rs_pending_cnt); +} + +#define dec_rs_pending(device) _dec_rs_pending(device, __func__, __LINE__) +static inline void _dec_rs_pending(struct drbd_device *device, const char *func, int line) +{ + atomic_dec(&device->rs_pending_cnt); + ERR_IF_CNT_IS_NEGATIVE(rs_pending_cnt, func, line); +} + +/* counts how many answers we still need to send to the peer. 
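+ * (Editor's illustration of the accounting pattern, not driver code:
+ *
+ *	inc_unacked(device);	- an answer is owed once the peer request arrives
+ *	...			- process it, send the answer
+ *	dec_unacked(device);	- paid back once the answer went out
+ *
+ *  the concrete call sites are listed below.)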
+ * increased on + * receive_Data unless protocol A; + * we need to send a P_RECV_ACK (proto B) + * or P_WRITE_ACK (proto C) + * receive_RSDataReply (recv_resync_read) we need to send a P_WRITE_ACK + * receive_DataRequest (receive_RSDataRequest) we need to send back P_DATA + * receive_Barrier_* we need to send a P_BARRIER_ACK + */ +static inline void inc_unacked(struct drbd_device *device) +{ + atomic_inc(&device->unacked_cnt); +} + +#define dec_unacked(device) _dec_unacked(device, __func__, __LINE__) +static inline void _dec_unacked(struct drbd_device *device, const char *func, int line) +{ + atomic_dec(&device->unacked_cnt); + ERR_IF_CNT_IS_NEGATIVE(unacked_cnt, func, line); +} + +#define sub_unacked(device, n) _sub_unacked(device, n, __func__, __LINE__) +static inline void _sub_unacked(struct drbd_device *device, int n, const char *func, int line) +{ + atomic_sub(n, &device->unacked_cnt); + ERR_IF_CNT_IS_NEGATIVE(unacked_cnt, func, line); +} + +static inline bool is_sync_state(enum drbd_conns connection_state) +{ + return + (connection_state == C_SYNC_SOURCE + || connection_state == C_SYNC_TARGET + || connection_state == C_PAUSED_SYNC_S + || connection_state == C_PAUSED_SYNC_T); +} + +/** + * get_ldev() - Increase the ref count on device->ldev. Returns 0 if there is no ldev + * @_device: DRBD device. + * @_min_state: Minimum device state required for success. + * + * You have to call put_ldev() when finished working with device->ldev. + */ +#define get_ldev_if_state(_device, _min_state) \ + (_get_ldev_if_state((_device), (_min_state)) ? \ + ({ __acquire(x); true; }) : false) +#define get_ldev(_device) get_ldev_if_state(_device, D_INCONSISTENT) + +static inline void put_ldev(struct drbd_device *device) +{ + enum drbd_disk_state disk_state = device->state.disk; + /* We must check the state *before* the atomic_dec becomes visible, + * or we have a theoretical race where someone hitting zero, + * while state still D_FAILED, will then see D_DISKLESS in the + * condition below and calling into destroy, where he must not, yet. */ + int i = atomic_dec_return(&device->local_cnt); + + /* This may be called from some endio handler, + * so we must not sleep here. */ + + __release(local); + D_ASSERT(device, i >= 0); + if (i == 0) { + if (disk_state == D_DISKLESS) + /* even internal references gone, safe to destroy */ + drbd_device_post_work(device, DESTROY_DISK); + if (disk_state == D_FAILED) + /* all application IO references gone. */ + if (!test_and_set_bit(GOING_DISKLESS, &device->flags)) + drbd_device_post_work(device, GO_DISKLESS); + wake_up(&device->misc_wait); + } +} + +#ifndef __CHECKER__ +static inline int _get_ldev_if_state(struct drbd_device *device, enum drbd_disk_state mins) +{ + int io_allowed; + + /* never get a reference while D_DISKLESS */ + if (device->state.disk == D_DISKLESS) + return 0; + + atomic_inc(&device->local_cnt); + io_allowed = (device->state.disk >= mins); + if (!io_allowed) + put_ldev(device); + return io_allowed; +} +#else +extern int _get_ldev_if_state(struct drbd_device *device, enum drbd_disk_state mins); +#endif + +/* this throttles on-the-fly application requests + * according to max_buffers settings; + * maybe re-implement using semaphores? */ +static inline int drbd_get_max_buffers(struct drbd_device *device) +{ + struct net_conf *nc; + int mxb; + + rcu_read_lock(); + nc = rcu_dereference(first_peer_device(device)->connection->net_conf); + mxb = nc ? 
nc->max_buffers : 1000000; /* arbitrary limit on open requests */ + rcu_read_unlock(); + + return mxb; +} + +static inline int drbd_state_is_stable(struct drbd_device *device) +{ + union drbd_dev_state s = device->state; + + /* DO NOT add a default clause, we want the compiler to warn us + * for any newly introduced state we may have forgotten to add here */ + + switch ((enum drbd_conns)s.conn) { + /* new io only accepted when there is no connection, ... */ + case C_STANDALONE: + case C_WF_CONNECTION: + /* ... or there is a well established connection. */ + case C_CONNECTED: + case C_SYNC_SOURCE: + case C_SYNC_TARGET: + case C_VERIFY_S: + case C_VERIFY_T: + case C_PAUSED_SYNC_S: + case C_PAUSED_SYNC_T: + case C_AHEAD: + case C_BEHIND: + /* transitional states, IO allowed */ + case C_DISCONNECTING: + case C_UNCONNECTED: + case C_TIMEOUT: + case C_BROKEN_PIPE: + case C_NETWORK_FAILURE: + case C_PROTOCOL_ERROR: + case C_TEAR_DOWN: + case C_WF_REPORT_PARAMS: + case C_STARTING_SYNC_S: + case C_STARTING_SYNC_T: + break; + + /* Allow IO in BM exchange states with new protocols */ + case C_WF_BITMAP_S: + if (first_peer_device(device)->connection->agreed_pro_version < 96) + return 0; + break; + + /* no new io accepted in these states */ + case C_WF_BITMAP_T: + case C_WF_SYNC_UUID: + case C_MASK: + /* not "stable" */ + return 0; + } + + switch ((enum drbd_disk_state)s.disk) { + case D_DISKLESS: + case D_INCONSISTENT: + case D_OUTDATED: + case D_CONSISTENT: + case D_UP_TO_DATE: + case D_FAILED: + /* disk state is stable as well. */ + break; + + /* no new io accepted during transitional states */ + case D_ATTACHING: + case D_NEGOTIATING: + case D_UNKNOWN: + case D_MASK: + /* not "stable" */ + return 0; + } + + return 1; +} + +static inline int drbd_suspended(struct drbd_device *device) +{ + struct drbd_resource *resource = device->resource; + + return resource->susp || resource->susp_fen || resource->susp_nod; +} + +static inline bool may_inc_ap_bio(struct drbd_device *device) +{ + int mxb = drbd_get_max_buffers(device); + + if (drbd_suspended(device)) + return false; + if (test_bit(SUSPEND_IO, &device->flags)) + return false; + + /* to avoid potential deadlock or bitmap corruption, + * in various places, we only allow new application io + * to start during "stable" states. */ + + /* no new io accepted when attaching or detaching the disk */ + if (!drbd_state_is_stable(device)) + return false; + + /* since some older kernels don't have atomic_add_unless, + * and we are within the spinlock anyways, we have this workaround. */ + if (atomic_read(&device->ap_bio_cnt) > mxb) + return false; + if (test_bit(BITMAP_IO, &device->flags)) + return false; + return true; +} + +static inline bool inc_ap_bio_cond(struct drbd_device *device) +{ + bool rv = false; + + spin_lock_irq(&device->resource->req_lock); + rv = may_inc_ap_bio(device); + if (rv) + atomic_inc(&device->ap_bio_cnt); + spin_unlock_irq(&device->resource->req_lock); + + return rv; +} + +static inline void inc_ap_bio(struct drbd_device *device) +{ + /* we wait here + * as long as the device is suspended + * until the bitmap is no longer on the fly during connection + * handshake as long as we would exceed the max_buffer limit. + * + * to avoid races with the reconnect code, + * we need to atomic_inc within the spinlock. 
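+ *
+ * (Editor's note, illustrative only: a submit path brackets each
+ *  application bio roughly as
+ *
+ *	inc_ap_bio(device);	- may block until IO is allowed again
+ *	...			- hand the request to the worker / network
+ *	dec_ap_bio(device);	- on completion; wakes waiters, kicks bitmap IO
+ *
+ *  compare __drbd_make_request() and the request completion code in
+ *  drbd_req.c.)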
*/ + + wait_event(device->misc_wait, inc_ap_bio_cond(device)); +} + +static inline void dec_ap_bio(struct drbd_device *device) +{ + int mxb = drbd_get_max_buffers(device); + int ap_bio = atomic_dec_return(&device->ap_bio_cnt); + + D_ASSERT(device, ap_bio >= 0); + + if (ap_bio == 0 && test_bit(BITMAP_IO, &device->flags)) { + if (!test_and_set_bit(BITMAP_IO_QUEUED, &device->flags)) + drbd_queue_work(&first_peer_device(device)-> + connection->sender_work, + &device->bm_io_work.w); + } + + /* this currently does wake_up for every dec_ap_bio! + * maybe rather introduce some type of hysteresis? + * e.g. (ap_bio == mxb/2 || ap_bio == 0) ? */ + if (ap_bio < mxb) + wake_up(&device->misc_wait); +} + +static inline bool verify_can_do_stop_sector(struct drbd_device *device) +{ + return first_peer_device(device)->connection->agreed_pro_version >= 97 && + first_peer_device(device)->connection->agreed_pro_version != 100; +} + +static inline int drbd_set_ed_uuid(struct drbd_device *device, u64 val) +{ + int changed = device->ed_uuid != val; + device->ed_uuid = val; + return changed; +} + +static inline int drbd_queue_order_type(struct drbd_device *device) +{ + /* sorry, we currently have no working implementation + * of distributed TCQ stuff */ +#ifndef QUEUE_ORDERED_NONE +#define QUEUE_ORDERED_NONE 0 +#endif + return QUEUE_ORDERED_NONE; +} + +static inline struct drbd_connection *first_connection(struct drbd_resource *resource) +{ + return list_first_entry_or_null(&resource->connections, + struct drbd_connection, connections); +} + +#endif diff --git a/drivers/block/drbd/drbd_interval.c b/drivers/block/drbd/drbd_interval.c new file mode 100644 index 000000000..51b25ad85 --- /dev/null +++ b/drivers/block/drbd/drbd_interval.c @@ -0,0 +1,179 @@ +#include <asm/bug.h> +#include <linux/rbtree_augmented.h> +#include "drbd_interval.h" + +/** + * interval_end - return end of @node + */ +static inline +sector_t interval_end(struct rb_node *node) +{ + struct drbd_interval *this = rb_entry(node, struct drbd_interval, rb); + return this->end; +} + +/** + * compute_subtree_last - compute end of @node + * + * The end of an interval is the highest (start + (size >> 9)) value of this + * node and of its children. Called for @node and its parents whenever the end + * may have changed. 
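+ *
+ * (Editorial example: a node starting at sector 1024 with size 4096 bytes
+ *  ends at 1024 + (4096 >> 9) = 1032; if the left subtree's stored end is
+ *  1040 and the right one's is 1030, this node's ->end becomes 1040.)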
+ */ +static inline sector_t +compute_subtree_last(struct drbd_interval *node) +{ + sector_t max = node->sector + (node->size >> 9); + + if (node->rb.rb_left) { + sector_t left = interval_end(node->rb.rb_left); + if (left > max) + max = left; + } + if (node->rb.rb_right) { + sector_t right = interval_end(node->rb.rb_right); + if (right > max) + max = right; + } + return max; +} + +RB_DECLARE_CALLBACKS(static, augment_callbacks, struct drbd_interval, rb, + sector_t, end, compute_subtree_last); + +/** + * drbd_insert_interval - insert a new interval into a tree + */ +bool +drbd_insert_interval(struct rb_root *root, struct drbd_interval *this) +{ + struct rb_node **new = &root->rb_node, *parent = NULL; + sector_t this_end = this->sector + (this->size >> 9); + + BUG_ON(!IS_ALIGNED(this->size, 512)); + + while (*new) { + struct drbd_interval *here = + rb_entry(*new, struct drbd_interval, rb); + + parent = *new; + if (here->end < this_end) + here->end = this_end; + if (this->sector < here->sector) + new = &(*new)->rb_left; + else if (this->sector > here->sector) + new = &(*new)->rb_right; + else if (this < here) + new = &(*new)->rb_left; + else if (this > here) + new = &(*new)->rb_right; + else + return false; + } + + this->end = this_end; + rb_link_node(&this->rb, parent, new); + rb_insert_augmented(&this->rb, root, &augment_callbacks); + return true; +} + +/** + * drbd_contains_interval - check if a tree contains a given interval + * @sector: start sector of @interval + * @interval: may not be a valid pointer + * + * Returns if the tree contains the node @interval with start sector @start. + * Does not dereference @interval until @interval is known to be a valid object + * in @tree. Returns %false if @interval is in the tree but with a different + * sector number. + */ +bool +drbd_contains_interval(struct rb_root *root, sector_t sector, + struct drbd_interval *interval) +{ + struct rb_node *node = root->rb_node; + + while (node) { + struct drbd_interval *here = + rb_entry(node, struct drbd_interval, rb); + + if (sector < here->sector) + node = node->rb_left; + else if (sector > here->sector) + node = node->rb_right; + else if (interval < here) + node = node->rb_left; + else if (interval > here) + node = node->rb_right; + else + return true; + } + return false; +} + +/** + * drbd_remove_interval - remove an interval from a tree + */ +void +drbd_remove_interval(struct rb_root *root, struct drbd_interval *this) +{ + rb_erase_augmented(&this->rb, root, &augment_callbacks); +} + +/** + * drbd_find_overlap - search for an interval overlapping with [sector, sector + size) + * @sector: start sector + * @size: size, aligned to 512 bytes + * + * Returns an interval overlapping with [sector, sector + size), or NULL if + * there is none. When there is more than one overlapping interval in the + * tree, the interval with the lowest start sector is returned, and all other + * overlapping intervals will be on the right side of the tree, reachable with + * rb_next(). 
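+ *
+ * (Editorial usage sketch, assuming a populated tree such as the device's
+ *  write_requests tree:
+ *
+ *	struct drbd_interval *i;
+ *	drbd_for_each_overlap(i, &device->write_requests, sector, size)
+ *		handle_conflict(i);	- hypothetical handler
+ *
+ *  drbd_for_each_overlap() is the convenience macro from drbd_interval.h,
+ *  built on drbd_find_overlap()/drbd_next_overlap().)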
+ */ +struct drbd_interval * +drbd_find_overlap(struct rb_root *root, sector_t sector, unsigned int size) +{ + struct rb_node *node = root->rb_node; + struct drbd_interval *overlap = NULL; + sector_t end = sector + (size >> 9); + + BUG_ON(!IS_ALIGNED(size, 512)); + + while (node) { + struct drbd_interval *here = + rb_entry(node, struct drbd_interval, rb); + + if (node->rb_left && + sector < interval_end(node->rb_left)) { + /* Overlap if any must be on left side */ + node = node->rb_left; + } else if (here->sector < end && + sector < here->sector + (here->size >> 9)) { + overlap = here; + break; + } else if (sector >= here->sector) { + /* Overlap if any must be on right side */ + node = node->rb_right; + } else + break; + } + return overlap; +} + +struct drbd_interval * +drbd_next_overlap(struct drbd_interval *i, sector_t sector, unsigned int size) +{ + sector_t end = sector + (size >> 9); + struct rb_node *node; + + for (;;) { + node = rb_next(&i->rb); + if (!node) + return NULL; + i = rb_entry(node, struct drbd_interval, rb); + if (i->sector >= end) + return NULL; + if (sector < i->sector + (i->size >> 9)) + return i; + } +} diff --git a/drivers/block/drbd/drbd_interval.h b/drivers/block/drbd/drbd_interval.h new file mode 100644 index 000000000..f210543f0 --- /dev/null +++ b/drivers/block/drbd/drbd_interval.h @@ -0,0 +1,42 @@ +#ifndef __DRBD_INTERVAL_H +#define __DRBD_INTERVAL_H + +#include <linux/types.h> +#include <linux/rbtree.h> + +struct drbd_interval { + struct rb_node rb; + sector_t sector; /* start sector of the interval */ + unsigned int size; /* size in bytes */ + sector_t end; /* highest interval end in subtree */ + int local:1 /* local or remote request? */; + int waiting:1; /* someone is waiting for this to complete */ + int completed:1; /* this has been completed already; + * ignore for conflict detection */ +}; + +static inline void drbd_clear_interval(struct drbd_interval *i) +{ + RB_CLEAR_NODE(&i->rb); +} + +static inline bool drbd_interval_empty(struct drbd_interval *i) +{ + return RB_EMPTY_NODE(&i->rb); +} + +extern bool drbd_insert_interval(struct rb_root *, struct drbd_interval *); +extern bool drbd_contains_interval(struct rb_root *, sector_t, + struct drbd_interval *); +extern void drbd_remove_interval(struct rb_root *, struct drbd_interval *); +extern struct drbd_interval *drbd_find_overlap(struct rb_root *, sector_t, + unsigned int); +extern struct drbd_interval *drbd_next_overlap(struct drbd_interval *, sector_t, + unsigned int); + +#define drbd_for_each_overlap(i, root, sector, size) \ + for (i = drbd_find_overlap(root, sector, size); \ + i; \ + i = drbd_next_overlap(i, sector, size)) + +#endif /* __DRBD_INTERVAL_H */ diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c new file mode 100644 index 000000000..81fde9ef7 --- /dev/null +++ b/drivers/block/drbd/drbd_main.c @@ -0,0 +1,3844 @@ +/* + drbd.c + + This file is part of DRBD by Philipp Reisner and Lars Ellenberg. + + Copyright (C) 2001-2008, LINBIT Information Technologies GmbH. + Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>. + Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. + + Thanks to Carter Burden, Bart Grantham and Gennadiy Nerubayev + from Logicworks, Inc. for making SDP replication support possible. + + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. 
+ + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/module.h> +#include <linux/jiffies.h> +#include <linux/drbd.h> +#include <asm/uaccess.h> +#include <asm/types.h> +#include <net/sock.h> +#include <linux/ctype.h> +#include <linux/mutex.h> +#include <linux/fs.h> +#include <linux/file.h> +#include <linux/proc_fs.h> +#include <linux/init.h> +#include <linux/mm.h> +#include <linux/memcontrol.h> +#include <linux/mm_inline.h> +#include <linux/slab.h> +#include <linux/random.h> +#include <linux/reboot.h> +#include <linux/notifier.h> +#include <linux/kthread.h> +#include <linux/workqueue.h> +#define __KERNEL_SYSCALLS__ +#include <linux/unistd.h> +#include <linux/vmalloc.h> + +#include <linux/drbd_limits.h> +#include "drbd_int.h" +#include "drbd_protocol.h" +#include "drbd_req.h" /* only for _req_mod in tl_release and tl_clear */ +#include "drbd_vli.h" +#include "drbd_debugfs.h" + +static DEFINE_MUTEX(drbd_main_mutex); +static int drbd_open(struct block_device *bdev, fmode_t mode); +static void drbd_release(struct gendisk *gd, fmode_t mode); +static void md_sync_timer_fn(unsigned long data); +static int w_bitmap_io(struct drbd_work *w, int unused); + +MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, " + "Lars Ellenberg <lars@linbit.com>"); +MODULE_DESCRIPTION("drbd - Distributed Replicated Block Device v" REL_VERSION); +MODULE_VERSION(REL_VERSION); +MODULE_LICENSE("GPL"); +MODULE_PARM_DESC(minor_count, "Approximate number of drbd devices (" + __stringify(DRBD_MINOR_COUNT_MIN) "-" __stringify(DRBD_MINOR_COUNT_MAX) ")"); +MODULE_ALIAS_BLOCKDEV_MAJOR(DRBD_MAJOR); + +#include <linux/moduleparam.h> +/* allow_open_on_secondary */ +MODULE_PARM_DESC(allow_oos, "DONT USE!"); +/* thanks to these macros, if compiled into the kernel (not-module), + * this becomes the boot parameter drbd.minor_count */ +module_param(minor_count, uint, 0444); +module_param(disable_sendpage, bool, 0644); +module_param(allow_oos, bool, 0); +module_param(proc_details, int, 0644); + +#ifdef CONFIG_DRBD_FAULT_INJECTION +int enable_faults; +int fault_rate; +static int fault_count; +int fault_devs; +/* bitmap of enabled faults */ +module_param(enable_faults, int, 0664); +/* fault rate % value - applies to all enabled faults */ +module_param(fault_rate, int, 0664); +/* count of faults inserted */ +module_param(fault_count, int, 0664); +/* bitmap of devices to insert faults on */ +module_param(fault_devs, int, 0644); +#endif + +/* module parameter, defined */ +unsigned int minor_count = DRBD_MINOR_COUNT_DEF; +bool disable_sendpage; +bool allow_oos; +int proc_details; /* Detail level in proc drbd*/ + +/* Module parameter for setting the user mode helper program + * to run. 
Default is /sbin/drbdadm */ +char usermode_helper[80] = "/sbin/drbdadm"; + +module_param_string(usermode_helper, usermode_helper, sizeof(usermode_helper), 0644); + +/* in 2.6.x, our device mapping and config info contains our virtual gendisks + * as member "struct gendisk *vdisk;" + */ +struct idr drbd_devices; +struct list_head drbd_resources; + +struct kmem_cache *drbd_request_cache; +struct kmem_cache *drbd_ee_cache; /* peer requests */ +struct kmem_cache *drbd_bm_ext_cache; /* bitmap extents */ +struct kmem_cache *drbd_al_ext_cache; /* activity log extents */ +mempool_t *drbd_request_mempool; +mempool_t *drbd_ee_mempool; +mempool_t *drbd_md_io_page_pool; +struct bio_set *drbd_md_io_bio_set; + +/* I do not use a standard mempool, because: + 1) I want to hand out the pre-allocated objects first. + 2) I want to be able to interrupt sleeping allocation with a signal. + Note: This is a single linked list, the next pointer is the private + member of struct page. + */ +struct page *drbd_pp_pool; +spinlock_t drbd_pp_lock; +int drbd_pp_vacant; +wait_queue_head_t drbd_pp_wait; + +DEFINE_RATELIMIT_STATE(drbd_ratelimit_state, 5 * HZ, 5); + +static const struct block_device_operations drbd_ops = { + .owner = THIS_MODULE, + .open = drbd_open, + .release = drbd_release, +}; + +struct bio *bio_alloc_drbd(gfp_t gfp_mask) +{ + struct bio *bio; + + if (!drbd_md_io_bio_set) + return bio_alloc(gfp_mask, 1); + + bio = bio_alloc_bioset(gfp_mask, 1, drbd_md_io_bio_set); + if (!bio) + return NULL; + return bio; +} + +#ifdef __CHECKER__ +/* When checking with sparse, and this is an inline function, sparse will + give tons of false positives. When this is a real functions sparse works. + */ +int _get_ldev_if_state(struct drbd_device *device, enum drbd_disk_state mins) +{ + int io_allowed; + + atomic_inc(&device->local_cnt); + io_allowed = (device->state.disk >= mins); + if (!io_allowed) { + if (atomic_dec_and_test(&device->local_cnt)) + wake_up(&device->misc_wait); + } + return io_allowed; +} + +#endif + +/** + * tl_release() - mark as BARRIER_ACKED all requests in the corresponding transfer log epoch + * @connection: DRBD connection. + * @barrier_nr: Expected identifier of the DRBD write barrier packet. + * @set_size: Expected number of requests before that barrier. + * + * In case the passed barrier_nr or set_size does not match the oldest + * epoch of not yet barrier-acked requests, this function will cause a + * termination of the connection. + */ +void tl_release(struct drbd_connection *connection, unsigned int barrier_nr, + unsigned int set_size) +{ + struct drbd_request *r; + struct drbd_request *req = NULL; + int expect_epoch = 0; + int expect_size = 0; + + spin_lock_irq(&connection->resource->req_lock); + + /* find oldest not yet barrier-acked write request, + * count writes in its epoch. */ + list_for_each_entry(r, &connection->transfer_log, tl_requests) { + const unsigned s = r->rq_state; + if (!req) { + if (!(s & RQ_WRITE)) + continue; + if (!(s & RQ_NET_MASK)) + continue; + if (s & RQ_NET_DONE) + continue; + req = r; + expect_epoch = req->epoch; + expect_size ++; + } else { + if (r->epoch != expect_epoch) + break; + if (!(s & RQ_WRITE)) + continue; + /* if (s & RQ_DONE): not expected */ + /* if (!(s & RQ_NET_MASK)): not expected */ + expect_size++; + } + } + + /* first some paranoia code */ + if (req == NULL) { + drbd_err(connection, "BAD! BarrierAck #%u received, but no epoch in tl!?\n", + barrier_nr); + goto bail; + } + if (expect_epoch != barrier_nr) { + drbd_err(connection, "BAD! 
BarrierAck #%u received, expected #%u!\n", + barrier_nr, expect_epoch); + goto bail; + } + + if (expect_size != set_size) { + drbd_err(connection, "BAD! BarrierAck #%u received with n_writes=%u, expected n_writes=%u!\n", + barrier_nr, set_size, expect_size); + goto bail; + } + + /* Clean up list of requests processed during current epoch. */ + /* this extra list walk restart is paranoia, + * to catch requests being barrier-acked "unexpectedly". + * It usually should find the same req again, or some READ preceding it. */ + list_for_each_entry(req, &connection->transfer_log, tl_requests) + if (req->epoch == expect_epoch) + break; + list_for_each_entry_safe_from(req, r, &connection->transfer_log, tl_requests) { + if (req->epoch != expect_epoch) + break; + _req_mod(req, BARRIER_ACKED); + } + spin_unlock_irq(&connection->resource->req_lock); + + return; + +bail: + spin_unlock_irq(&connection->resource->req_lock); + conn_request_state(connection, NS(conn, C_PROTOCOL_ERROR), CS_HARD); +} + + +/** + * _tl_restart() - Walks the transfer log, and applies an action to all requests + * @connection: DRBD connection to operate on. + * @what: The action/event to perform with all request objects + * + * @what might be one of CONNECTION_LOST_WHILE_PENDING, RESEND, FAIL_FROZEN_DISK_IO, + * RESTART_FROZEN_DISK_IO. + */ +/* must hold resource->req_lock */ +void _tl_restart(struct drbd_connection *connection, enum drbd_req_event what) +{ + struct drbd_request *req, *r; + + list_for_each_entry_safe(req, r, &connection->transfer_log, tl_requests) + _req_mod(req, what); +} + +void tl_restart(struct drbd_connection *connection, enum drbd_req_event what) +{ + spin_lock_irq(&connection->resource->req_lock); + _tl_restart(connection, what); + spin_unlock_irq(&connection->resource->req_lock); +} + +/** + * tl_clear() - Clears all requests and &struct drbd_tl_epoch objects out of the TL + * @device: DRBD device. + * + * This is called after the connection to the peer was lost. The storage covered + * by the requests on the transfer gets marked as our of sync. Called from the + * receiver thread and the worker thread. + */ +void tl_clear(struct drbd_connection *connection) +{ + tl_restart(connection, CONNECTION_LOST_WHILE_PENDING); +} + +/** + * tl_abort_disk_io() - Abort disk I/O for all requests for a certain device in the TL + * @device: DRBD device. + */ +void tl_abort_disk_io(struct drbd_device *device) +{ + struct drbd_connection *connection = first_peer_device(device)->connection; + struct drbd_request *req, *r; + + spin_lock_irq(&connection->resource->req_lock); + list_for_each_entry_safe(req, r, &connection->transfer_log, tl_requests) { + if (!(req->rq_state & RQ_LOCAL_PENDING)) + continue; + if (req->device != device) + continue; + _req_mod(req, ABORT_DISK_IO); + } + spin_unlock_irq(&connection->resource->req_lock); +} + +static int drbd_thread_setup(void *arg) +{ + struct drbd_thread *thi = (struct drbd_thread *) arg; + struct drbd_resource *resource = thi->resource; + unsigned long flags; + int retval; + + snprintf(current->comm, sizeof(current->comm), "drbd_%c_%s", + thi->name[0], + resource->name); + +restart: + retval = thi->function(thi); + + spin_lock_irqsave(&thi->t_lock, flags); + + /* if the receiver has been "EXITING", the last thing it did + * was set the conn state to "StandAlone", + * if now a re-connect request comes in, conn state goes C_UNCONNECTED, + * and receiver thread will be "started". + * drbd_thread_start needs to set "RESTARTING" in that case. 
+ * t_state check and assignment needs to be within the same spinlock, + * so either thread_start sees EXITING, and can remap to RESTARTING, + * or thread_start see NONE, and can proceed as normal. + */ + + if (thi->t_state == RESTARTING) { + drbd_info(resource, "Restarting %s thread\n", thi->name); + thi->t_state = RUNNING; + spin_unlock_irqrestore(&thi->t_lock, flags); + goto restart; + } + + thi->task = NULL; + thi->t_state = NONE; + smp_mb(); + complete_all(&thi->stop); + spin_unlock_irqrestore(&thi->t_lock, flags); + + drbd_info(resource, "Terminating %s\n", current->comm); + + /* Release mod reference taken when thread was started */ + + if (thi->connection) + kref_put(&thi->connection->kref, drbd_destroy_connection); + kref_put(&resource->kref, drbd_destroy_resource); + module_put(THIS_MODULE); + return retval; +} + +static void drbd_thread_init(struct drbd_resource *resource, struct drbd_thread *thi, + int (*func) (struct drbd_thread *), const char *name) +{ + spin_lock_init(&thi->t_lock); + thi->task = NULL; + thi->t_state = NONE; + thi->function = func; + thi->resource = resource; + thi->connection = NULL; + thi->name = name; +} + +int drbd_thread_start(struct drbd_thread *thi) +{ + struct drbd_resource *resource = thi->resource; + struct task_struct *nt; + unsigned long flags; + + /* is used from state engine doing drbd_thread_stop_nowait, + * while holding the req lock irqsave */ + spin_lock_irqsave(&thi->t_lock, flags); + + switch (thi->t_state) { + case NONE: + drbd_info(resource, "Starting %s thread (from %s [%d])\n", + thi->name, current->comm, current->pid); + + /* Get ref on module for thread - this is released when thread exits */ + if (!try_module_get(THIS_MODULE)) { + drbd_err(resource, "Failed to get module reference in drbd_thread_start\n"); + spin_unlock_irqrestore(&thi->t_lock, flags); + return false; + } + + kref_get(&resource->kref); + if (thi->connection) + kref_get(&thi->connection->kref); + + init_completion(&thi->stop); + thi->reset_cpu_mask = 1; + thi->t_state = RUNNING; + spin_unlock_irqrestore(&thi->t_lock, flags); + flush_signals(current); /* otherw. may get -ERESTARTNOINTR */ + + nt = kthread_create(drbd_thread_setup, (void *) thi, + "drbd_%c_%s", thi->name[0], thi->resource->name); + + if (IS_ERR(nt)) { + drbd_err(resource, "Couldn't start thread\n"); + + if (thi->connection) + kref_put(&thi->connection->kref, drbd_destroy_connection); + kref_put(&resource->kref, drbd_destroy_resource); + module_put(THIS_MODULE); + return false; + } + spin_lock_irqsave(&thi->t_lock, flags); + thi->task = nt; + thi->t_state = RUNNING; + spin_unlock_irqrestore(&thi->t_lock, flags); + wake_up_process(nt); + break; + case EXITING: + thi->t_state = RESTARTING; + drbd_info(resource, "Restarting %s thread (from %s [%d])\n", + thi->name, current->comm, current->pid); + /* fall through */ + case RUNNING: + case RESTARTING: + default: + spin_unlock_irqrestore(&thi->t_lock, flags); + break; + } + + return true; +} + + +void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait) +{ + unsigned long flags; + + enum drbd_thread_state ns = restart ? 
RESTARTING : EXITING; + + /* may be called from state engine, holding the req lock irqsave */ + spin_lock_irqsave(&thi->t_lock, flags); + + if (thi->t_state == NONE) { + spin_unlock_irqrestore(&thi->t_lock, flags); + if (restart) + drbd_thread_start(thi); + return; + } + + if (thi->t_state != ns) { + if (thi->task == NULL) { + spin_unlock_irqrestore(&thi->t_lock, flags); + return; + } + + thi->t_state = ns; + smp_mb(); + init_completion(&thi->stop); + if (thi->task != current) + force_sig(DRBD_SIGKILL, thi->task); + } + + spin_unlock_irqrestore(&thi->t_lock, flags); + + if (wait) + wait_for_completion(&thi->stop); +} + +int conn_lowest_minor(struct drbd_connection *connection) +{ + struct drbd_peer_device *peer_device; + int vnr = 0, minor = -1; + + rcu_read_lock(); + peer_device = idr_get_next(&connection->peer_devices, &vnr); + if (peer_device) + minor = device_to_minor(peer_device->device); + rcu_read_unlock(); + + return minor; +} + +#ifdef CONFIG_SMP +/** + * drbd_calc_cpu_mask() - Generate CPU masks, spread over all CPUs + * + * Forces all threads of a resource onto the same CPU. This is beneficial for + * DRBD's performance. May be overwritten by user's configuration. + */ +static void drbd_calc_cpu_mask(cpumask_var_t *cpu_mask) +{ + unsigned int *resources_per_cpu, min_index = ~0; + + resources_per_cpu = kzalloc(nr_cpu_ids * sizeof(*resources_per_cpu), GFP_KERNEL); + if (resources_per_cpu) { + struct drbd_resource *resource; + unsigned int cpu, min = ~0; + + rcu_read_lock(); + for_each_resource_rcu(resource, &drbd_resources) { + for_each_cpu(cpu, resource->cpu_mask) + resources_per_cpu[cpu]++; + } + rcu_read_unlock(); + for_each_online_cpu(cpu) { + if (resources_per_cpu[cpu] < min) { + min = resources_per_cpu[cpu]; + min_index = cpu; + } + } + kfree(resources_per_cpu); + } + if (min_index == ~0) { + cpumask_setall(*cpu_mask); + return; + } + cpumask_set_cpu(min_index, *cpu_mask); +} + +/** + * drbd_thread_current_set_cpu() - modifies the cpu mask of the _current_ thread + * @device: DRBD device. + * @thi: drbd_thread object + * + * call in the "main loop" of _all_ threads, no need for any mutex, current won't die + * prematurely. + */ +void drbd_thread_current_set_cpu(struct drbd_thread *thi) +{ + struct drbd_resource *resource = thi->resource; + struct task_struct *p = current; + + if (!thi->reset_cpu_mask) + return; + thi->reset_cpu_mask = 0; + set_cpus_allowed_ptr(p, resource->cpu_mask); +} +#else +#define drbd_calc_cpu_mask(A) ({}) +#endif + +/** + * drbd_header_size - size of a packet header + * + * The header size is a multiple of 8, so any payload following the header is + * word aligned on 64-bit architectures. (The bitmap send and receive code + * relies on this.) 
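+ *
+ * (Editor's note, derived from the prepare_header*() helpers below: for
+ *  agreed_pro_version >= 100 this is the 16-byte p_header100 (4-byte magic,
+ *  2-byte volume, 2-byte command, 4-byte length plus padding); older peers
+ *  get the 8-byte p_header80/95 layout. Both are multiples of 8, as the
+ *  BUILD_BUG_ONs below assert.)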
+ */ +unsigned int drbd_header_size(struct drbd_connection *connection) +{ + if (connection->agreed_pro_version >= 100) { + BUILD_BUG_ON(!IS_ALIGNED(sizeof(struct p_header100), 8)); + return sizeof(struct p_header100); + } else { + BUILD_BUG_ON(sizeof(struct p_header80) != + sizeof(struct p_header95)); + BUILD_BUG_ON(!IS_ALIGNED(sizeof(struct p_header80), 8)); + return sizeof(struct p_header80); + } +} + +static unsigned int prepare_header80(struct p_header80 *h, enum drbd_packet cmd, int size) +{ + h->magic = cpu_to_be32(DRBD_MAGIC); + h->command = cpu_to_be16(cmd); + h->length = cpu_to_be16(size); + return sizeof(struct p_header80); +} + +static unsigned int prepare_header95(struct p_header95 *h, enum drbd_packet cmd, int size) +{ + h->magic = cpu_to_be16(DRBD_MAGIC_BIG); + h->command = cpu_to_be16(cmd); + h->length = cpu_to_be32(size); + return sizeof(struct p_header95); +} + +static unsigned int prepare_header100(struct p_header100 *h, enum drbd_packet cmd, + int size, int vnr) +{ + h->magic = cpu_to_be32(DRBD_MAGIC_100); + h->volume = cpu_to_be16(vnr); + h->command = cpu_to_be16(cmd); + h->length = cpu_to_be32(size); + h->pad = 0; + return sizeof(struct p_header100); +} + +static unsigned int prepare_header(struct drbd_connection *connection, int vnr, + void *buffer, enum drbd_packet cmd, int size) +{ + if (connection->agreed_pro_version >= 100) + return prepare_header100(buffer, cmd, size, vnr); + else if (connection->agreed_pro_version >= 95 && + size > DRBD_MAX_SIZE_H80_PACKET) + return prepare_header95(buffer, cmd, size); + else + return prepare_header80(buffer, cmd, size); +} + +static void *__conn_prepare_command(struct drbd_connection *connection, + struct drbd_socket *sock) +{ + if (!sock->socket) + return NULL; + return sock->sbuf + drbd_header_size(connection); +} + +void *conn_prepare_command(struct drbd_connection *connection, struct drbd_socket *sock) +{ + void *p; + + mutex_lock(&sock->mutex); + p = __conn_prepare_command(connection, sock); + if (!p) + mutex_unlock(&sock->mutex); + + return p; +} + +void *drbd_prepare_command(struct drbd_peer_device *peer_device, struct drbd_socket *sock) +{ + return conn_prepare_command(peer_device->connection, sock); +} + +static int __send_command(struct drbd_connection *connection, int vnr, + struct drbd_socket *sock, enum drbd_packet cmd, + unsigned int header_size, void *data, + unsigned int size) +{ + int msg_flags; + int err; + + /* + * Called with @data == NULL and the size of the data blocks in @size + * for commands that send data blocks. For those commands, omit the + * MSG_MORE flag: this will increase the likelihood that data blocks + * which are page aligned on the sender will end up page aligned on the + * receiver. + */ + msg_flags = data ? MSG_MORE : 0; + + header_size += prepare_header(connection, vnr, sock->sbuf, cmd, + header_size + size); + err = drbd_send_all(connection, sock->socket, sock->sbuf, header_size, + msg_flags); + if (data && !err) + err = drbd_send_all(connection, sock->socket, data, size, 0); + /* DRBD protocol "pings" are latency critical. 
+ * This is supposed to trigger tcp_push_pending_frames() */ + if (!err && (cmd == P_PING || cmd == P_PING_ACK)) + drbd_tcp_nodelay(sock->socket); + + return err; +} + +static int __conn_send_command(struct drbd_connection *connection, struct drbd_socket *sock, + enum drbd_packet cmd, unsigned int header_size, + void *data, unsigned int size) +{ + return __send_command(connection, 0, sock, cmd, header_size, data, size); +} + +int conn_send_command(struct drbd_connection *connection, struct drbd_socket *sock, + enum drbd_packet cmd, unsigned int header_size, + void *data, unsigned int size) +{ + int err; + + err = __conn_send_command(connection, sock, cmd, header_size, data, size); + mutex_unlock(&sock->mutex); + return err; +} + +int drbd_send_command(struct drbd_peer_device *peer_device, struct drbd_socket *sock, + enum drbd_packet cmd, unsigned int header_size, + void *data, unsigned int size) +{ + int err; + + err = __send_command(peer_device->connection, peer_device->device->vnr, + sock, cmd, header_size, data, size); + mutex_unlock(&sock->mutex); + return err; +} + +int drbd_send_ping(struct drbd_connection *connection) +{ + struct drbd_socket *sock; + + sock = &connection->meta; + if (!conn_prepare_command(connection, sock)) + return -EIO; + return conn_send_command(connection, sock, P_PING, 0, NULL, 0); +} + +int drbd_send_ping_ack(struct drbd_connection *connection) +{ + struct drbd_socket *sock; + + sock = &connection->meta; + if (!conn_prepare_command(connection, sock)) + return -EIO; + return conn_send_command(connection, sock, P_PING_ACK, 0, NULL, 0); +} + +int drbd_send_sync_param(struct drbd_peer_device *peer_device) +{ + struct drbd_socket *sock; + struct p_rs_param_95 *p; + int size; + const int apv = peer_device->connection->agreed_pro_version; + enum drbd_packet cmd; + struct net_conf *nc; + struct disk_conf *dc; + + sock = &peer_device->connection->data; + p = drbd_prepare_command(peer_device, sock); + if (!p) + return -EIO; + + rcu_read_lock(); + nc = rcu_dereference(peer_device->connection->net_conf); + + size = apv <= 87 ? sizeof(struct p_rs_param) + : apv == 88 ? sizeof(struct p_rs_param) + + strlen(nc->verify_alg) + 1 + : apv <= 94 ? sizeof(struct p_rs_param_89) + : /* apv >= 95 */ sizeof(struct p_rs_param_95); + + cmd = apv >= 89 ? 
P_SYNC_PARAM89 : P_SYNC_PARAM; + + /* initialize verify_alg and csums_alg */ + memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX); + + if (get_ldev(peer_device->device)) { + dc = rcu_dereference(peer_device->device->ldev->disk_conf); + p->resync_rate = cpu_to_be32(dc->resync_rate); + p->c_plan_ahead = cpu_to_be32(dc->c_plan_ahead); + p->c_delay_target = cpu_to_be32(dc->c_delay_target); + p->c_fill_target = cpu_to_be32(dc->c_fill_target); + p->c_max_rate = cpu_to_be32(dc->c_max_rate); + put_ldev(peer_device->device); + } else { + p->resync_rate = cpu_to_be32(DRBD_RESYNC_RATE_DEF); + p->c_plan_ahead = cpu_to_be32(DRBD_C_PLAN_AHEAD_DEF); + p->c_delay_target = cpu_to_be32(DRBD_C_DELAY_TARGET_DEF); + p->c_fill_target = cpu_to_be32(DRBD_C_FILL_TARGET_DEF); + p->c_max_rate = cpu_to_be32(DRBD_C_MAX_RATE_DEF); + } + + if (apv >= 88) + strcpy(p->verify_alg, nc->verify_alg); + if (apv >= 89) + strcpy(p->csums_alg, nc->csums_alg); + rcu_read_unlock(); + + return drbd_send_command(peer_device, sock, cmd, size, NULL, 0); +} + +int __drbd_send_protocol(struct drbd_connection *connection, enum drbd_packet cmd) +{ + struct drbd_socket *sock; + struct p_protocol *p; + struct net_conf *nc; + int size, cf; + + sock = &connection->data; + p = __conn_prepare_command(connection, sock); + if (!p) + return -EIO; + + rcu_read_lock(); + nc = rcu_dereference(connection->net_conf); + + if (nc->tentative && connection->agreed_pro_version < 92) { + rcu_read_unlock(); + mutex_unlock(&sock->mutex); + drbd_err(connection, "--dry-run is not supported by peer"); + return -EOPNOTSUPP; + } + + size = sizeof(*p); + if (connection->agreed_pro_version >= 87) + size += strlen(nc->integrity_alg) + 1; + + p->protocol = cpu_to_be32(nc->wire_protocol); + p->after_sb_0p = cpu_to_be32(nc->after_sb_0p); + p->after_sb_1p = cpu_to_be32(nc->after_sb_1p); + p->after_sb_2p = cpu_to_be32(nc->after_sb_2p); + p->two_primaries = cpu_to_be32(nc->two_primaries); + cf = 0; + if (nc->discard_my_data) + cf |= CF_DISCARD_MY_DATA; + if (nc->tentative) + cf |= CF_DRY_RUN; + p->conn_flags = cpu_to_be32(cf); + + if (connection->agreed_pro_version >= 87) + strcpy(p->integrity_alg, nc->integrity_alg); + rcu_read_unlock(); + + return __conn_send_command(connection, sock, cmd, size, NULL, 0); +} + +int drbd_send_protocol(struct drbd_connection *connection) +{ + int err; + + mutex_lock(&connection->data.mutex); + err = __drbd_send_protocol(connection, P_PROTOCOL); + mutex_unlock(&connection->data.mutex); + + return err; +} + +static int _drbd_send_uuids(struct drbd_peer_device *peer_device, u64 uuid_flags) +{ + struct drbd_device *device = peer_device->device; + struct drbd_socket *sock; + struct p_uuids *p; + int i; + + if (!get_ldev_if_state(device, D_NEGOTIATING)) + return 0; + + sock = &peer_device->connection->data; + p = drbd_prepare_command(peer_device, sock); + if (!p) { + put_ldev(device); + return -EIO; + } + spin_lock_irq(&device->ldev->md.uuid_lock); + for (i = UI_CURRENT; i < UI_SIZE; i++) + p->uuid[i] = cpu_to_be64(device->ldev->md.uuid[i]); + spin_unlock_irq(&device->ldev->md.uuid_lock); + + device->comm_bm_set = drbd_bm_total_weight(device); + p->uuid[UI_SIZE] = cpu_to_be64(device->comm_bm_set); + rcu_read_lock(); + uuid_flags |= rcu_dereference(peer_device->connection->net_conf)->discard_my_data ? 1 : 0; + rcu_read_unlock(); + uuid_flags |= test_bit(CRASHED_PRIMARY, &device->flags) ? 2 : 0; + uuid_flags |= device->new_state_tmp.disk == D_INCONSISTENT ? 
4 : 0; + p->uuid[UI_FLAGS] = cpu_to_be64(uuid_flags); + + put_ldev(device); + return drbd_send_command(peer_device, sock, P_UUIDS, sizeof(*p), NULL, 0); +} + +int drbd_send_uuids(struct drbd_peer_device *peer_device) +{ + return _drbd_send_uuids(peer_device, 0); +} + +int drbd_send_uuids_skip_initial_sync(struct drbd_peer_device *peer_device) +{ + return _drbd_send_uuids(peer_device, 8); +} + +void drbd_print_uuids(struct drbd_device *device, const char *text) +{ + if (get_ldev_if_state(device, D_NEGOTIATING)) { + u64 *uuid = device->ldev->md.uuid; + drbd_info(device, "%s %016llX:%016llX:%016llX:%016llX\n", + text, + (unsigned long long)uuid[UI_CURRENT], + (unsigned long long)uuid[UI_BITMAP], + (unsigned long long)uuid[UI_HISTORY_START], + (unsigned long long)uuid[UI_HISTORY_END]); + put_ldev(device); + } else { + drbd_info(device, "%s effective data uuid: %016llX\n", + text, + (unsigned long long)device->ed_uuid); + } +} + +void drbd_gen_and_send_sync_uuid(struct drbd_peer_device *peer_device) +{ + struct drbd_device *device = peer_device->device; + struct drbd_socket *sock; + struct p_rs_uuid *p; + u64 uuid; + + D_ASSERT(device, device->state.disk == D_UP_TO_DATE); + + uuid = device->ldev->md.uuid[UI_BITMAP]; + if (uuid && uuid != UUID_JUST_CREATED) + uuid = uuid + UUID_NEW_BM_OFFSET; + else + get_random_bytes(&uuid, sizeof(u64)); + drbd_uuid_set(device, UI_BITMAP, uuid); + drbd_print_uuids(device, "updated sync UUID"); + drbd_md_sync(device); + + sock = &peer_device->connection->data; + p = drbd_prepare_command(peer_device, sock); + if (p) { + p->uuid = cpu_to_be64(uuid); + drbd_send_command(peer_device, sock, P_SYNC_UUID, sizeof(*p), NULL, 0); + } +} + +int drbd_send_sizes(struct drbd_peer_device *peer_device, int trigger_reply, enum dds_flags flags) +{ + struct drbd_device *device = peer_device->device; + struct drbd_socket *sock; + struct p_sizes *p; + sector_t d_size, u_size; + int q_order_type; + unsigned int max_bio_size; + + if (get_ldev_if_state(device, D_NEGOTIATING)) { + D_ASSERT(device, device->ldev->backing_bdev); + d_size = drbd_get_max_capacity(device->ldev); + rcu_read_lock(); + u_size = rcu_dereference(device->ldev->disk_conf)->disk_size; + rcu_read_unlock(); + q_order_type = drbd_queue_order_type(device); + max_bio_size = queue_max_hw_sectors(device->ldev->backing_bdev->bd_disk->queue) << 9; + max_bio_size = min(max_bio_size, DRBD_MAX_BIO_SIZE); + put_ldev(device); + } else { + d_size = 0; + u_size = 0; + q_order_type = QUEUE_ORDERED_NONE; + max_bio_size = DRBD_MAX_BIO_SIZE; /* ... multiple BIOs per peer_request */ + } + + sock = &peer_device->connection->data; + p = drbd_prepare_command(peer_device, sock); + if (!p) + return -EIO; + + if (peer_device->connection->agreed_pro_version <= 94) + max_bio_size = min(max_bio_size, DRBD_MAX_SIZE_H80_PACKET); + else if (peer_device->connection->agreed_pro_version < 100) + max_bio_size = min(max_bio_size, DRBD_MAX_BIO_SIZE_P95); + + p->d_size = cpu_to_be64(d_size); + p->u_size = cpu_to_be64(u_size); + p->c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(device->this_bdev)); + p->max_bio_size = cpu_to_be32(max_bio_size); + p->queue_order_type = cpu_to_be16(q_order_type); + p->dds_flags = cpu_to_be16(flags); + return drbd_send_command(peer_device, sock, P_SIZES, sizeof(*p), NULL, 0); +} + +/** + * drbd_send_current_state() - Sends the drbd state to the peer + * @peer_device: DRBD peer device. 
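
The UI_FLAGS slot filled in _drbd_send_uuids() above is a plain bitmask: 1 for discard-my-data, 2 for a crashed primary, 4 for an inconsistent disk, and 8 when drbd_send_uuids_skip_initial_sync() asks to skip the initial sync, all taken directly from the code above. A minimal user-space sketch of composing and testing such a flags word (macro names are illustrative, not part of the driver):

#include <stdint.h>
#include <stdio.h>

/* illustrative names for the literals used by the sender above */
#define UUID_FLAG_DISCARD_MY_DATA	1u
#define UUID_FLAG_CRASHED_PRIMARY	2u
#define UUID_FLAG_INCONSISTENT		4u
#define UUID_FLAG_SKIP_INITIAL_SYNC	8u

int main(void)
{
	uint64_t uuid_flags = 0;

	/* the sender ORs in one bit per condition */
	uuid_flags |= UUID_FLAG_CRASHED_PRIMARY;
	uuid_flags |= UUID_FLAG_INCONSISTENT;

	/* the receiver tests individual bits */
	if (uuid_flags & UUID_FLAG_CRASHED_PRIMARY)
		printf("peer was a crashed primary\n");
	if (!(uuid_flags & UUID_FLAG_SKIP_INITIAL_SYNC))
		printf("initial sync is not being skipped\n");
	return 0;
}
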
+ */ +int drbd_send_current_state(struct drbd_peer_device *peer_device) +{ + struct drbd_socket *sock; + struct p_state *p; + + sock = &peer_device->connection->data; + p = drbd_prepare_command(peer_device, sock); + if (!p) + return -EIO; + p->state = cpu_to_be32(peer_device->device->state.i); /* Within the send mutex */ + return drbd_send_command(peer_device, sock, P_STATE, sizeof(*p), NULL, 0); +} + +/** + * drbd_send_state() - After a state change, sends the new state to the peer + * @peer_device: DRBD peer device. + * @state: the state to send, not necessarily the current state. + * + * Each state change queues an "after_state_ch" work, which will eventually + * send the resulting new state to the peer. If more state changes happen + * between queuing and processing of the after_state_ch work, we still + * want to send each intermediary state in the order it occurred. + */ +int drbd_send_state(struct drbd_peer_device *peer_device, union drbd_state state) +{ + struct drbd_socket *sock; + struct p_state *p; + + sock = &peer_device->connection->data; + p = drbd_prepare_command(peer_device, sock); + if (!p) + return -EIO; + p->state = cpu_to_be32(state.i); /* Within the send mutex */ + return drbd_send_command(peer_device, sock, P_STATE, sizeof(*p), NULL, 0); +} + +int drbd_send_state_req(struct drbd_peer_device *peer_device, union drbd_state mask, union drbd_state val) +{ + struct drbd_socket *sock; + struct p_req_state *p; + + sock = &peer_device->connection->data; + p = drbd_prepare_command(peer_device, sock); + if (!p) + return -EIO; + p->mask = cpu_to_be32(mask.i); + p->val = cpu_to_be32(val.i); + return drbd_send_command(peer_device, sock, P_STATE_CHG_REQ, sizeof(*p), NULL, 0); +} + +int conn_send_state_req(struct drbd_connection *connection, union drbd_state mask, union drbd_state val) +{ + enum drbd_packet cmd; + struct drbd_socket *sock; + struct p_req_state *p; + + cmd = connection->agreed_pro_version < 100 ? P_STATE_CHG_REQ : P_CONN_ST_CHG_REQ; + sock = &connection->data; + p = conn_prepare_command(connection, sock); + if (!p) + return -EIO; + p->mask = cpu_to_be32(mask.i); + p->val = cpu_to_be32(val.i); + return conn_send_command(connection, sock, cmd, sizeof(*p), NULL, 0); +} + +void drbd_send_sr_reply(struct drbd_peer_device *peer_device, enum drbd_state_rv retcode) +{ + struct drbd_socket *sock; + struct p_req_state_reply *p; + + sock = &peer_device->connection->meta; + p = drbd_prepare_command(peer_device, sock); + if (p) { + p->retcode = cpu_to_be32(retcode); + drbd_send_command(peer_device, sock, P_STATE_CHG_REPLY, sizeof(*p), NULL, 0); + } +} + +void conn_send_sr_reply(struct drbd_connection *connection, enum drbd_state_rv retcode) +{ + struct drbd_socket *sock; + struct p_req_state_reply *p; + enum drbd_packet cmd = connection->agreed_pro_version < 100 ? P_STATE_CHG_REPLY : P_CONN_ST_CHG_REPLY; + + sock = &connection->meta; + p = conn_prepare_command(connection, sock); + if (p) { + p->retcode = cpu_to_be32(retcode); + conn_send_command(connection, sock, cmd, sizeof(*p), NULL, 0); + } +} + +static void dcbp_set_code(struct p_compressed_bm *p, enum drbd_bitmap_code code) +{ + BUG_ON(code & ~0xf); + p->encoding = (p->encoding & ~0xf) | code; +} + +static void dcbp_set_start(struct p_compressed_bm *p, int set) +{ + p->encoding = (p->encoding & ~0x80) | (set ? 
0x80 : 0); +} + +static void dcbp_set_pad_bits(struct p_compressed_bm *p, int n) +{ + BUG_ON(n & ~0x7); + p->encoding = (p->encoding & (~0x7 << 4)) | (n << 4); +} + +static int fill_bitmap_rle_bits(struct drbd_device *device, + struct p_compressed_bm *p, + unsigned int size, + struct bm_xfer_ctx *c) +{ + struct bitstream bs; + unsigned long plain_bits; + unsigned long tmp; + unsigned long rl; + unsigned len; + unsigned toggle; + int bits, use_rle; + + /* may we use this feature? */ + rcu_read_lock(); + use_rle = rcu_dereference(first_peer_device(device)->connection->net_conf)->use_rle; + rcu_read_unlock(); + if (!use_rle || first_peer_device(device)->connection->agreed_pro_version < 90) + return 0; + + if (c->bit_offset >= c->bm_bits) + return 0; /* nothing to do. */ + + /* use at most thus many bytes */ + bitstream_init(&bs, p->code, size, 0); + memset(p->code, 0, size); + /* plain bits covered in this code string */ + plain_bits = 0; + + /* p->encoding & 0x80 stores whether the first run length is set. + * bit offset is implicit. + * start with toggle == 2 to be able to tell the first iteration */ + toggle = 2; + + /* see how much plain bits we can stuff into one packet + * using RLE and VLI. */ + do { + tmp = (toggle == 0) ? _drbd_bm_find_next_zero(device, c->bit_offset) + : _drbd_bm_find_next(device, c->bit_offset); + if (tmp == -1UL) + tmp = c->bm_bits; + rl = tmp - c->bit_offset; + + if (toggle == 2) { /* first iteration */ + if (rl == 0) { + /* the first checked bit was set, + * store start value, */ + dcbp_set_start(p, 1); + /* but skip encoding of zero run length */ + toggle = !toggle; + continue; + } + dcbp_set_start(p, 0); + } + + /* paranoia: catch zero runlength. + * can only happen if bitmap is modified while we scan it. */ + if (rl == 0) { + drbd_err(device, "unexpected zero runlength while encoding bitmap " + "t:%u bo:%lu\n", toggle, c->bit_offset); + return -1; + } + + bits = vli_encode_bits(&bs, rl); + if (bits == -ENOBUFS) /* buffer full */ + break; + if (bits <= 0) { + drbd_err(device, "error while encoding bitmap: %d\n", bits); + return 0; + } + + toggle = !toggle; + plain_bits += rl; + c->bit_offset = tmp; + } while (c->bit_offset < c->bm_bits); + + len = bs.cur.b - p->code + !!bs.cur.bit; + + if (plain_bits < (len << 3)) { + /* incompressible with this method. + * we need to rewind both word and bit position. */ + c->bit_offset -= plain_bits; + bm_xfer_ctx_bit_to_word_offset(c); + c->bit_offset = c->word_offset * BITS_PER_LONG; + return 0; + } + + /* RLE + VLI was able to compress it just fine. + * update c->word_offset. */ + bm_xfer_ctx_bit_to_word_offset(c); + + /* store pad_bits */ + dcbp_set_pad_bits(p, (8 - bs.cur.bit) & 0x7); + + return len; +} + +/** + * send_bitmap_rle_or_plain + * + * Return 0 when done, 1 when another iteration is needed, and a negative error + * code upon failure. 
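
fill_bitmap_rle_bits() above compresses the dirty bitmap by emitting alternating run lengths of clear and set bits; the polarity of the first run is recorded in a header bit (dcbp_set_start) and the run lengths are then VLI-encoded. A stand-alone sketch of just the run-length part, on a 64-bit toy bitmap with the VLI coding omitted (names hypothetical):

#include <stdint.h>
#include <stdio.h>

/* Emit alternating run lengths for a 64-bit toy bitmap.  As in the
 * code above, the first run counts clear bits; a zero-length first
 * run means "the bitmap starts with set bits" (the real code records
 * that in a header bit instead of encoding the zero). */
static void rle_runs(uint64_t bm)
{
	unsigned int bit = 0;
	unsigned int want = 0;		/* 0: counting clear bits, 1: set bits */

	while (bit < 64) {
		unsigned int run = 0;

		while (bit < 64 && ((bm >> bit) & 1u) == want) {
			run++;
			bit++;
		}
		printf("%u ", run);	/* the driver VLI-encodes this value */
		want ^= 1u;
	}
	printf("\n");
}

int main(void)
{
	rle_runs(0x00000000000000F0ULL);	/* prints: 4 4 56 */
	return 0;
}
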
+ */ +static int +send_bitmap_rle_or_plain(struct drbd_device *device, struct bm_xfer_ctx *c) +{ + struct drbd_socket *sock = &first_peer_device(device)->connection->data; + unsigned int header_size = drbd_header_size(first_peer_device(device)->connection); + struct p_compressed_bm *p = sock->sbuf + header_size; + int len, err; + + len = fill_bitmap_rle_bits(device, p, + DRBD_SOCKET_BUFFER_SIZE - header_size - sizeof(*p), c); + if (len < 0) + return -EIO; + + if (len) { + dcbp_set_code(p, RLE_VLI_Bits); + err = __send_command(first_peer_device(device)->connection, device->vnr, sock, + P_COMPRESSED_BITMAP, sizeof(*p) + len, + NULL, 0); + c->packets[0]++; + c->bytes[0] += header_size + sizeof(*p) + len; + + if (c->bit_offset >= c->bm_bits) + len = 0; /* DONE */ + } else { + /* was not compressible. + * send a buffer full of plain text bits instead. */ + unsigned int data_size; + unsigned long num_words; + unsigned long *p = sock->sbuf + header_size; + + data_size = DRBD_SOCKET_BUFFER_SIZE - header_size; + num_words = min_t(size_t, data_size / sizeof(*p), + c->bm_words - c->word_offset); + len = num_words * sizeof(*p); + if (len) + drbd_bm_get_lel(device, c->word_offset, num_words, p); + err = __send_command(first_peer_device(device)->connection, device->vnr, sock, P_BITMAP, len, NULL, 0); + c->word_offset += num_words; + c->bit_offset = c->word_offset * BITS_PER_LONG; + + c->packets[1]++; + c->bytes[1] += header_size + len; + + if (c->bit_offset > c->bm_bits) + c->bit_offset = c->bm_bits; + } + if (!err) { + if (len == 0) { + INFO_bm_xfer_stats(device, "send", c); + return 0; + } else + return 1; + } + return -EIO; +} + +/* See the comment at receive_bitmap() */ +static int _drbd_send_bitmap(struct drbd_device *device) +{ + struct bm_xfer_ctx c; + int err; + + if (!expect(device->bitmap)) + return false; + + if (get_ldev(device)) { + if (drbd_md_test_flag(device->ldev, MDF_FULL_SYNC)) { + drbd_info(device, "Writing the whole bitmap, MDF_FullSync was set.\n"); + drbd_bm_set_all(device); + if (drbd_bm_write(device)) { + /* write_bm did fail! Leave full sync flag set in Meta P_DATA + * but otherwise process as per normal - need to tell other + * side that a full resync is required! */ + drbd_err(device, "Failed to write bitmap to disk!\n"); + } else { + drbd_md_clear_flag(device, MDF_FULL_SYNC); + drbd_md_sync(device); + } + } + put_ldev(device); + } + + c = (struct bm_xfer_ctx) { + .bm_bits = drbd_bm_bits(device), + .bm_words = drbd_bm_words(device), + }; + + do { + err = send_bitmap_rle_or_plain(device, &c); + } while (err > 0); + + return err == 0; +} + +int drbd_send_bitmap(struct drbd_device *device) +{ + struct drbd_socket *sock = &first_peer_device(device)->connection->data; + int err = -1; + + mutex_lock(&sock->mutex); + if (sock->socket) + err = !_drbd_send_bitmap(device); + mutex_unlock(&sock->mutex); + return err; +} + +void drbd_send_b_ack(struct drbd_connection *connection, u32 barrier_nr, u32 set_size) +{ + struct drbd_socket *sock; + struct p_barrier_ack *p; + + if (connection->cstate < C_WF_REPORT_PARAMS) + return; + + sock = &connection->meta; + p = conn_prepare_command(connection, sock); + if (!p) + return; + p->barrier = barrier_nr; + p->set_size = cpu_to_be32(set_size); + conn_send_command(connection, sock, P_BARRIER_ACK, sizeof(*p), NULL, 0); +} + +/** + * _drbd_send_ack() - Sends an ack packet + * @device: DRBD device. + * @cmd: Packet command code. 
+ * @sector: sector, needs to be in big endian byte order + * @blksize: size in byte, needs to be in big endian byte order + * @block_id: Id, big endian byte order + */ +static int _drbd_send_ack(struct drbd_peer_device *peer_device, enum drbd_packet cmd, + u64 sector, u32 blksize, u64 block_id) +{ + struct drbd_socket *sock; + struct p_block_ack *p; + + if (peer_device->device->state.conn < C_CONNECTED) + return -EIO; + + sock = &peer_device->connection->meta; + p = drbd_prepare_command(peer_device, sock); + if (!p) + return -EIO; + p->sector = sector; + p->block_id = block_id; + p->blksize = blksize; + p->seq_num = cpu_to_be32(atomic_inc_return(&peer_device->device->packet_seq)); + return drbd_send_command(peer_device, sock, cmd, sizeof(*p), NULL, 0); +} + +/* dp->sector and dp->block_id already/still in network byte order, + * data_size is payload size according to dp->head, + * and may need to be corrected for digest size. */ +void drbd_send_ack_dp(struct drbd_peer_device *peer_device, enum drbd_packet cmd, + struct p_data *dp, int data_size) +{ + if (peer_device->connection->peer_integrity_tfm) + data_size -= crypto_hash_digestsize(peer_device->connection->peer_integrity_tfm); + _drbd_send_ack(peer_device, cmd, dp->sector, cpu_to_be32(data_size), + dp->block_id); +} + +void drbd_send_ack_rp(struct drbd_peer_device *peer_device, enum drbd_packet cmd, + struct p_block_req *rp) +{ + _drbd_send_ack(peer_device, cmd, rp->sector, rp->blksize, rp->block_id); +} + +/** + * drbd_send_ack() - Sends an ack packet + * @device: DRBD device + * @cmd: packet command code + * @peer_req: peer request + */ +int drbd_send_ack(struct drbd_peer_device *peer_device, enum drbd_packet cmd, + struct drbd_peer_request *peer_req) +{ + return _drbd_send_ack(peer_device, cmd, + cpu_to_be64(peer_req->i.sector), + cpu_to_be32(peer_req->i.size), + peer_req->block_id); +} + +/* This function misuses the block_id field to signal if the blocks + * are is sync or not. */ +int drbd_send_ack_ex(struct drbd_peer_device *peer_device, enum drbd_packet cmd, + sector_t sector, int blksize, u64 block_id) +{ + return _drbd_send_ack(peer_device, cmd, + cpu_to_be64(sector), + cpu_to_be32(blksize), + cpu_to_be64(block_id)); +} + +int drbd_send_drequest(struct drbd_peer_device *peer_device, int cmd, + sector_t sector, int size, u64 block_id) +{ + struct drbd_socket *sock; + struct p_block_req *p; + + sock = &peer_device->connection->data; + p = drbd_prepare_command(peer_device, sock); + if (!p) + return -EIO; + p->sector = cpu_to_be64(sector); + p->block_id = block_id; + p->blksize = cpu_to_be32(size); + return drbd_send_command(peer_device, sock, cmd, sizeof(*p), NULL, 0); +} + +int drbd_send_drequest_csum(struct drbd_peer_device *peer_device, sector_t sector, int size, + void *digest, int digest_size, enum drbd_packet cmd) +{ + struct drbd_socket *sock; + struct p_block_req *p; + + /* FIXME: Put the digest into the preallocated socket buffer. 
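
The packet structs above are filled with cpu_to_be32()/cpu_to_be64() because every field on the wire is big endian; _drbd_send_ack() even expects its arguments pre-converted, as its kernel-doc states. A user-space sketch of the same convention using the glibc <endian.h> helpers (the struct below is illustrative, not the driver's p_block_ack):

#include <endian.h>	/* htobe32()/htobe64(), glibc/musl */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct toy_block_ack {		/* illustrative wire layout */
	uint64_t sector;	/* big endian on the wire */
	uint64_t block_id;	/* opaque cookie, echoed back verbatim */
	uint32_t blksize;	/* big endian on the wire */
	uint32_t seq_num;	/* big endian on the wire */
} __attribute__((packed));

int main(void)
{
	struct toy_block_ack p;

	memset(&p, 0, sizeof(p));
	p.sector  = htobe64(123456ULL);	/* convert once, at the wire boundary */
	p.blksize = htobe32(4096);
	p.seq_num = htobe32(1);
	/* block_id is sent back unchanged, so no conversion is needed */

	printf("first byte of sector on the wire: 0x%02x\n",
	       ((unsigned char *)&p.sector)[0]);
	return 0;
}
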
*/ + + sock = &peer_device->connection->data; + p = drbd_prepare_command(peer_device, sock); + if (!p) + return -EIO; + p->sector = cpu_to_be64(sector); + p->block_id = ID_SYNCER /* unused */; + p->blksize = cpu_to_be32(size); + return drbd_send_command(peer_device, sock, cmd, sizeof(*p), digest, digest_size); +} + +int drbd_send_ov_request(struct drbd_peer_device *peer_device, sector_t sector, int size) +{ + struct drbd_socket *sock; + struct p_block_req *p; + + sock = &peer_device->connection->data; + p = drbd_prepare_command(peer_device, sock); + if (!p) + return -EIO; + p->sector = cpu_to_be64(sector); + p->block_id = ID_SYNCER /* unused */; + p->blksize = cpu_to_be32(size); + return drbd_send_command(peer_device, sock, P_OV_REQUEST, sizeof(*p), NULL, 0); +} + +/* called on sndtimeo + * returns false if we should retry, + * true if we think connection is dead + */ +static int we_should_drop_the_connection(struct drbd_connection *connection, struct socket *sock) +{ + int drop_it; + /* long elapsed = (long)(jiffies - device->last_received); */ + + drop_it = connection->meta.socket == sock + || !connection->asender.task + || get_t_state(&connection->asender) != RUNNING + || connection->cstate < C_WF_REPORT_PARAMS; + + if (drop_it) + return true; + + drop_it = !--connection->ko_count; + if (!drop_it) { + drbd_err(connection, "[%s/%d] sock_sendmsg time expired, ko = %u\n", + current->comm, current->pid, connection->ko_count); + request_ping(connection); + } + + return drop_it; /* && (device->state == R_PRIMARY) */; +} + +static void drbd_update_congested(struct drbd_connection *connection) +{ + struct sock *sk = connection->data.socket->sk; + if (sk->sk_wmem_queued > sk->sk_sndbuf * 4 / 5) + set_bit(NET_CONGESTED, &connection->flags); +} + +/* The idea of sendpage seems to be to put some kind of reference + * to the page into the skb, and to hand it over to the NIC. In + * this process get_page() gets called. + * + * As soon as the page was really sent over the network put_page() + * gets called by some part of the network layer. [ NIC driver? ] + * + * [ get_page() / put_page() increment/decrement the count. If count + * reaches 0 the page will be freed. ] + * + * This works nicely with pages from FSs. + * But this means that in protocol A we might signal IO completion too early! + * + * In order not to corrupt data during a resync we must make sure + * that we do not reuse our own buffer pages (EEs) to early, therefore + * we have the net_ee list. + * + * XFS seems to have problems, still, it submits pages with page_count == 0! + * As a workaround, we disable sendpage on pages + * with page_count == 0 or PageSlab. + */ +static int _drbd_no_send_page(struct drbd_peer_device *peer_device, struct page *page, + int offset, size_t size, unsigned msg_flags) +{ + struct socket *socket; + void *addr; + int err; + + socket = peer_device->connection->data.socket; + addr = kmap(page) + offset; + err = drbd_send_all(peer_device->connection, socket, addr, size, msg_flags); + kunmap(page); + if (!err) + peer_device->device->send_cnt += size >> 9; + return err; +} + +static int _drbd_send_page(struct drbd_peer_device *peer_device, struct page *page, + int offset, size_t size, unsigned msg_flags) +{ + struct socket *socket = peer_device->connection->data.socket; + mm_segment_t oldfs = get_fs(); + int len = size; + int err = -EIO; + + /* e.g. XFS meta- & log-data is in slab pages, which have a + * page_count of 0 and/or have PageSlab() set. 
+ * we cannot use send_page for those, as that does get_page(); + * put_page(); and would cause either a VM_BUG directly, or + * __page_cache_release a page that would actually still be referenced + * by someone, leading to some obscure delayed Oops somewhere else. */ + if (disable_sendpage || (page_count(page) < 1) || PageSlab(page)) + return _drbd_no_send_page(peer_device, page, offset, size, msg_flags); + + msg_flags |= MSG_NOSIGNAL; + drbd_update_congested(peer_device->connection); + set_fs(KERNEL_DS); + do { + int sent; + + sent = socket->ops->sendpage(socket, page, offset, len, msg_flags); + if (sent <= 0) { + if (sent == -EAGAIN) { + if (we_should_drop_the_connection(peer_device->connection, socket)) + break; + continue; + } + drbd_warn(peer_device->device, "%s: size=%d len=%d sent=%d\n", + __func__, (int)size, len, sent); + if (sent < 0) + err = sent; + break; + } + len -= sent; + offset += sent; + } while (len > 0 /* THINK && device->cstate >= C_CONNECTED*/); + set_fs(oldfs); + clear_bit(NET_CONGESTED, &peer_device->connection->flags); + + if (len == 0) { + err = 0; + peer_device->device->send_cnt += size >> 9; + } + return err; +} + +static int _drbd_send_bio(struct drbd_peer_device *peer_device, struct bio *bio) +{ + struct bio_vec bvec; + struct bvec_iter iter; + + /* hint all but last page with MSG_MORE */ + bio_for_each_segment(bvec, bio, iter) { + int err; + + err = _drbd_no_send_page(peer_device, bvec.bv_page, + bvec.bv_offset, bvec.bv_len, + bio_iter_last(bvec, iter) + ? 0 : MSG_MORE); + if (err) + return err; + } + return 0; +} + +static int _drbd_send_zc_bio(struct drbd_peer_device *peer_device, struct bio *bio) +{ + struct bio_vec bvec; + struct bvec_iter iter; + + /* hint all but last page with MSG_MORE */ + bio_for_each_segment(bvec, bio, iter) { + int err; + + err = _drbd_send_page(peer_device, bvec.bv_page, + bvec.bv_offset, bvec.bv_len, + bio_iter_last(bvec, iter) ? 0 : MSG_MORE); + if (err) + return err; + } + return 0; +} + +static int _drbd_send_zc_ee(struct drbd_peer_device *peer_device, + struct drbd_peer_request *peer_req) +{ + struct page *page = peer_req->pages; + unsigned len = peer_req->i.size; + int err; + + /* hint all but last page with MSG_MORE */ + page_chain_for_each(page) { + unsigned l = min_t(unsigned, len, PAGE_SIZE); + + err = _drbd_send_page(peer_device, page, 0, l, + page_chain_next(page) ? MSG_MORE : 0); + if (err) + return err; + len -= l; + } + return 0; +} + +static u32 bio_flags_to_wire(struct drbd_connection *connection, unsigned long bi_rw) +{ + if (connection->agreed_pro_version >= 95) + return (bi_rw & REQ_SYNC ? DP_RW_SYNC : 0) | + (bi_rw & REQ_FUA ? DP_FUA : 0) | + (bi_rw & REQ_FLUSH ? DP_FLUSH : 0) | + (bi_rw & REQ_DISCARD ? DP_DISCARD : 0); + else + return bi_rw & REQ_SYNC ? DP_RW_SYNC : 0; +} + +/* Used to send write or TRIM aka REQ_DISCARD requests + * R_PRIMARY -> Peer (P_DATA, P_TRIM) + */ +int drbd_send_dblock(struct drbd_peer_device *peer_device, struct drbd_request *req) +{ + struct drbd_device *device = peer_device->device; + struct drbd_socket *sock; + struct p_data *p; + unsigned int dp_flags = 0; + int digest_size; + int err; + + sock = &peer_device->connection->data; + p = drbd_prepare_command(peer_device, sock); + digest_size = peer_device->connection->integrity_tfm ? 
+ crypto_hash_digestsize(peer_device->connection->integrity_tfm) : 0; + + if (!p) + return -EIO; + p->sector = cpu_to_be64(req->i.sector); + p->block_id = (unsigned long)req; + p->seq_num = cpu_to_be32(atomic_inc_return(&device->packet_seq)); + dp_flags = bio_flags_to_wire(peer_device->connection, req->master_bio->bi_rw); + if (device->state.conn >= C_SYNC_SOURCE && + device->state.conn <= C_PAUSED_SYNC_T) + dp_flags |= DP_MAY_SET_IN_SYNC; + if (peer_device->connection->agreed_pro_version >= 100) { + if (req->rq_state & RQ_EXP_RECEIVE_ACK) + dp_flags |= DP_SEND_RECEIVE_ACK; + /* During resync, request an explicit write ack, + * even in protocol != C */ + if (req->rq_state & RQ_EXP_WRITE_ACK + || (dp_flags & DP_MAY_SET_IN_SYNC)) + dp_flags |= DP_SEND_WRITE_ACK; + } + p->dp_flags = cpu_to_be32(dp_flags); + + if (dp_flags & DP_DISCARD) { + struct p_trim *t = (struct p_trim*)p; + t->size = cpu_to_be32(req->i.size); + err = __send_command(peer_device->connection, device->vnr, sock, P_TRIM, sizeof(*t), NULL, 0); + goto out; + } + + /* our digest is still only over the payload. + * TRIM does not carry any payload. */ + if (digest_size) + drbd_csum_bio(peer_device->connection->integrity_tfm, req->master_bio, p + 1); + err = __send_command(peer_device->connection, device->vnr, sock, P_DATA, sizeof(*p) + digest_size, NULL, req->i.size); + if (!err) { + /* For protocol A, we have to memcpy the payload into + * socket buffers, as we may complete right away + * as soon as we handed it over to tcp, at which point the data + * pages may become invalid. + * + * For data-integrity enabled, we copy it as well, so we can be + * sure that even if the bio pages may still be modified, it + * won't change the data on the wire, thus if the digest checks + * out ok after sending on this side, but does not fit on the + * receiving side, we sure have detected corruption elsewhere. + */ + if (!(req->rq_state & (RQ_EXP_RECEIVE_ACK | RQ_EXP_WRITE_ACK)) || digest_size) + err = _drbd_send_bio(peer_device, req->master_bio); + else + err = _drbd_send_zc_bio(peer_device, req->master_bio); + + /* double check digest, sometimes buffers have been modified in flight. */ + if (digest_size > 0 && digest_size <= 64) { + /* 64 byte, 512 bit, is the largest digest size + * currently supported in kernel crypto. */ + unsigned char digest[64]; + drbd_csum_bio(peer_device->connection->integrity_tfm, req->master_bio, digest); + if (memcmp(p + 1, digest, digest_size)) { + drbd_warn(device, + "Digest mismatch, buffer modified by upper layers during write: %llus +%u\n", + (unsigned long long)req->i.sector, req->i.size); + } + } /* else if (digest_size > 64) { + ... Be noisy about digest too large ... + } */ + } +out: + mutex_unlock(&sock->mutex); /* locked by drbd_prepare_command() */ + + return err; +} + +/* answer packet, used to send data back for read requests: + * Peer -> (diskless) R_PRIMARY (P_DATA_REPLY) + * C_SYNC_SOURCE -> C_SYNC_TARGET (P_RS_DATA_REPLY) + */ +int drbd_send_block(struct drbd_peer_device *peer_device, enum drbd_packet cmd, + struct drbd_peer_request *peer_req) +{ + struct drbd_device *device = peer_device->device; + struct drbd_socket *sock; + struct p_data *p; + int err; + int digest_size; + + sock = &peer_device->connection->data; + p = drbd_prepare_command(peer_device, sock); + + digest_size = peer_device->connection->integrity_tfm ? 
+ crypto_hash_digestsize(peer_device->connection->integrity_tfm) : 0; + + if (!p) + return -EIO; + p->sector = cpu_to_be64(peer_req->i.sector); + p->block_id = peer_req->block_id; + p->seq_num = 0; /* unused */ + p->dp_flags = 0; + if (digest_size) + drbd_csum_ee(peer_device->connection->integrity_tfm, peer_req, p + 1); + err = __send_command(peer_device->connection, device->vnr, sock, cmd, sizeof(*p) + digest_size, NULL, peer_req->i.size); + if (!err) + err = _drbd_send_zc_ee(peer_device, peer_req); + mutex_unlock(&sock->mutex); /* locked by drbd_prepare_command() */ + + return err; +} + +int drbd_send_out_of_sync(struct drbd_peer_device *peer_device, struct drbd_request *req) +{ + struct drbd_socket *sock; + struct p_block_desc *p; + + sock = &peer_device->connection->data; + p = drbd_prepare_command(peer_device, sock); + if (!p) + return -EIO; + p->sector = cpu_to_be64(req->i.sector); + p->blksize = cpu_to_be32(req->i.size); + return drbd_send_command(peer_device, sock, P_OUT_OF_SYNC, sizeof(*p), NULL, 0); +} + +/* + drbd_send distinguishes two cases: + + Packets sent via the data socket "sock" + and packets sent via the meta data socket "msock" + + sock msock + -----------------+-------------------------+------------------------------ + timeout conf.timeout / 2 conf.timeout / 2 + timeout action send a ping via msock Abort communication + and close all sockets +*/ + +/* + * you must have down()ed the appropriate [m]sock_mutex elsewhere! + */ +int drbd_send(struct drbd_connection *connection, struct socket *sock, + void *buf, size_t size, unsigned msg_flags) +{ + struct kvec iov; + struct msghdr msg; + int rv, sent = 0; + + if (!sock) + return -EBADR; + + /* THINK if (signal_pending) return ... ? */ + + iov.iov_base = buf; + iov.iov_len = size; + + msg.msg_name = NULL; + msg.msg_namelen = 0; + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_flags = msg_flags | MSG_NOSIGNAL; + + if (sock == connection->data.socket) { + rcu_read_lock(); + connection->ko_count = rcu_dereference(connection->net_conf)->ko_count; + rcu_read_unlock(); + drbd_update_congested(connection); + } + do { + /* STRANGE + * tcp_sendmsg does _not_ use its size parameter at all ? + * + * -EAGAIN on timeout, -EINTR on signal. + */ +/* THINK + * do we need to block DRBD_SIG if sock == &meta.socket ?? + * otherwise wake_asender() might interrupt some send_*Ack ! + */ + rv = kernel_sendmsg(sock, &msg, &iov, 1, size); + if (rv == -EAGAIN) { + if (we_should_drop_the_connection(connection, sock)) + break; + else + continue; + } + if (rv == -EINTR) { + flush_signals(current); + rv = 0; + } + if (rv < 0) + break; + sent += rv; + iov.iov_base += rv; + iov.iov_len -= rv; + } while (sent < size); + + if (sock == connection->data.socket) + clear_bit(NET_CONGESTED, &connection->flags); + + if (rv <= 0) { + if (rv != -EAGAIN) { + drbd_err(connection, "%s_sendmsg returned %d\n", + sock == connection->meta.socket ? "msock" : "sock", + rv); + conn_request_state(connection, NS(conn, C_BROKEN_PIPE), CS_HARD); + } else + conn_request_state(connection, NS(conn, C_TIMEOUT), CS_HARD); + } + + return sent; +} + +/** + * drbd_send_all - Send an entire buffer + * + * Returns 0 upon success and a negative error value otherwise. 
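
drbd_send() below loops until the whole buffer is out, restarting after -EINTR and deciding on -EAGAIN whether the connection should be dropped, and drbd_send_all() then turns any short result into an error. The same "send it all or fail" pattern over a plain user-space socket, as a hedged sketch without the ko-count/ping logic:

#include <errno.h>
#include <stddef.h>
#include <sys/types.h>
#include <sys/socket.h>

/* Keep calling send(2) until every byte has been written.
 * Returns 0 on success, -1 with errno set on a hard error. */
static int send_all(int fd, const void *buf, size_t len)
{
	const char *p = buf;

	while (len > 0) {
		ssize_t n = send(fd, p, len, MSG_NOSIGNAL);

		if (n < 0) {
			if (errno == EINTR)
				continue;	/* interrupted: just retry */
			return -1;		/* EAGAIN etc.: let the caller decide */
		}
		p += n;
		len -= n;
	}
	return 0;
}
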
+ */ +int drbd_send_all(struct drbd_connection *connection, struct socket *sock, void *buffer, + size_t size, unsigned msg_flags) +{ + int err; + + err = drbd_send(connection, sock, buffer, size, msg_flags); + if (err < 0) + return err; + if (err != size) + return -EIO; + return 0; +} + +static int drbd_open(struct block_device *bdev, fmode_t mode) +{ + struct drbd_device *device = bdev->bd_disk->private_data; + unsigned long flags; + int rv = 0; + + mutex_lock(&drbd_main_mutex); + spin_lock_irqsave(&device->resource->req_lock, flags); + /* to have a stable device->state.role + * and no race with updating open_cnt */ + + if (device->state.role != R_PRIMARY) { + if (mode & FMODE_WRITE) + rv = -EROFS; + else if (!allow_oos) + rv = -EMEDIUMTYPE; + } + + if (!rv) + device->open_cnt++; + spin_unlock_irqrestore(&device->resource->req_lock, flags); + mutex_unlock(&drbd_main_mutex); + + return rv; +} + +static void drbd_release(struct gendisk *gd, fmode_t mode) +{ + struct drbd_device *device = gd->private_data; + mutex_lock(&drbd_main_mutex); + device->open_cnt--; + mutex_unlock(&drbd_main_mutex); +} + +static void drbd_set_defaults(struct drbd_device *device) +{ + /* Beware! The actual layout differs + * between big endian and little endian */ + device->state = (union drbd_dev_state) { + { .role = R_SECONDARY, + .peer = R_UNKNOWN, + .conn = C_STANDALONE, + .disk = D_DISKLESS, + .pdsk = D_UNKNOWN, + } }; +} + +void drbd_init_set_defaults(struct drbd_device *device) +{ + /* the memset(,0,) did most of this. + * note: only assignments, no allocation in here */ + + drbd_set_defaults(device); + + atomic_set(&device->ap_bio_cnt, 0); + atomic_set(&device->ap_actlog_cnt, 0); + atomic_set(&device->ap_pending_cnt, 0); + atomic_set(&device->rs_pending_cnt, 0); + atomic_set(&device->unacked_cnt, 0); + atomic_set(&device->local_cnt, 0); + atomic_set(&device->pp_in_use_by_net, 0); + atomic_set(&device->rs_sect_in, 0); + atomic_set(&device->rs_sect_ev, 0); + atomic_set(&device->ap_in_flight, 0); + atomic_set(&device->md_io.in_use, 0); + + mutex_init(&device->own_state_mutex); + device->state_mutex = &device->own_state_mutex; + + spin_lock_init(&device->al_lock); + spin_lock_init(&device->peer_seq_lock); + + INIT_LIST_HEAD(&device->active_ee); + INIT_LIST_HEAD(&device->sync_ee); + INIT_LIST_HEAD(&device->done_ee); + INIT_LIST_HEAD(&device->read_ee); + INIT_LIST_HEAD(&device->net_ee); + INIT_LIST_HEAD(&device->resync_reads); + INIT_LIST_HEAD(&device->resync_work.list); + INIT_LIST_HEAD(&device->unplug_work.list); + INIT_LIST_HEAD(&device->bm_io_work.w.list); + INIT_LIST_HEAD(&device->pending_master_completion[0]); + INIT_LIST_HEAD(&device->pending_master_completion[1]); + INIT_LIST_HEAD(&device->pending_completion[0]); + INIT_LIST_HEAD(&device->pending_completion[1]); + + device->resync_work.cb = w_resync_timer; + device->unplug_work.cb = w_send_write_hint; + device->bm_io_work.w.cb = w_bitmap_io; + + init_timer(&device->resync_timer); + init_timer(&device->md_sync_timer); + init_timer(&device->start_resync_timer); + init_timer(&device->request_timer); + device->resync_timer.function = resync_timer_fn; + device->resync_timer.data = (unsigned long) device; + device->md_sync_timer.function = md_sync_timer_fn; + device->md_sync_timer.data = (unsigned long) device; + device->start_resync_timer.function = start_resync_timer_fn; + device->start_resync_timer.data = (unsigned long) device; + device->request_timer.function = request_timer_fn; + device->request_timer.data = (unsigned long) device; + + 
init_waitqueue_head(&device->misc_wait); + init_waitqueue_head(&device->state_wait); + init_waitqueue_head(&device->ee_wait); + init_waitqueue_head(&device->al_wait); + init_waitqueue_head(&device->seq_wait); + + device->resync_wenr = LC_FREE; + device->peer_max_bio_size = DRBD_MAX_BIO_SIZE_SAFE; + device->local_max_bio_size = DRBD_MAX_BIO_SIZE_SAFE; +} + +void drbd_device_cleanup(struct drbd_device *device) +{ + int i; + if (first_peer_device(device)->connection->receiver.t_state != NONE) + drbd_err(device, "ASSERT FAILED: receiver t_state == %d expected 0.\n", + first_peer_device(device)->connection->receiver.t_state); + + device->al_writ_cnt = + device->bm_writ_cnt = + device->read_cnt = + device->recv_cnt = + device->send_cnt = + device->writ_cnt = + device->p_size = + device->rs_start = + device->rs_total = + device->rs_failed = 0; + device->rs_last_events = 0; + device->rs_last_sect_ev = 0; + for (i = 0; i < DRBD_SYNC_MARKS; i++) { + device->rs_mark_left[i] = 0; + device->rs_mark_time[i] = 0; + } + D_ASSERT(device, first_peer_device(device)->connection->net_conf == NULL); + + drbd_set_my_capacity(device, 0); + if (device->bitmap) { + /* maybe never allocated. */ + drbd_bm_resize(device, 0, 1); + drbd_bm_cleanup(device); + } + + drbd_free_ldev(device->ldev); + device->ldev = NULL; + + clear_bit(AL_SUSPENDED, &device->flags); + + D_ASSERT(device, list_empty(&device->active_ee)); + D_ASSERT(device, list_empty(&device->sync_ee)); + D_ASSERT(device, list_empty(&device->done_ee)); + D_ASSERT(device, list_empty(&device->read_ee)); + D_ASSERT(device, list_empty(&device->net_ee)); + D_ASSERT(device, list_empty(&device->resync_reads)); + D_ASSERT(device, list_empty(&first_peer_device(device)->connection->sender_work.q)); + D_ASSERT(device, list_empty(&device->resync_work.list)); + D_ASSERT(device, list_empty(&device->unplug_work.list)); + + drbd_set_defaults(device); +} + + +static void drbd_destroy_mempools(void) +{ + struct page *page; + + while (drbd_pp_pool) { + page = drbd_pp_pool; + drbd_pp_pool = (struct page *)page_private(page); + __free_page(page); + drbd_pp_vacant--; + } + + /* D_ASSERT(device, atomic_read(&drbd_pp_vacant)==0); */ + + if (drbd_md_io_bio_set) + bioset_free(drbd_md_io_bio_set); + if (drbd_md_io_page_pool) + mempool_destroy(drbd_md_io_page_pool); + if (drbd_ee_mempool) + mempool_destroy(drbd_ee_mempool); + if (drbd_request_mempool) + mempool_destroy(drbd_request_mempool); + if (drbd_ee_cache) + kmem_cache_destroy(drbd_ee_cache); + if (drbd_request_cache) + kmem_cache_destroy(drbd_request_cache); + if (drbd_bm_ext_cache) + kmem_cache_destroy(drbd_bm_ext_cache); + if (drbd_al_ext_cache) + kmem_cache_destroy(drbd_al_ext_cache); + + drbd_md_io_bio_set = NULL; + drbd_md_io_page_pool = NULL; + drbd_ee_mempool = NULL; + drbd_request_mempool = NULL; + drbd_ee_cache = NULL; + drbd_request_cache = NULL; + drbd_bm_ext_cache = NULL; + drbd_al_ext_cache = NULL; + + return; +} + +static int drbd_create_mempools(void) +{ + struct page *page; + const int number = (DRBD_MAX_BIO_SIZE/PAGE_SIZE) * minor_count; + int i; + + /* prepare our caches and mempools */ + drbd_request_mempool = NULL; + drbd_ee_cache = NULL; + drbd_request_cache = NULL; + drbd_bm_ext_cache = NULL; + drbd_al_ext_cache = NULL; + drbd_pp_pool = NULL; + drbd_md_io_page_pool = NULL; + drbd_md_io_bio_set = NULL; + + /* caches */ + drbd_request_cache = kmem_cache_create( + "drbd_req", sizeof(struct drbd_request), 0, 0, NULL); + if (drbd_request_cache == NULL) + goto Enomem; + + drbd_ee_cache = kmem_cache_create( + 
"drbd_ee", sizeof(struct drbd_peer_request), 0, 0, NULL); + if (drbd_ee_cache == NULL) + goto Enomem; + + drbd_bm_ext_cache = kmem_cache_create( + "drbd_bm", sizeof(struct bm_extent), 0, 0, NULL); + if (drbd_bm_ext_cache == NULL) + goto Enomem; + + drbd_al_ext_cache = kmem_cache_create( + "drbd_al", sizeof(struct lc_element), 0, 0, NULL); + if (drbd_al_ext_cache == NULL) + goto Enomem; + + /* mempools */ + drbd_md_io_bio_set = bioset_create(DRBD_MIN_POOL_PAGES, 0); + if (drbd_md_io_bio_set == NULL) + goto Enomem; + + drbd_md_io_page_pool = mempool_create_page_pool(DRBD_MIN_POOL_PAGES, 0); + if (drbd_md_io_page_pool == NULL) + goto Enomem; + + drbd_request_mempool = mempool_create_slab_pool(number, + drbd_request_cache); + if (drbd_request_mempool == NULL) + goto Enomem; + + drbd_ee_mempool = mempool_create_slab_pool(number, drbd_ee_cache); + if (drbd_ee_mempool == NULL) + goto Enomem; + + /* drbd's page pool */ + spin_lock_init(&drbd_pp_lock); + + for (i = 0; i < number; i++) { + page = alloc_page(GFP_HIGHUSER); + if (!page) + goto Enomem; + set_page_private(page, (unsigned long)drbd_pp_pool); + drbd_pp_pool = page; + } + drbd_pp_vacant = number; + + return 0; + +Enomem: + drbd_destroy_mempools(); /* in case we allocated some */ + return -ENOMEM; +} + +static void drbd_release_all_peer_reqs(struct drbd_device *device) +{ + int rr; + + rr = drbd_free_peer_reqs(device, &device->active_ee); + if (rr) + drbd_err(device, "%d EEs in active list found!\n", rr); + + rr = drbd_free_peer_reqs(device, &device->sync_ee); + if (rr) + drbd_err(device, "%d EEs in sync list found!\n", rr); + + rr = drbd_free_peer_reqs(device, &device->read_ee); + if (rr) + drbd_err(device, "%d EEs in read list found!\n", rr); + + rr = drbd_free_peer_reqs(device, &device->done_ee); + if (rr) + drbd_err(device, "%d EEs in done list found!\n", rr); + + rr = drbd_free_peer_reqs(device, &device->net_ee); + if (rr) + drbd_err(device, "%d EEs in net list found!\n", rr); +} + +/* caution. no locking. */ +void drbd_destroy_device(struct kref *kref) +{ + struct drbd_device *device = container_of(kref, struct drbd_device, kref); + struct drbd_resource *resource = device->resource; + struct drbd_peer_device *peer_device, *tmp_peer_device; + + del_timer_sync(&device->request_timer); + + /* paranoia asserts */ + D_ASSERT(device, device->open_cnt == 0); + /* end paranoia asserts */ + + /* cleanup stuff that may have been allocated during + * device (re-)configuration or state changes */ + + if (device->this_bdev) + bdput(device->this_bdev); + + drbd_free_ldev(device->ldev); + device->ldev = NULL; + + drbd_release_all_peer_reqs(device); + + lc_destroy(device->act_log); + lc_destroy(device->resync); + + kfree(device->p_uuid); + /* device->p_uuid = NULL; */ + + if (device->bitmap) /* should no longer be there. */ + drbd_bm_cleanup(device); + __free_page(device->md_io.page); + put_disk(device->vdisk); + blk_cleanup_queue(device->rq_queue); + kfree(device->rs_plan_s); + + /* not for_each_connection(connection, resource): + * those may have been cleaned up and disassociated already. + */ + for_each_peer_device_safe(peer_device, tmp_peer_device, device) { + kref_put(&peer_device->connection->kref, drbd_destroy_connection); + kfree(peer_device); + } + memset(device, 0xfd, sizeof(*device)); + kfree(device); + kref_put(&resource->kref, drbd_destroy_resource); +} + +/* One global retry thread, if we need to push back some bio and have it + * reinserted through our make request function. 
+ */ +static struct retry_worker { + struct workqueue_struct *wq; + struct work_struct worker; + + spinlock_t lock; + struct list_head writes; +} retry; + +static void do_retry(struct work_struct *ws) +{ + struct retry_worker *retry = container_of(ws, struct retry_worker, worker); + LIST_HEAD(writes); + struct drbd_request *req, *tmp; + + spin_lock_irq(&retry->lock); + list_splice_init(&retry->writes, &writes); + spin_unlock_irq(&retry->lock); + + list_for_each_entry_safe(req, tmp, &writes, tl_requests) { + struct drbd_device *device = req->device; + struct bio *bio = req->master_bio; + unsigned long start_jif = req->start_jif; + bool expected; + + expected = + expect(atomic_read(&req->completion_ref) == 0) && + expect(req->rq_state & RQ_POSTPONED) && + expect((req->rq_state & RQ_LOCAL_PENDING) == 0 || + (req->rq_state & RQ_LOCAL_ABORTED) != 0); + + if (!expected) + drbd_err(device, "req=%p completion_ref=%d rq_state=%x\n", + req, atomic_read(&req->completion_ref), + req->rq_state); + + /* We still need to put one kref associated with the + * "completion_ref" going zero in the code path that queued it + * here. The request object may still be referenced by a + * frozen local req->private_bio, in case we force-detached. + */ + kref_put(&req->kref, drbd_req_destroy); + + /* A single suspended or otherwise blocking device may stall + * all others as well. Fortunately, this code path is to + * recover from a situation that "should not happen": + * concurrent writes in multi-primary setup. + * In a "normal" lifecycle, this workqueue is supposed to be + * destroyed without ever doing anything. + * If it turns out to be an issue anyways, we can do per + * resource (replication group) or per device (minor) retry + * workqueues instead. + */ + + /* We are not just doing generic_make_request(), + * as we want to keep the start_time information. */ + inc_ap_bio(device); + __drbd_make_request(device, bio, start_jif); + } +} + +/* called via drbd_req_put_completion_ref(), + * holds resource->req_lock */ +void drbd_restart_request(struct drbd_request *req) +{ + unsigned long flags; + spin_lock_irqsave(&retry.lock, flags); + list_move_tail(&req->tl_requests, &retry.writes); + spin_unlock_irqrestore(&retry.lock, flags); + + /* Drop the extra reference that would otherwise + * have been dropped by complete_master_bio. + * do_retry() needs to grab a new one. */ + dec_ap_bio(req->device); + + queue_work(retry.wq, &retry.worker); +} + +void drbd_destroy_resource(struct kref *kref) +{ + struct drbd_resource *resource = + container_of(kref, struct drbd_resource, kref); + + idr_destroy(&resource->devices); + free_cpumask_var(resource->cpu_mask); + kfree(resource->name); + memset(resource, 0xf2, sizeof(*resource)); + kfree(resource); +} + +void drbd_free_resource(struct drbd_resource *resource) +{ + struct drbd_connection *connection, *tmp; + + for_each_connection_safe(connection, tmp, resource) { + list_del(&connection->connections); + drbd_debugfs_connection_cleanup(connection); + kref_put(&connection->kref, drbd_destroy_connection); + } + drbd_debugfs_resource_cleanup(resource); + kref_put(&resource->kref, drbd_destroy_resource); +} + +static void drbd_cleanup(void) +{ + unsigned int i; + struct drbd_device *device; + struct drbd_resource *resource, *tmp; + + /* first remove proc, + * drbdsetup uses it's presence to detect + * whether DRBD is loaded. + * If we would get stuck in proc removal, + * but have netlink already deregistered, + * some drbdsetup commands may wait forever + * for an answer. 
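
do_retry() above drains the shared list by splicing it onto a stack-local list while holding the spinlock and only then walking it, so the lock is never held while requests are re-submitted. A user-space sketch of that splice-then-process pattern with a pthread mutex and a minimal singly-linked list (all names hypothetical):

#include <pthread.h>
#include <stddef.h>
#include <stdio.h>

struct work_item {
	struct work_item *next;
	int id;
};

static pthread_mutex_t queue_lock = PTHREAD_MUTEX_INITIALIZER;
static struct work_item *queue_head;	/* shared, protected by queue_lock */

static void process_pending(void)
{
	struct work_item *local;

	/* splice the whole queue onto a local pointer under the lock ... */
	pthread_mutex_lock(&queue_lock);
	local = queue_head;
	queue_head = NULL;
	pthread_mutex_unlock(&queue_lock);

	/* ... and do the slow work without holding it */
	while (local) {
		struct work_item *next = local->next;

		printf("re-submitting request %d\n", local->id);
		local = next;
	}
}
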
+ */ + if (drbd_proc) + remove_proc_entry("drbd", NULL); + + if (retry.wq) + destroy_workqueue(retry.wq); + + drbd_genl_unregister(); + drbd_debugfs_cleanup(); + + idr_for_each_entry(&drbd_devices, device, i) + drbd_delete_device(device); + + /* not _rcu since, no other updater anymore. Genl already unregistered */ + for_each_resource_safe(resource, tmp, &drbd_resources) { + list_del(&resource->resources); + drbd_free_resource(resource); + } + + drbd_destroy_mempools(); + unregister_blkdev(DRBD_MAJOR, "drbd"); + + idr_destroy(&drbd_devices); + + pr_info("module cleanup done.\n"); +} + +/** + * drbd_congested() - Callback for the flusher thread + * @congested_data: User data + * @bdi_bits: Bits the BDI flusher thread is currently interested in + * + * Returns 1<<BDI_async_congested and/or 1<<BDI_sync_congested if we are congested. + */ +static int drbd_congested(void *congested_data, int bdi_bits) +{ + struct drbd_device *device = congested_data; + struct request_queue *q; + char reason = '-'; + int r = 0; + + if (!may_inc_ap_bio(device)) { + /* DRBD has frozen IO */ + r = bdi_bits; + reason = 'd'; + goto out; + } + + if (test_bit(CALLBACK_PENDING, &first_peer_device(device)->connection->flags)) { + r |= (1 << BDI_async_congested); + /* Without good local data, we would need to read from remote, + * and that would need the worker thread as well, which is + * currently blocked waiting for that usermode helper to + * finish. + */ + if (!get_ldev_if_state(device, D_UP_TO_DATE)) + r |= (1 << BDI_sync_congested); + else + put_ldev(device); + r &= bdi_bits; + reason = 'c'; + goto out; + } + + if (get_ldev(device)) { + q = bdev_get_queue(device->ldev->backing_bdev); + r = bdi_congested(&q->backing_dev_info, bdi_bits); + put_ldev(device); + if (r) + reason = 'b'; + } + + if (bdi_bits & (1 << BDI_async_congested) && + test_bit(NET_CONGESTED, &first_peer_device(device)->connection->flags)) { + r |= (1 << BDI_async_congested); + reason = reason == 'b' ? 
'a' : 'n'; + } + +out: + device->congestion_reason = reason; + return r; +} + +static void drbd_init_workqueue(struct drbd_work_queue* wq) +{ + spin_lock_init(&wq->q_lock); + INIT_LIST_HEAD(&wq->q); + init_waitqueue_head(&wq->q_wait); +} + +struct completion_work { + struct drbd_work w; + struct completion done; +}; + +static int w_complete(struct drbd_work *w, int cancel) +{ + struct completion_work *completion_work = + container_of(w, struct completion_work, w); + + complete(&completion_work->done); + return 0; +} + +void drbd_flush_workqueue(struct drbd_work_queue *work_queue) +{ + struct completion_work completion_work; + + completion_work.w.cb = w_complete; + init_completion(&completion_work.done); + drbd_queue_work(work_queue, &completion_work.w); + wait_for_completion(&completion_work.done); +} + +struct drbd_resource *drbd_find_resource(const char *name) +{ + struct drbd_resource *resource; + + if (!name || !name[0]) + return NULL; + + rcu_read_lock(); + for_each_resource_rcu(resource, &drbd_resources) { + if (!strcmp(resource->name, name)) { + kref_get(&resource->kref); + goto found; + } + } + resource = NULL; +found: + rcu_read_unlock(); + return resource; +} + +struct drbd_connection *conn_get_by_addrs(void *my_addr, int my_addr_len, + void *peer_addr, int peer_addr_len) +{ + struct drbd_resource *resource; + struct drbd_connection *connection; + + rcu_read_lock(); + for_each_resource_rcu(resource, &drbd_resources) { + for_each_connection_rcu(connection, resource) { + if (connection->my_addr_len == my_addr_len && + connection->peer_addr_len == peer_addr_len && + !memcmp(&connection->my_addr, my_addr, my_addr_len) && + !memcmp(&connection->peer_addr, peer_addr, peer_addr_len)) { + kref_get(&connection->kref); + goto found; + } + } + } + connection = NULL; +found: + rcu_read_unlock(); + return connection; +} + +static int drbd_alloc_socket(struct drbd_socket *socket) +{ + socket->rbuf = (void *) __get_free_page(GFP_KERNEL); + if (!socket->rbuf) + return -ENOMEM; + socket->sbuf = (void *) __get_free_page(GFP_KERNEL); + if (!socket->sbuf) + return -ENOMEM; + return 0; +} + +static void drbd_free_socket(struct drbd_socket *socket) +{ + free_page((unsigned long) socket->sbuf); + free_page((unsigned long) socket->rbuf); +} + +void conn_free_crypto(struct drbd_connection *connection) +{ + drbd_free_sock(connection); + + crypto_free_hash(connection->csums_tfm); + crypto_free_hash(connection->verify_tfm); + crypto_free_hash(connection->cram_hmac_tfm); + crypto_free_hash(connection->integrity_tfm); + crypto_free_hash(connection->peer_integrity_tfm); + kfree(connection->int_dig_in); + kfree(connection->int_dig_vv); + + connection->csums_tfm = NULL; + connection->verify_tfm = NULL; + connection->cram_hmac_tfm = NULL; + connection->integrity_tfm = NULL; + connection->peer_integrity_tfm = NULL; + connection->int_dig_in = NULL; + connection->int_dig_vv = NULL; +} + +int set_resource_options(struct drbd_resource *resource, struct res_opts *res_opts) +{ + struct drbd_connection *connection; + cpumask_var_t new_cpu_mask; + int err; + + if (!zalloc_cpumask_var(&new_cpu_mask, GFP_KERNEL)) + return -ENOMEM; + + /* silently ignore cpu mask on UP kernel */ + if (nr_cpu_ids > 1 && res_opts->cpu_mask[0] != 0) { + err = bitmap_parse(res_opts->cpu_mask, DRBD_CPU_MASK_SIZE, + cpumask_bits(new_cpu_mask), nr_cpu_ids); + if (err == -EOVERFLOW) { + /* So what. mask it out. 
*/ + cpumask_var_t tmp_cpu_mask; + if (zalloc_cpumask_var(&tmp_cpu_mask, GFP_KERNEL)) { + cpumask_setall(tmp_cpu_mask); + cpumask_and(new_cpu_mask, new_cpu_mask, tmp_cpu_mask); + drbd_warn(resource, "Overflow in bitmap_parse(%.12s%s), truncating to %u bits\n", + res_opts->cpu_mask, + strlen(res_opts->cpu_mask) > 12 ? "..." : "", + nr_cpu_ids); + free_cpumask_var(tmp_cpu_mask); + err = 0; + } + } + if (err) { + drbd_warn(resource, "bitmap_parse() failed with %d\n", err); + /* retcode = ERR_CPU_MASK_PARSE; */ + goto fail; + } + } + resource->res_opts = *res_opts; + if (cpumask_empty(new_cpu_mask)) + drbd_calc_cpu_mask(&new_cpu_mask); + if (!cpumask_equal(resource->cpu_mask, new_cpu_mask)) { + cpumask_copy(resource->cpu_mask, new_cpu_mask); + for_each_connection_rcu(connection, resource) { + connection->receiver.reset_cpu_mask = 1; + connection->asender.reset_cpu_mask = 1; + connection->worker.reset_cpu_mask = 1; + } + } + err = 0; + +fail: + free_cpumask_var(new_cpu_mask); + return err; + +} + +struct drbd_resource *drbd_create_resource(const char *name) +{ + struct drbd_resource *resource; + + resource = kzalloc(sizeof(struct drbd_resource), GFP_KERNEL); + if (!resource) + goto fail; + resource->name = kstrdup(name, GFP_KERNEL); + if (!resource->name) + goto fail_free_resource; + if (!zalloc_cpumask_var(&resource->cpu_mask, GFP_KERNEL)) + goto fail_free_name; + kref_init(&resource->kref); + idr_init(&resource->devices); + INIT_LIST_HEAD(&resource->connections); + resource->write_ordering = WO_bdev_flush; + list_add_tail_rcu(&resource->resources, &drbd_resources); + mutex_init(&resource->conf_update); + mutex_init(&resource->adm_mutex); + spin_lock_init(&resource->req_lock); + drbd_debugfs_resource_add(resource); + return resource; + +fail_free_name: + kfree(resource->name); +fail_free_resource: + kfree(resource); +fail: + return NULL; +} + +/* caller must be under adm_mutex */ +struct drbd_connection *conn_create(const char *name, struct res_opts *res_opts) +{ + struct drbd_resource *resource; + struct drbd_connection *connection; + + connection = kzalloc(sizeof(struct drbd_connection), GFP_KERNEL); + if (!connection) + return NULL; + + if (drbd_alloc_socket(&connection->data)) + goto fail; + if (drbd_alloc_socket(&connection->meta)) + goto fail; + + connection->current_epoch = kzalloc(sizeof(struct drbd_epoch), GFP_KERNEL); + if (!connection->current_epoch) + goto fail; + + INIT_LIST_HEAD(&connection->transfer_log); + + INIT_LIST_HEAD(&connection->current_epoch->list); + connection->epochs = 1; + spin_lock_init(&connection->epoch_lock); + + connection->send.seen_any_write_yet = false; + connection->send.current_epoch_nr = 0; + connection->send.current_epoch_writes = 0; + + resource = drbd_create_resource(name); + if (!resource) + goto fail; + + connection->cstate = C_STANDALONE; + mutex_init(&connection->cstate_mutex); + init_waitqueue_head(&connection->ping_wait); + idr_init(&connection->peer_devices); + + drbd_init_workqueue(&connection->sender_work); + mutex_init(&connection->data.mutex); + mutex_init(&connection->meta.mutex); + + drbd_thread_init(resource, &connection->receiver, drbd_receiver, "receiver"); + connection->receiver.connection = connection; + drbd_thread_init(resource, &connection->worker, drbd_worker, "worker"); + connection->worker.connection = connection; + drbd_thread_init(resource, &connection->asender, drbd_asender, "asender"); + connection->asender.connection = connection; + + kref_init(&connection->kref); + + connection->resource = resource; + + if 
(set_resource_options(resource, res_opts)) + goto fail_resource; + + kref_get(&resource->kref); + list_add_tail_rcu(&connection->connections, &resource->connections); + drbd_debugfs_connection_add(connection); + return connection; + +fail_resource: + list_del(&resource->resources); + drbd_free_resource(resource); +fail: + kfree(connection->current_epoch); + drbd_free_socket(&connection->meta); + drbd_free_socket(&connection->data); + kfree(connection); + return NULL; +} + +void drbd_destroy_connection(struct kref *kref) +{ + struct drbd_connection *connection = container_of(kref, struct drbd_connection, kref); + struct drbd_resource *resource = connection->resource; + + if (atomic_read(&connection->current_epoch->epoch_size) != 0) + drbd_err(connection, "epoch_size:%d\n", atomic_read(&connection->current_epoch->epoch_size)); + kfree(connection->current_epoch); + + idr_destroy(&connection->peer_devices); + + drbd_free_socket(&connection->meta); + drbd_free_socket(&connection->data); + kfree(connection->int_dig_in); + kfree(connection->int_dig_vv); + memset(connection, 0xfc, sizeof(*connection)); + kfree(connection); + kref_put(&resource->kref, drbd_destroy_resource); +} + +static int init_submitter(struct drbd_device *device) +{ + /* opencoded create_singlethread_workqueue(), + * to be able to say "drbd%d", ..., minor */ + device->submit.wq = alloc_workqueue("drbd%u_submit", + WQ_UNBOUND | WQ_MEM_RECLAIM, 1, device->minor); + if (!device->submit.wq) + return -ENOMEM; + + INIT_WORK(&device->submit.worker, do_submit); + INIT_LIST_HEAD(&device->submit.writes); + return 0; +} + +enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsigned int minor) +{ + struct drbd_resource *resource = adm_ctx->resource; + struct drbd_connection *connection; + struct drbd_device *device; + struct drbd_peer_device *peer_device, *tmp_peer_device; + struct gendisk *disk; + struct request_queue *q; + int id; + int vnr = adm_ctx->volume; + enum drbd_ret_code err = ERR_NOMEM; + + device = minor_to_device(minor); + if (device) + return ERR_MINOR_OR_VOLUME_EXISTS; + + /* GFP_KERNEL, we are outside of all write-out paths */ + device = kzalloc(sizeof(struct drbd_device), GFP_KERNEL); + if (!device) + return ERR_NOMEM; + kref_init(&device->kref); + + kref_get(&resource->kref); + device->resource = resource; + device->minor = minor; + device->vnr = vnr; + + drbd_init_set_defaults(device); + + q = blk_alloc_queue(GFP_KERNEL); + if (!q) + goto out_no_q; + device->rq_queue = q; + q->queuedata = device; + + disk = alloc_disk(1); + if (!disk) + goto out_no_disk; + device->vdisk = disk; + + set_disk_ro(disk, true); + + disk->queue = q; + disk->major = DRBD_MAJOR; + disk->first_minor = minor; + disk->fops = &drbd_ops; + sprintf(disk->disk_name, "drbd%d", minor); + disk->private_data = device; + + device->this_bdev = bdget(MKDEV(DRBD_MAJOR, minor)); + /* we have no partitions. we contain only ourselves. 
*/ + device->this_bdev->bd_contains = device->this_bdev; + + q->backing_dev_info.congested_fn = drbd_congested; + q->backing_dev_info.congested_data = device; + + blk_queue_make_request(q, drbd_make_request); + blk_queue_flush(q, REQ_FLUSH | REQ_FUA); + /* Setting the max_hw_sectors to an odd value of 8kibyte here + This triggers a max_bio_size message upon first attach or connect */ + blk_queue_max_hw_sectors(q, DRBD_MAX_BIO_SIZE_SAFE >> 8); + blk_queue_bounce_limit(q, BLK_BOUNCE_ANY); + blk_queue_merge_bvec(q, drbd_merge_bvec); + q->queue_lock = &resource->req_lock; + + device->md_io.page = alloc_page(GFP_KERNEL); + if (!device->md_io.page) + goto out_no_io_page; + + if (drbd_bm_init(device)) + goto out_no_bitmap; + device->read_requests = RB_ROOT; + device->write_requests = RB_ROOT; + + id = idr_alloc(&drbd_devices, device, minor, minor + 1, GFP_KERNEL); + if (id < 0) { + if (id == -ENOSPC) + err = ERR_MINOR_OR_VOLUME_EXISTS; + goto out_no_minor_idr; + } + kref_get(&device->kref); + + id = idr_alloc(&resource->devices, device, vnr, vnr + 1, GFP_KERNEL); + if (id < 0) { + if (id == -ENOSPC) + err = ERR_MINOR_OR_VOLUME_EXISTS; + goto out_idr_remove_minor; + } + kref_get(&device->kref); + + INIT_LIST_HEAD(&device->peer_devices); + INIT_LIST_HEAD(&device->pending_bitmap_io); + for_each_connection(connection, resource) { + peer_device = kzalloc(sizeof(struct drbd_peer_device), GFP_KERNEL); + if (!peer_device) + goto out_idr_remove_from_resource; + peer_device->connection = connection; + peer_device->device = device; + + list_add(&peer_device->peer_devices, &device->peer_devices); + kref_get(&device->kref); + + id = idr_alloc(&connection->peer_devices, peer_device, vnr, vnr + 1, GFP_KERNEL); + if (id < 0) { + if (id == -ENOSPC) + err = ERR_INVALID_REQUEST; + goto out_idr_remove_from_resource; + } + kref_get(&connection->kref); + } + + if (init_submitter(device)) { + err = ERR_NOMEM; + goto out_idr_remove_vol; + } + + add_disk(disk); + + /* inherit the connection state */ + device->state.conn = first_connection(resource)->cstate; + if (device->state.conn == C_WF_REPORT_PARAMS) { + for_each_peer_device(peer_device, device) + drbd_connected(peer_device); + } + /* move to create_peer_device() */ + for_each_peer_device(peer_device, device) + drbd_debugfs_peer_device_add(peer_device); + drbd_debugfs_device_add(device); + return NO_ERROR; + +out_idr_remove_vol: + idr_remove(&connection->peer_devices, vnr); +out_idr_remove_from_resource: + for_each_connection(connection, resource) { + peer_device = idr_find(&connection->peer_devices, vnr); + if (peer_device) { + idr_remove(&connection->peer_devices, vnr); + kref_put(&connection->kref, drbd_destroy_connection); + } + } + for_each_peer_device_safe(peer_device, tmp_peer_device, device) { + list_del(&peer_device->peer_devices); + kfree(peer_device); + } + idr_remove(&resource->devices, vnr); +out_idr_remove_minor: + idr_remove(&drbd_devices, minor); + synchronize_rcu(); +out_no_minor_idr: + drbd_bm_cleanup(device); +out_no_bitmap: + __free_page(device->md_io.page); +out_no_io_page: + put_disk(disk); +out_no_disk: + blk_cleanup_queue(q); +out_no_q: + kref_put(&resource->kref, drbd_destroy_resource); + kfree(device); + return err; +} + +void drbd_delete_device(struct drbd_device *device) +{ + struct drbd_resource *resource = device->resource; + struct drbd_connection *connection; + struct drbd_peer_device *peer_device; + int refs = 3; + + /* move to free_peer_device() */ + for_each_peer_device(peer_device, device) + 
drbd_debugfs_peer_device_cleanup(peer_device); + drbd_debugfs_device_cleanup(device); + for_each_connection(connection, resource) { + idr_remove(&connection->peer_devices, device->vnr); + refs++; + } + idr_remove(&resource->devices, device->vnr); + idr_remove(&drbd_devices, device_to_minor(device)); + del_gendisk(device->vdisk); + synchronize_rcu(); + kref_sub(&device->kref, refs, drbd_destroy_device); +} + +static int __init drbd_init(void) +{ + int err; + + if (minor_count < DRBD_MINOR_COUNT_MIN || minor_count > DRBD_MINOR_COUNT_MAX) { + pr_err("invalid minor_count (%d)\n", minor_count); +#ifdef MODULE + return -EINVAL; +#else + minor_count = DRBD_MINOR_COUNT_DEF; +#endif + } + + err = register_blkdev(DRBD_MAJOR, "drbd"); + if (err) { + pr_err("unable to register block device major %d\n", + DRBD_MAJOR); + return err; + } + + /* + * allocate all necessary structs + */ + init_waitqueue_head(&drbd_pp_wait); + + drbd_proc = NULL; /* play safe for drbd_cleanup */ + idr_init(&drbd_devices); + + rwlock_init(&global_state_lock); + INIT_LIST_HEAD(&drbd_resources); + + err = drbd_genl_register(); + if (err) { + pr_err("unable to register generic netlink family\n"); + goto fail; + } + + err = drbd_create_mempools(); + if (err) + goto fail; + + err = -ENOMEM; + drbd_proc = proc_create_data("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops, NULL); + if (!drbd_proc) { + pr_err("unable to register proc file\n"); + goto fail; + } + + retry.wq = create_singlethread_workqueue("drbd-reissue"); + if (!retry.wq) { + pr_err("unable to create retry workqueue\n"); + goto fail; + } + INIT_WORK(&retry.worker, do_retry); + spin_lock_init(&retry.lock); + INIT_LIST_HEAD(&retry.writes); + + if (drbd_debugfs_init()) + pr_notice("failed to initialize debugfs -- will not be available\n"); + + pr_info("initialized. " + "Version: " REL_VERSION " (api:%d/proto:%d-%d)\n", + API_VERSION, PRO_VERSION_MIN, PRO_VERSION_MAX); + pr_info("%s\n", drbd_buildtag()); + pr_info("registered as block device major %d\n", DRBD_MAJOR); + return 0; /* Success! */ + +fail: + drbd_cleanup(); + if (err == -ENOMEM) + pr_err("ran out of memory\n"); + else + pr_err("initialization failure\n"); + return err; +} + +void drbd_free_ldev(struct drbd_backing_dev *ldev) +{ + if (ldev == NULL) + return; + + blkdev_put(ldev->backing_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); + blkdev_put(ldev->md_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); + + kfree(ldev->disk_conf); + kfree(ldev); +} + +static void drbd_free_one_sock(struct drbd_socket *ds) +{ + struct socket *s; + mutex_lock(&ds->mutex); + s = ds->socket; + ds->socket = NULL; + mutex_unlock(&ds->mutex); + if (s) { + /* so debugfs does not need to mutex_lock() */ + synchronize_rcu(); + kernel_sock_shutdown(s, SHUT_RDWR); + sock_release(s); + } +} + +void drbd_free_sock(struct drbd_connection *connection) +{ + if (connection->data.socket) + drbd_free_one_sock(&connection->data); + if (connection->meta.socket) + drbd_free_one_sock(&connection->meta); +} + +/* meta data management */ + +void conn_md_sync(struct drbd_connection *connection) +{ + struct drbd_peer_device *peer_device; + int vnr; + + rcu_read_lock(); + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { + struct drbd_device *device = peer_device->device; + + kref_get(&device->kref); + rcu_read_unlock(); + drbd_md_sync(device); + kref_put(&device->kref, drbd_destroy_device); + rcu_read_lock(); + } + rcu_read_unlock(); +} + +/* aligned 4kByte */ +struct meta_data_on_disk { + u64 la_size_sect; /* last agreed size. 
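+				 * In units of 512-byte sectors; drbd_md_write() fills it in
+				 * from drbd_get_capacity() of the DRBD device.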
*/ + u64 uuid[UI_SIZE]; /* UUIDs. */ + u64 device_uuid; + u64 reserved_u64_1; + u32 flags; /* MDF */ + u32 magic; + u32 md_size_sect; + u32 al_offset; /* offset to this block */ + u32 al_nr_extents; /* important for restoring the AL (userspace) */ + /* `-- act_log->nr_elements <-- ldev->dc.al_extents */ + u32 bm_offset; /* offset to the bitmap, from here */ + u32 bm_bytes_per_bit; /* BM_BLOCK_SIZE */ + u32 la_peer_max_bio_size; /* last peer max_bio_size */ + + /* see al_tr_number_to_on_disk_sector() */ + u32 al_stripes; + u32 al_stripe_size_4k; + + u8 reserved_u8[4096 - (7*8 + 10*4)]; +} __packed; + + + +void drbd_md_write(struct drbd_device *device, void *b) +{ + struct meta_data_on_disk *buffer = b; + sector_t sector; + int i; + + memset(buffer, 0, sizeof(*buffer)); + + buffer->la_size_sect = cpu_to_be64(drbd_get_capacity(device->this_bdev)); + for (i = UI_CURRENT; i < UI_SIZE; i++) + buffer->uuid[i] = cpu_to_be64(device->ldev->md.uuid[i]); + buffer->flags = cpu_to_be32(device->ldev->md.flags); + buffer->magic = cpu_to_be32(DRBD_MD_MAGIC_84_UNCLEAN); + + buffer->md_size_sect = cpu_to_be32(device->ldev->md.md_size_sect); + buffer->al_offset = cpu_to_be32(device->ldev->md.al_offset); + buffer->al_nr_extents = cpu_to_be32(device->act_log->nr_elements); + buffer->bm_bytes_per_bit = cpu_to_be32(BM_BLOCK_SIZE); + buffer->device_uuid = cpu_to_be64(device->ldev->md.device_uuid); + + buffer->bm_offset = cpu_to_be32(device->ldev->md.bm_offset); + buffer->la_peer_max_bio_size = cpu_to_be32(device->peer_max_bio_size); + + buffer->al_stripes = cpu_to_be32(device->ldev->md.al_stripes); + buffer->al_stripe_size_4k = cpu_to_be32(device->ldev->md.al_stripe_size_4k); + + D_ASSERT(device, drbd_md_ss(device->ldev) == device->ldev->md.md_offset); + sector = device->ldev->md.md_offset; + + if (drbd_md_sync_page_io(device, device->ldev, sector, WRITE)) { + /* this was a try anyways ... */ + drbd_err(device, "meta data update failed!\n"); + drbd_chk_io_error(device, 1, DRBD_META_IO_ERROR); + } +} + +/** + * drbd_md_sync() - Writes the meta data super block if the MD_DIRTY flag bit is set + * @device: DRBD device. + */ +void drbd_md_sync(struct drbd_device *device) +{ + struct meta_data_on_disk *buffer; + + /* Don't accidentally change the DRBD meta data layout. */ + BUILD_BUG_ON(UI_SIZE != 4); + BUILD_BUG_ON(sizeof(struct meta_data_on_disk) != 4096); + + del_timer(&device->md_sync_timer); + /* timer may be rearmed by drbd_md_mark_dirty() now. */ + if (!test_and_clear_bit(MD_DIRTY, &device->flags)) + return; + + /* We use here D_FAILED and not D_ATTACHING because we try to write + * metadata even if we detach due to a disk failure! */ + if (!get_ldev_if_state(device, D_FAILED)) + return; + + buffer = drbd_md_get_buffer(device, __func__); + if (!buffer) + goto out; + + drbd_md_write(device, buffer); + + /* Update device->ldev->md.la_size_sect, + * since we updated it on metadata. 
*/ + device->ldev->md.la_size_sect = drbd_get_capacity(device->this_bdev); + + drbd_md_put_buffer(device); +out: + put_ldev(device); +} + +static int check_activity_log_stripe_size(struct drbd_device *device, + struct meta_data_on_disk *on_disk, + struct drbd_md *in_core) +{ + u32 al_stripes = be32_to_cpu(on_disk->al_stripes); + u32 al_stripe_size_4k = be32_to_cpu(on_disk->al_stripe_size_4k); + u64 al_size_4k; + + /* both not set: default to old fixed size activity log */ + if (al_stripes == 0 && al_stripe_size_4k == 0) { + al_stripes = 1; + al_stripe_size_4k = MD_32kB_SECT/8; + } + + /* some paranoia plausibility checks */ + + /* we need both values to be set */ + if (al_stripes == 0 || al_stripe_size_4k == 0) + goto err; + + al_size_4k = (u64)al_stripes * al_stripe_size_4k; + + /* Upper limit of activity log area, to avoid potential overflow + * problems in al_tr_number_to_on_disk_sector(). As right now, more + * than 72 * 4k blocks total only increases the amount of history, + * limiting this arbitrarily to 16 GB is not a real limitation ;-) */ + if (al_size_4k > (16 * 1024 * 1024/4)) + goto err; + + /* Lower limit: we need at least 8 transaction slots (32kB) + * to not break existing setups */ + if (al_size_4k < MD_32kB_SECT/8) + goto err; + + in_core->al_stripe_size_4k = al_stripe_size_4k; + in_core->al_stripes = al_stripes; + in_core->al_size_4k = al_size_4k; + + return 0; +err: + drbd_err(device, "invalid activity log striping: al_stripes=%u, al_stripe_size_4k=%u\n", + al_stripes, al_stripe_size_4k); + return -EINVAL; +} + +static int check_offsets_and_sizes(struct drbd_device *device, struct drbd_backing_dev *bdev) +{ + sector_t capacity = drbd_get_capacity(bdev->md_bdev); + struct drbd_md *in_core = &bdev->md; + s32 on_disk_al_sect; + s32 on_disk_bm_sect; + + /* The on-disk size of the activity log, calculated from offsets, and + * the size of the activity log calculated from the stripe settings, + * should match. + * Though we could relax this a bit: it is ok, if the striped activity log + * fits in the available on-disk activity log size. + * Right now, that would break how resize is implemented. + * TODO: make drbd_determine_dev_size() (and the drbdmeta tool) aware + * of possible unused padding space in the on disk layout. */ + if (in_core->al_offset < 0) { + if (in_core->bm_offset > in_core->al_offset) + goto err; + on_disk_al_sect = -in_core->al_offset; + on_disk_bm_sect = in_core->al_offset - in_core->bm_offset; + } else { + if (in_core->al_offset != MD_4kB_SECT) + goto err; + if (in_core->bm_offset < in_core->al_offset + in_core->al_size_4k * MD_4kB_SECT) + goto err; + + on_disk_al_sect = in_core->bm_offset - MD_4kB_SECT; + on_disk_bm_sect = in_core->md_size_sect - in_core->bm_offset; + } + + /* old fixed size meta data is exactly that: fixed. */ + if (in_core->meta_dev_idx >= 0) { + if (in_core->md_size_sect != MD_128MB_SECT + || in_core->al_offset != MD_4kB_SECT + || in_core->bm_offset != MD_4kB_SECT + MD_32kB_SECT + || in_core->al_stripes != 1 + || in_core->al_stripe_size_4k != MD_32kB_SECT/8) + goto err; + } + + if (capacity < in_core->md_size_sect) + goto err; + if (capacity - in_core->md_size_sect < drbd_md_first_sector(bdev)) + goto err; + + /* should be aligned, and at least 32k */ + if ((on_disk_al_sect & 7) || (on_disk_al_sect < MD_32kB_SECT)) + goto err; + + /* should fit (for now: exactly) into the available on-disk space; + * overflow prevention is in check_activity_log_stripe_size() above. 
*/ + if (on_disk_al_sect != in_core->al_size_4k * MD_4kB_SECT) + goto err; + + /* again, should be aligned */ + if (in_core->bm_offset & 7) + goto err; + + /* FIXME check for device grow with flex external meta data? */ + + /* can the available bitmap space cover the last agreed device size? */ + if (on_disk_bm_sect < (in_core->la_size_sect+7)/MD_4kB_SECT/8/512) + goto err; + + return 0; + +err: + drbd_err(device, "meta data offsets don't make sense: idx=%d " + "al_s=%u, al_sz4k=%u, al_offset=%d, bm_offset=%d, " + "md_size_sect=%u, la_size=%llu, md_capacity=%llu\n", + in_core->meta_dev_idx, + in_core->al_stripes, in_core->al_stripe_size_4k, + in_core->al_offset, in_core->bm_offset, in_core->md_size_sect, + (unsigned long long)in_core->la_size_sect, + (unsigned long long)capacity); + + return -EINVAL; +} + + +/** + * drbd_md_read() - Reads in the meta data super block + * @device: DRBD device. + * @bdev: Device from which the meta data should be read in. + * + * Return NO_ERROR on success, and an enum drbd_ret_code in case + * something goes wrong. + * + * Called exactly once during drbd_adm_attach(), while still being D_DISKLESS, + * even before @bdev is assigned to @device->ldev. + */ +int drbd_md_read(struct drbd_device *device, struct drbd_backing_dev *bdev) +{ + struct meta_data_on_disk *buffer; + u32 magic, flags; + int i, rv = NO_ERROR; + + if (device->state.disk != D_DISKLESS) + return ERR_DISK_CONFIGURED; + + buffer = drbd_md_get_buffer(device, __func__); + if (!buffer) + return ERR_NOMEM; + + /* First, figure out where our meta data superblock is located, + * and read it. */ + bdev->md.meta_dev_idx = bdev->disk_conf->meta_dev_idx; + bdev->md.md_offset = drbd_md_ss(bdev); + + if (drbd_md_sync_page_io(device, bdev, bdev->md.md_offset, READ)) { + /* NOTE: can't do normal error processing here as this is + called BEFORE disk is attached */ + drbd_err(device, "Error while reading metadata.\n"); + rv = ERR_IO_MD_DISK; + goto err; + } + + magic = be32_to_cpu(buffer->magic); + flags = be32_to_cpu(buffer->flags); + if (magic == DRBD_MD_MAGIC_84_UNCLEAN || + (magic == DRBD_MD_MAGIC_08 && !(flags & MDF_AL_CLEAN))) { + /* btw: that's Activity Log clean, not "all" clean. */ + drbd_err(device, "Found unclean meta data. Did you \"drbdadm apply-al\"?\n"); + rv = ERR_MD_UNCLEAN; + goto err; + } + + rv = ERR_MD_INVALID; + if (magic != DRBD_MD_MAGIC_08) { + if (magic == DRBD_MD_MAGIC_07) + drbd_err(device, "Found old (0.7) meta data magic. Did you \"drbdadm create-md\"?\n"); + else + drbd_err(device, "Meta data magic not found. 
Did you \"drbdadm create-md\"?\n"); + goto err; + } + + if (be32_to_cpu(buffer->bm_bytes_per_bit) != BM_BLOCK_SIZE) { + drbd_err(device, "unexpected bm_bytes_per_bit: %u (expected %u)\n", + be32_to_cpu(buffer->bm_bytes_per_bit), BM_BLOCK_SIZE); + goto err; + } + + + /* convert to in_core endian */ + bdev->md.la_size_sect = be64_to_cpu(buffer->la_size_sect); + for (i = UI_CURRENT; i < UI_SIZE; i++) + bdev->md.uuid[i] = be64_to_cpu(buffer->uuid[i]); + bdev->md.flags = be32_to_cpu(buffer->flags); + bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid); + + bdev->md.md_size_sect = be32_to_cpu(buffer->md_size_sect); + bdev->md.al_offset = be32_to_cpu(buffer->al_offset); + bdev->md.bm_offset = be32_to_cpu(buffer->bm_offset); + + if (check_activity_log_stripe_size(device, buffer, &bdev->md)) + goto err; + if (check_offsets_and_sizes(device, bdev)) + goto err; + + if (be32_to_cpu(buffer->bm_offset) != bdev->md.bm_offset) { + drbd_err(device, "unexpected bm_offset: %d (expected %d)\n", + be32_to_cpu(buffer->bm_offset), bdev->md.bm_offset); + goto err; + } + if (be32_to_cpu(buffer->md_size_sect) != bdev->md.md_size_sect) { + drbd_err(device, "unexpected md_size: %u (expected %u)\n", + be32_to_cpu(buffer->md_size_sect), bdev->md.md_size_sect); + goto err; + } + + rv = NO_ERROR; + + spin_lock_irq(&device->resource->req_lock); + if (device->state.conn < C_CONNECTED) { + unsigned int peer; + peer = be32_to_cpu(buffer->la_peer_max_bio_size); + peer = max(peer, DRBD_MAX_BIO_SIZE_SAFE); + device->peer_max_bio_size = peer; + } + spin_unlock_irq(&device->resource->req_lock); + + err: + drbd_md_put_buffer(device); + + return rv; +} + +/** + * drbd_md_mark_dirty() - Mark meta data super block as dirty + * @device: DRBD device. + * + * Call this function if you change anything that should be written to + * the meta-data super block. This function sets MD_DIRTY, and starts a + * timer that ensures that within five seconds you have to call drbd_md_sync(). 
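+ *
+ * A typical caller changes an in-core md member and then marks it dirty;
+ * e.g. drbd_md_set_flag() further below does
+ *	drbd_md_mark_dirty(device);
+ *	device->ldev->md.flags |= flag;
+ * and relies on the timer (or an explicit drbd_md_sync()) to get the change
+ * to stable storage.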
+ */ +#ifdef DEBUG +void drbd_md_mark_dirty_(struct drbd_device *device, unsigned int line, const char *func) +{ + if (!test_and_set_bit(MD_DIRTY, &device->flags)) { + mod_timer(&device->md_sync_timer, jiffies + HZ); + device->last_md_mark_dirty.line = line; + device->last_md_mark_dirty.func = func; + } +} +#else +void drbd_md_mark_dirty(struct drbd_device *device) +{ + if (!test_and_set_bit(MD_DIRTY, &device->flags)) + mod_timer(&device->md_sync_timer, jiffies + 5*HZ); +} +#endif + +void drbd_uuid_move_history(struct drbd_device *device) __must_hold(local) +{ + int i; + + for (i = UI_HISTORY_START; i < UI_HISTORY_END; i++) + device->ldev->md.uuid[i+1] = device->ldev->md.uuid[i]; +} + +void __drbd_uuid_set(struct drbd_device *device, int idx, u64 val) __must_hold(local) +{ + if (idx == UI_CURRENT) { + if (device->state.role == R_PRIMARY) + val |= 1; + else + val &= ~((u64)1); + + drbd_set_ed_uuid(device, val); + } + + device->ldev->md.uuid[idx] = val; + drbd_md_mark_dirty(device); +} + +void _drbd_uuid_set(struct drbd_device *device, int idx, u64 val) __must_hold(local) +{ + unsigned long flags; + spin_lock_irqsave(&device->ldev->md.uuid_lock, flags); + __drbd_uuid_set(device, idx, val); + spin_unlock_irqrestore(&device->ldev->md.uuid_lock, flags); +} + +void drbd_uuid_set(struct drbd_device *device, int idx, u64 val) __must_hold(local) +{ + unsigned long flags; + spin_lock_irqsave(&device->ldev->md.uuid_lock, flags); + if (device->ldev->md.uuid[idx]) { + drbd_uuid_move_history(device); + device->ldev->md.uuid[UI_HISTORY_START] = device->ldev->md.uuid[idx]; + } + __drbd_uuid_set(device, idx, val); + spin_unlock_irqrestore(&device->ldev->md.uuid_lock, flags); +} + +/** + * drbd_uuid_new_current() - Creates a new current UUID + * @device: DRBD device. + * + * Creates a new current UUID, and rotates the old current UUID into + * the bitmap slot. Causes an incremental resync upon next connect. + */ +void drbd_uuid_new_current(struct drbd_device *device) __must_hold(local) +{ + u64 val; + unsigned long long bm_uuid; + + get_random_bytes(&val, sizeof(u64)); + + spin_lock_irq(&device->ldev->md.uuid_lock); + bm_uuid = device->ldev->md.uuid[UI_BITMAP]; + + if (bm_uuid) + drbd_warn(device, "bm UUID was already set: %llX\n", bm_uuid); + + device->ldev->md.uuid[UI_BITMAP] = device->ldev->md.uuid[UI_CURRENT]; + __drbd_uuid_set(device, UI_CURRENT, val); + spin_unlock_irq(&device->ldev->md.uuid_lock); + + drbd_print_uuids(device, "new current UUID"); + /* get it to stable storage _now_ */ + drbd_md_sync(device); +} + +void drbd_uuid_set_bm(struct drbd_device *device, u64 val) __must_hold(local) +{ + unsigned long flags; + if (device->ldev->md.uuid[UI_BITMAP] == 0 && val == 0) + return; + + spin_lock_irqsave(&device->ldev->md.uuid_lock, flags); + if (val == 0) { + drbd_uuid_move_history(device); + device->ldev->md.uuid[UI_HISTORY_START] = device->ldev->md.uuid[UI_BITMAP]; + device->ldev->md.uuid[UI_BITMAP] = 0; + } else { + unsigned long long bm_uuid = device->ldev->md.uuid[UI_BITMAP]; + if (bm_uuid) + drbd_warn(device, "bm UUID was already set: %llX\n", bm_uuid); + + device->ldev->md.uuid[UI_BITMAP] = val & ~((u64)1); + } + spin_unlock_irqrestore(&device->ldev->md.uuid_lock, flags); + + drbd_md_mark_dirty(device); +} + +/** + * drbd_bmio_set_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io() + * @device: DRBD device. + * + * Sets all bits in the bitmap and writes the whole bitmap to stable storage. 
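+ * MDF_FULL_SYNC is set and synced to disk before the bitmap is written, and
+ * only cleared (and synced again) if the bitmap write succeeded, so a crash
+ * in between still leaves the full-sync marker on stable storage.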
+ */ +int drbd_bmio_set_n_write(struct drbd_device *device) __must_hold(local) +{ + int rv = -EIO; + + drbd_md_set_flag(device, MDF_FULL_SYNC); + drbd_md_sync(device); + drbd_bm_set_all(device); + + rv = drbd_bm_write(device); + + if (!rv) { + drbd_md_clear_flag(device, MDF_FULL_SYNC); + drbd_md_sync(device); + } + + return rv; +} + +/** + * drbd_bmio_clear_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io() + * @device: DRBD device. + * + * Clears all bits in the bitmap and writes the whole bitmap to stable storage. + */ +int drbd_bmio_clear_n_write(struct drbd_device *device) __must_hold(local) +{ + drbd_resume_al(device); + drbd_bm_clear_all(device); + return drbd_bm_write(device); +} + +static int w_bitmap_io(struct drbd_work *w, int unused) +{ + struct drbd_device *device = + container_of(w, struct drbd_device, bm_io_work.w); + struct bm_io_work *work = &device->bm_io_work; + int rv = -EIO; + + D_ASSERT(device, atomic_read(&device->ap_bio_cnt) == 0); + + if (get_ldev(device)) { + drbd_bm_lock(device, work->why, work->flags); + rv = work->io_fn(device); + drbd_bm_unlock(device); + put_ldev(device); + } + + clear_bit_unlock(BITMAP_IO, &device->flags); + wake_up(&device->misc_wait); + + if (work->done) + work->done(device, rv); + + clear_bit(BITMAP_IO_QUEUED, &device->flags); + work->why = NULL; + work->flags = 0; + + return 0; +} + +/** + * drbd_queue_bitmap_io() - Queues an IO operation on the whole bitmap + * @device: DRBD device. + * @io_fn: IO callback to be called when bitmap IO is possible + * @done: callback to be called after the bitmap IO was performed + * @why: Descriptive text of the reason for doing the IO + * + * While IO on the bitmap happens we freeze application IO thus we ensure + * that drbd_set_out_of_sync() can not be called. This function MAY ONLY be + * called from worker context. It MUST NOT be used while a previous such + * work is still pending! + * + * Its worker function encloses the call of io_fn() by get_ldev() and + * put_ldev(). + */ +void drbd_queue_bitmap_io(struct drbd_device *device, + int (*io_fn)(struct drbd_device *), + void (*done)(struct drbd_device *, int), + char *why, enum bm_flag flags) +{ + D_ASSERT(device, current == first_peer_device(device)->connection->worker.task); + + D_ASSERT(device, !test_bit(BITMAP_IO_QUEUED, &device->flags)); + D_ASSERT(device, !test_bit(BITMAP_IO, &device->flags)); + D_ASSERT(device, list_empty(&device->bm_io_work.w.list)); + if (device->bm_io_work.why) + drbd_err(device, "FIXME going to queue '%s' but '%s' still pending?\n", + why, device->bm_io_work.why); + + device->bm_io_work.io_fn = io_fn; + device->bm_io_work.done = done; + device->bm_io_work.why = why; + device->bm_io_work.flags = flags; + + spin_lock_irq(&device->resource->req_lock); + set_bit(BITMAP_IO, &device->flags); + if (atomic_read(&device->ap_bio_cnt) == 0) { + if (!test_and_set_bit(BITMAP_IO_QUEUED, &device->flags)) + drbd_queue_work(&first_peer_device(device)->connection->sender_work, + &device->bm_io_work.w); + } + spin_unlock_irq(&device->resource->req_lock); +} + +/** + * drbd_bitmap_io() - Does an IO operation on the whole bitmap + * @device: DRBD device. + * @io_fn: IO callback to be called when bitmap IO is possible + * @why: Descriptive text of the reason for doing the IO + * + * freezes application IO while that the actual IO operations runs. This + * functions MAY NOT be called from worker context. 
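+ * (Unless BM_LOCKED_SET_ALLOWED is set in @flags, in which case application
+ * IO is not suspended around the bitmap IO; see the body below.)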
+ */ +int drbd_bitmap_io(struct drbd_device *device, int (*io_fn)(struct drbd_device *), + char *why, enum bm_flag flags) +{ + int rv; + + D_ASSERT(device, current != first_peer_device(device)->connection->worker.task); + + if ((flags & BM_LOCKED_SET_ALLOWED) == 0) + drbd_suspend_io(device); + + drbd_bm_lock(device, why, flags); + rv = io_fn(device); + drbd_bm_unlock(device); + + if ((flags & BM_LOCKED_SET_ALLOWED) == 0) + drbd_resume_io(device); + + return rv; +} + +void drbd_md_set_flag(struct drbd_device *device, int flag) __must_hold(local) +{ + if ((device->ldev->md.flags & flag) != flag) { + drbd_md_mark_dirty(device); + device->ldev->md.flags |= flag; + } +} + +void drbd_md_clear_flag(struct drbd_device *device, int flag) __must_hold(local) +{ + if ((device->ldev->md.flags & flag) != 0) { + drbd_md_mark_dirty(device); + device->ldev->md.flags &= ~flag; + } +} +int drbd_md_test_flag(struct drbd_backing_dev *bdev, int flag) +{ + return (bdev->md.flags & flag) != 0; +} + +static void md_sync_timer_fn(unsigned long data) +{ + struct drbd_device *device = (struct drbd_device *) data; + drbd_device_post_work(device, MD_SYNC); +} + +const char *cmdname(enum drbd_packet cmd) +{ + /* THINK may need to become several global tables + * when we want to support more than + * one PRO_VERSION */ + static const char *cmdnames[] = { + [P_DATA] = "Data", + [P_DATA_REPLY] = "DataReply", + [P_RS_DATA_REPLY] = "RSDataReply", + [P_BARRIER] = "Barrier", + [P_BITMAP] = "ReportBitMap", + [P_BECOME_SYNC_TARGET] = "BecomeSyncTarget", + [P_BECOME_SYNC_SOURCE] = "BecomeSyncSource", + [P_UNPLUG_REMOTE] = "UnplugRemote", + [P_DATA_REQUEST] = "DataRequest", + [P_RS_DATA_REQUEST] = "RSDataRequest", + [P_SYNC_PARAM] = "SyncParam", + [P_SYNC_PARAM89] = "SyncParam89", + [P_PROTOCOL] = "ReportProtocol", + [P_UUIDS] = "ReportUUIDs", + [P_SIZES] = "ReportSizes", + [P_STATE] = "ReportState", + [P_SYNC_UUID] = "ReportSyncUUID", + [P_AUTH_CHALLENGE] = "AuthChallenge", + [P_AUTH_RESPONSE] = "AuthResponse", + [P_PING] = "Ping", + [P_PING_ACK] = "PingAck", + [P_RECV_ACK] = "RecvAck", + [P_WRITE_ACK] = "WriteAck", + [P_RS_WRITE_ACK] = "RSWriteAck", + [P_SUPERSEDED] = "Superseded", + [P_NEG_ACK] = "NegAck", + [P_NEG_DREPLY] = "NegDReply", + [P_NEG_RS_DREPLY] = "NegRSDReply", + [P_BARRIER_ACK] = "BarrierAck", + [P_STATE_CHG_REQ] = "StateChgRequest", + [P_STATE_CHG_REPLY] = "StateChgReply", + [P_OV_REQUEST] = "OVRequest", + [P_OV_REPLY] = "OVReply", + [P_OV_RESULT] = "OVResult", + [P_CSUM_RS_REQUEST] = "CsumRSRequest", + [P_RS_IS_IN_SYNC] = "CsumRSIsInSync", + [P_COMPRESSED_BITMAP] = "CBitmap", + [P_DELAY_PROBE] = "DelayProbe", + [P_OUT_OF_SYNC] = "OutOfSync", + [P_RETRY_WRITE] = "RetryWrite", + [P_RS_CANCEL] = "RSCancel", + [P_CONN_ST_CHG_REQ] = "conn_st_chg_req", + [P_CONN_ST_CHG_REPLY] = "conn_st_chg_reply", + [P_RETRY_WRITE] = "retry_write", + [P_PROTOCOL_UPDATE] = "protocol_update", + + /* enum drbd_packet, but not commands - obsoleted flags: + * P_MAY_IGNORE + * P_MAX_OPT_CMD + */ + }; + + /* too big for the array: 0xfffX */ + if (cmd == P_INITIAL_META) + return "InitialMeta"; + if (cmd == P_INITIAL_DATA) + return "InitialData"; + if (cmd == P_CONNECTION_FEATURES) + return "ConnectionFeatures"; + if (cmd >= ARRAY_SIZE(cmdnames)) + return "Unknown"; + return cmdnames[cmd]; +} + +/** + * drbd_wait_misc - wait for a request to make progress + * @device: device associated with the request + * @i: the struct drbd_interval embedded in struct drbd_request or + * struct drbd_peer_request + */ +int drbd_wait_misc(struct drbd_device 
*device, struct drbd_interval *i) +{ + struct net_conf *nc; + DEFINE_WAIT(wait); + long timeout; + + rcu_read_lock(); + nc = rcu_dereference(first_peer_device(device)->connection->net_conf); + if (!nc) { + rcu_read_unlock(); + return -ETIMEDOUT; + } + timeout = nc->ko_count ? nc->timeout * HZ / 10 * nc->ko_count : MAX_SCHEDULE_TIMEOUT; + rcu_read_unlock(); + + /* Indicate to wake up device->misc_wait on progress. */ + i->waiting = true; + prepare_to_wait(&device->misc_wait, &wait, TASK_INTERRUPTIBLE); + spin_unlock_irq(&device->resource->req_lock); + timeout = schedule_timeout(timeout); + finish_wait(&device->misc_wait, &wait); + spin_lock_irq(&device->resource->req_lock); + if (!timeout || device->state.conn < C_CONNECTED) + return -ETIMEDOUT; + if (signal_pending(current)) + return -ERESTARTSYS; + return 0; +} + +#ifdef CONFIG_DRBD_FAULT_INJECTION +/* Fault insertion support including random number generator shamelessly + * stolen from kernel/rcutorture.c */ +struct fault_random_state { + unsigned long state; + unsigned long count; +}; + +#define FAULT_RANDOM_MULT 39916801 /* prime */ +#define FAULT_RANDOM_ADD 479001701 /* prime */ +#define FAULT_RANDOM_REFRESH 10000 + +/* + * Crude but fast random-number generator. Uses a linear congruential + * generator, with occasional help from get_random_bytes(). + */ +static unsigned long +_drbd_fault_random(struct fault_random_state *rsp) +{ + long refresh; + + if (!rsp->count--) { + get_random_bytes(&refresh, sizeof(refresh)); + rsp->state += refresh; + rsp->count = FAULT_RANDOM_REFRESH; + } + rsp->state = rsp->state * FAULT_RANDOM_MULT + FAULT_RANDOM_ADD; + return swahw32(rsp->state); +} + +static char * +_drbd_fault_str(unsigned int type) { + static char *_faults[] = { + [DRBD_FAULT_MD_WR] = "Meta-data write", + [DRBD_FAULT_MD_RD] = "Meta-data read", + [DRBD_FAULT_RS_WR] = "Resync write", + [DRBD_FAULT_RS_RD] = "Resync read", + [DRBD_FAULT_DT_WR] = "Data write", + [DRBD_FAULT_DT_RD] = "Data read", + [DRBD_FAULT_DT_RA] = "Data read ahead", + [DRBD_FAULT_BM_ALLOC] = "BM allocation", + [DRBD_FAULT_AL_EE] = "EE allocation", + [DRBD_FAULT_RECEIVE] = "receive data corruption", + }; + + return (type < DRBD_FAULT_MAX) ? _faults[type] : "**Unknown**"; +} + +unsigned int +_drbd_insert_fault(struct drbd_device *device, unsigned int type) +{ + static struct fault_random_state rrs = {0, 0}; + + unsigned int ret = ( + (fault_devs == 0 || + ((1 << device_to_minor(device)) & fault_devs) != 0) && + (((_drbd_fault_random(&rrs) % 100) + 1) <= fault_rate)); + + if (ret) { + fault_count++; + + if (__ratelimit(&drbd_ratelimit_state)) + drbd_warn(device, "***Simulating %s failure\n", + _drbd_fault_str(type)); + } + + return ret; +} +#endif + +const char *drbd_buildtag(void) +{ + /* DRBD built from external sources has here a reference to the + git hash of the source code. */ + + static char buildtag[38] = "\0uilt-in"; + + if (buildtag[0] == 0) { +#ifdef MODULE + sprintf(buildtag, "srcversion: %-24s", THIS_MODULE->srcversion); +#else + buildtag[0] = 'b'; +#endif + } + + return buildtag; +} + +module_init(drbd_init) +module_exit(drbd_cleanup) + +EXPORT_SYMBOL(drbd_conn_str); +EXPORT_SYMBOL(drbd_role_str); +EXPORT_SYMBOL(drbd_disk_str); +EXPORT_SYMBOL(drbd_set_st_err_str); diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c new file mode 100644 index 000000000..74df8cfad --- /dev/null +++ b/drivers/block/drbd/drbd_nl.c @@ -0,0 +1,3674 @@ +/* + drbd_nl.c + + This file is part of DRBD by Philipp Reisner and Lars Ellenberg. 
+ + Copyright (C) 2001-2008, LINBIT Information Technologies GmbH. + Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>. + Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. + + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/module.h> +#include <linux/drbd.h> +#include <linux/in.h> +#include <linux/fs.h> +#include <linux/file.h> +#include <linux/slab.h> +#include <linux/blkpg.h> +#include <linux/cpumask.h> +#include "drbd_int.h" +#include "drbd_protocol.h" +#include "drbd_req.h" +#include <asm/unaligned.h> +#include <linux/drbd_limits.h> +#include <linux/kthread.h> + +#include <net/genetlink.h> + +/* .doit */ +// int drbd_adm_create_resource(struct sk_buff *skb, struct genl_info *info); +// int drbd_adm_delete_resource(struct sk_buff *skb, struct genl_info *info); + +int drbd_adm_new_minor(struct sk_buff *skb, struct genl_info *info); +int drbd_adm_del_minor(struct sk_buff *skb, struct genl_info *info); + +int drbd_adm_new_resource(struct sk_buff *skb, struct genl_info *info); +int drbd_adm_del_resource(struct sk_buff *skb, struct genl_info *info); +int drbd_adm_down(struct sk_buff *skb, struct genl_info *info); + +int drbd_adm_set_role(struct sk_buff *skb, struct genl_info *info); +int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info); +int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info); +int drbd_adm_detach(struct sk_buff *skb, struct genl_info *info); +int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info); +int drbd_adm_net_opts(struct sk_buff *skb, struct genl_info *info); +int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info); +int drbd_adm_start_ov(struct sk_buff *skb, struct genl_info *info); +int drbd_adm_new_c_uuid(struct sk_buff *skb, struct genl_info *info); +int drbd_adm_disconnect(struct sk_buff *skb, struct genl_info *info); +int drbd_adm_invalidate(struct sk_buff *skb, struct genl_info *info); +int drbd_adm_invalidate_peer(struct sk_buff *skb, struct genl_info *info); +int drbd_adm_pause_sync(struct sk_buff *skb, struct genl_info *info); +int drbd_adm_resume_sync(struct sk_buff *skb, struct genl_info *info); +int drbd_adm_suspend_io(struct sk_buff *skb, struct genl_info *info); +int drbd_adm_resume_io(struct sk_buff *skb, struct genl_info *info); +int drbd_adm_outdate(struct sk_buff *skb, struct genl_info *info); +int drbd_adm_resource_opts(struct sk_buff *skb, struct genl_info *info); +int drbd_adm_get_status(struct sk_buff *skb, struct genl_info *info); +int drbd_adm_get_timeout_type(struct sk_buff *skb, struct genl_info *info); +/* .dumpit */ +int drbd_adm_get_status_all(struct sk_buff *skb, struct netlink_callback *cb); + +#include <linux/drbd_genl_api.h> +#include "drbd_nla.h" +#include <linux/genl_magic_func.h> + +/* used blkdev_get_by_path, to claim our meta data device(s) */ +static char *drbd_m_holder = 
"Hands off! this is DRBD's meta data device."; + +static void drbd_adm_send_reply(struct sk_buff *skb, struct genl_info *info) +{ + genlmsg_end(skb, genlmsg_data(nlmsg_data(nlmsg_hdr(skb)))); + if (genlmsg_reply(skb, info)) + pr_err("error sending genl reply\n"); +} + +/* Used on a fresh "drbd_adm_prepare"d reply_skb, this cannot fail: The only + * reason it could fail was no space in skb, and there are 4k available. */ +static int drbd_msg_put_info(struct sk_buff *skb, const char *info) +{ + struct nlattr *nla; + int err = -EMSGSIZE; + + if (!info || !info[0]) + return 0; + + nla = nla_nest_start(skb, DRBD_NLA_CFG_REPLY); + if (!nla) + return err; + + err = nla_put_string(skb, T_info_text, info); + if (err) { + nla_nest_cancel(skb, nla); + return err; + } else + nla_nest_end(skb, nla); + return 0; +} + +/* This would be a good candidate for a "pre_doit" hook, + * and per-family private info->pointers. + * But we need to stay compatible with older kernels. + * If it returns successfully, adm_ctx members are valid. + * + * At this point, we still rely on the global genl_lock(). + * If we want to avoid that, and allow "genl_family.parallel_ops", we may need + * to add additional synchronization against object destruction/modification. + */ +#define DRBD_ADM_NEED_MINOR 1 +#define DRBD_ADM_NEED_RESOURCE 2 +#define DRBD_ADM_NEED_CONNECTION 4 +static int drbd_adm_prepare(struct drbd_config_context *adm_ctx, + struct sk_buff *skb, struct genl_info *info, unsigned flags) +{ + struct drbd_genlmsghdr *d_in = info->userhdr; + const u8 cmd = info->genlhdr->cmd; + int err; + + memset(adm_ctx, 0, sizeof(*adm_ctx)); + + /* genl_rcv_msg only checks for CAP_NET_ADMIN on "GENL_ADMIN_PERM" :( */ + if (cmd != DRBD_ADM_GET_STATUS && !capable(CAP_NET_ADMIN)) + return -EPERM; + + adm_ctx->reply_skb = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + if (!adm_ctx->reply_skb) { + err = -ENOMEM; + goto fail; + } + + adm_ctx->reply_dh = genlmsg_put_reply(adm_ctx->reply_skb, + info, &drbd_genl_family, 0, cmd); + /* put of a few bytes into a fresh skb of >= 4k will always succeed. + * but anyways */ + if (!adm_ctx->reply_dh) { + err = -ENOMEM; + goto fail; + } + + adm_ctx->reply_dh->minor = d_in->minor; + adm_ctx->reply_dh->ret_code = NO_ERROR; + + adm_ctx->volume = VOLUME_UNSPECIFIED; + if (info->attrs[DRBD_NLA_CFG_CONTEXT]) { + struct nlattr *nla; + /* parse and validate only */ + err = drbd_cfg_context_from_attrs(NULL, info); + if (err) + goto fail; + + /* It was present, and valid, + * copy it over to the reply skb. */ + err = nla_put_nohdr(adm_ctx->reply_skb, + info->attrs[DRBD_NLA_CFG_CONTEXT]->nla_len, + info->attrs[DRBD_NLA_CFG_CONTEXT]); + if (err) + goto fail; + + /* and assign stuff to the adm_ctx */ + nla = nested_attr_tb[__nla_type(T_ctx_volume)]; + if (nla) + adm_ctx->volume = nla_get_u32(nla); + nla = nested_attr_tb[__nla_type(T_ctx_resource_name)]; + if (nla) + adm_ctx->resource_name = nla_data(nla); + adm_ctx->my_addr = nested_attr_tb[__nla_type(T_ctx_my_addr)]; + adm_ctx->peer_addr = nested_attr_tb[__nla_type(T_ctx_peer_addr)]; + if ((adm_ctx->my_addr && + nla_len(adm_ctx->my_addr) > sizeof(adm_ctx->connection->my_addr)) || + (adm_ctx->peer_addr && + nla_len(adm_ctx->peer_addr) > sizeof(adm_ctx->connection->peer_addr))) { + err = -EINVAL; + goto fail; + } + } + + adm_ctx->minor = d_in->minor; + adm_ctx->device = minor_to_device(d_in->minor); + + /* We are protected by the global genl_lock(). + * But we may explicitly drop it/retake it in drbd_adm_set_role(), + * so make sure this object stays around. 
*/ + if (adm_ctx->device) + kref_get(&adm_ctx->device->kref); + + if (adm_ctx->resource_name) { + adm_ctx->resource = drbd_find_resource(adm_ctx->resource_name); + } + + if (!adm_ctx->device && (flags & DRBD_ADM_NEED_MINOR)) { + drbd_msg_put_info(adm_ctx->reply_skb, "unknown minor"); + return ERR_MINOR_INVALID; + } + if (!adm_ctx->resource && (flags & DRBD_ADM_NEED_RESOURCE)) { + drbd_msg_put_info(adm_ctx->reply_skb, "unknown resource"); + if (adm_ctx->resource_name) + return ERR_RES_NOT_KNOWN; + return ERR_INVALID_REQUEST; + } + + if (flags & DRBD_ADM_NEED_CONNECTION) { + if (adm_ctx->resource) { + drbd_msg_put_info(adm_ctx->reply_skb, "no resource name expected"); + return ERR_INVALID_REQUEST; + } + if (adm_ctx->device) { + drbd_msg_put_info(adm_ctx->reply_skb, "no minor number expected"); + return ERR_INVALID_REQUEST; + } + if (adm_ctx->my_addr && adm_ctx->peer_addr) + adm_ctx->connection = conn_get_by_addrs(nla_data(adm_ctx->my_addr), + nla_len(adm_ctx->my_addr), + nla_data(adm_ctx->peer_addr), + nla_len(adm_ctx->peer_addr)); + if (!adm_ctx->connection) { + drbd_msg_put_info(adm_ctx->reply_skb, "unknown connection"); + return ERR_INVALID_REQUEST; + } + } + + /* some more paranoia, if the request was over-determined */ + if (adm_ctx->device && adm_ctx->resource && + adm_ctx->device->resource != adm_ctx->resource) { + pr_warning("request: minor=%u, resource=%s; but that minor belongs to resource %s\n", + adm_ctx->minor, adm_ctx->resource->name, + adm_ctx->device->resource->name); + drbd_msg_put_info(adm_ctx->reply_skb, "minor exists in different resource"); + return ERR_INVALID_REQUEST; + } + if (adm_ctx->device && + adm_ctx->volume != VOLUME_UNSPECIFIED && + adm_ctx->volume != adm_ctx->device->vnr) { + pr_warning("request: minor=%u, volume=%u; but that minor is volume %u in %s\n", + adm_ctx->minor, adm_ctx->volume, + adm_ctx->device->vnr, + adm_ctx->device->resource->name); + drbd_msg_put_info(adm_ctx->reply_skb, "minor exists as different volume"); + return ERR_INVALID_REQUEST; + } + + /* still, provide adm_ctx->resource always, if possible. */ + if (!adm_ctx->resource) { + adm_ctx->resource = adm_ctx->device ? adm_ctx->device->resource + : adm_ctx->connection ? adm_ctx->connection->resource : NULL; + if (adm_ctx->resource) + kref_get(&adm_ctx->resource->kref); + } + + return NO_ERROR; + +fail: + nlmsg_free(adm_ctx->reply_skb); + adm_ctx->reply_skb = NULL; + return err; +} + +static int drbd_adm_finish(struct drbd_config_context *adm_ctx, + struct genl_info *info, int retcode) +{ + if (adm_ctx->device) { + kref_put(&adm_ctx->device->kref, drbd_destroy_device); + adm_ctx->device = NULL; + } + if (adm_ctx->connection) { + kref_put(&adm_ctx->connection->kref, &drbd_destroy_connection); + adm_ctx->connection = NULL; + } + if (adm_ctx->resource) { + kref_put(&adm_ctx->resource->kref, drbd_destroy_resource); + adm_ctx->resource = NULL; + } + + if (!adm_ctx->reply_skb) + return -ENOMEM; + + adm_ctx->reply_dh->ret_code = retcode; + drbd_adm_send_reply(adm_ctx->reply_skb, info); + return 0; +} + +static void setup_khelper_env(struct drbd_connection *connection, char **envp) +{ + char *afs; + + /* FIXME: A future version will not allow this case. 
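+	 * Without both addresses we cannot provide DRBD_PEER_AF and
+	 * DRBD_PEER_ADDRESS to the helper, so those environment slots stay empty.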
*/ + if (connection->my_addr_len == 0 || connection->peer_addr_len == 0) + return; + + switch (((struct sockaddr *)&connection->peer_addr)->sa_family) { + case AF_INET6: + afs = "ipv6"; + snprintf(envp[4], 60, "DRBD_PEER_ADDRESS=%pI6", + &((struct sockaddr_in6 *)&connection->peer_addr)->sin6_addr); + break; + case AF_INET: + afs = "ipv4"; + snprintf(envp[4], 60, "DRBD_PEER_ADDRESS=%pI4", + &((struct sockaddr_in *)&connection->peer_addr)->sin_addr); + break; + default: + afs = "ssocks"; + snprintf(envp[4], 60, "DRBD_PEER_ADDRESS=%pI4", + &((struct sockaddr_in *)&connection->peer_addr)->sin_addr); + } + snprintf(envp[3], 20, "DRBD_PEER_AF=%s", afs); +} + +int drbd_khelper(struct drbd_device *device, char *cmd) +{ + char *envp[] = { "HOME=/", + "TERM=linux", + "PATH=/sbin:/usr/sbin:/bin:/usr/bin", + (char[20]) { }, /* address family */ + (char[60]) { }, /* address */ + NULL }; + char mb[12]; + char *argv[] = {usermode_helper, cmd, mb, NULL }; + struct drbd_connection *connection = first_peer_device(device)->connection; + struct sib_info sib; + int ret; + + if (current == connection->worker.task) + set_bit(CALLBACK_PENDING, &connection->flags); + + snprintf(mb, 12, "minor-%d", device_to_minor(device)); + setup_khelper_env(connection, envp); + + /* The helper may take some time. + * write out any unsynced meta data changes now */ + drbd_md_sync(device); + + drbd_info(device, "helper command: %s %s %s\n", usermode_helper, cmd, mb); + sib.sib_reason = SIB_HELPER_PRE; + sib.helper_name = cmd; + drbd_bcast_event(device, &sib); + ret = call_usermodehelper(usermode_helper, argv, envp, UMH_WAIT_PROC); + if (ret) + drbd_warn(device, "helper command: %s %s %s exit code %u (0x%x)\n", + usermode_helper, cmd, mb, + (ret >> 8) & 0xff, ret); + else + drbd_info(device, "helper command: %s %s %s exit code %u (0x%x)\n", + usermode_helper, cmd, mb, + (ret >> 8) & 0xff, ret); + sib.sib_reason = SIB_HELPER_POST; + sib.helper_exit_code = ret; + drbd_bcast_event(device, &sib); + + if (current == connection->worker.task) + clear_bit(CALLBACK_PENDING, &connection->flags); + + if (ret < 0) /* Ignore any ERRNOs we got. */ + ret = 0; + + return ret; +} + +static int conn_khelper(struct drbd_connection *connection, char *cmd) +{ + char *envp[] = { "HOME=/", + "TERM=linux", + "PATH=/sbin:/usr/sbin:/bin:/usr/bin", + (char[20]) { }, /* address family */ + (char[60]) { }, /* address */ + NULL }; + char *resource_name = connection->resource->name; + char *argv[] = {usermode_helper, cmd, resource_name, NULL }; + int ret; + + setup_khelper_env(connection, envp); + conn_md_sync(connection); + + drbd_info(connection, "helper command: %s %s %s\n", usermode_helper, cmd, resource_name); + /* TODO: conn_bcast_event() ?? */ + + ret = call_usermodehelper(usermode_helper, argv, envp, UMH_WAIT_PROC); + if (ret) + drbd_warn(connection, "helper command: %s %s %s exit code %u (0x%x)\n", + usermode_helper, cmd, resource_name, + (ret >> 8) & 0xff, ret); + else + drbd_info(connection, "helper command: %s %s %s exit code %u (0x%x)\n", + usermode_helper, cmd, resource_name, + (ret >> 8) & 0xff, ret); + /* TODO: conn_bcast_event() ?? */ + + if (ret < 0) /* Ignore any ERRNOs we got. 
*/ + ret = 0; + + return ret; +} + +static enum drbd_fencing_p highest_fencing_policy(struct drbd_connection *connection) +{ + enum drbd_fencing_p fp = FP_NOT_AVAIL; + struct drbd_peer_device *peer_device; + int vnr; + + rcu_read_lock(); + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { + struct drbd_device *device = peer_device->device; + if (get_ldev_if_state(device, D_CONSISTENT)) { + struct disk_conf *disk_conf = + rcu_dereference(peer_device->device->ldev->disk_conf); + fp = max_t(enum drbd_fencing_p, fp, disk_conf->fencing); + put_ldev(device); + } + } + rcu_read_unlock(); + + if (fp == FP_NOT_AVAIL) { + /* IO Suspending works on the whole resource. + Do it only for one device. */ + vnr = 0; + peer_device = idr_get_next(&connection->peer_devices, &vnr); + drbd_change_state(peer_device->device, CS_VERBOSE | CS_HARD, NS(susp_fen, 0)); + } + + return fp; +} + +bool conn_try_outdate_peer(struct drbd_connection *connection) +{ + unsigned int connect_cnt; + union drbd_state mask = { }; + union drbd_state val = { }; + enum drbd_fencing_p fp; + char *ex_to_string; + int r; + + spin_lock_irq(&connection->resource->req_lock); + if (connection->cstate >= C_WF_REPORT_PARAMS) { + drbd_err(connection, "Expected cstate < C_WF_REPORT_PARAMS\n"); + spin_unlock_irq(&connection->resource->req_lock); + return false; + } + + connect_cnt = connection->connect_cnt; + spin_unlock_irq(&connection->resource->req_lock); + + fp = highest_fencing_policy(connection); + switch (fp) { + case FP_NOT_AVAIL: + drbd_warn(connection, "Not fencing peer, I'm not even Consistent myself.\n"); + goto out; + case FP_DONT_CARE: + return true; + default: ; + } + + r = conn_khelper(connection, "fence-peer"); + + switch ((r>>8) & 0xff) { + case 3: /* peer is inconsistent */ + ex_to_string = "peer is inconsistent or worse"; + mask.pdsk = D_MASK; + val.pdsk = D_INCONSISTENT; + break; + case 4: /* peer got outdated, or was already outdated */ + ex_to_string = "peer was fenced"; + mask.pdsk = D_MASK; + val.pdsk = D_OUTDATED; + break; + case 5: /* peer was down */ + if (conn_highest_disk(connection) == D_UP_TO_DATE) { + /* we will(have) create(d) a new UUID anyways... */ + ex_to_string = "peer is unreachable, assumed to be dead"; + mask.pdsk = D_MASK; + val.pdsk = D_OUTDATED; + } else { + ex_to_string = "peer unreachable, doing nothing since disk != UpToDate"; + } + break; + case 6: /* Peer is primary, voluntarily outdate myself. + * This is useful when an unconnected R_SECONDARY is asked to + * become R_PRIMARY, but finds the other peer being active. */ + ex_to_string = "peer is active"; + drbd_warn(connection, "Peer is primary, outdating myself.\n"); + mask.disk = D_MASK; + val.disk = D_OUTDATED; + break; + case 7: + if (fp != FP_STONITH) + drbd_err(connection, "fence-peer() = 7 && fencing != Stonith !!!\n"); + ex_to_string = "peer was stonithed"; + mask.pdsk = D_MASK; + val.pdsk = D_OUTDATED; + break; + default: + /* The script is broken ... */ + drbd_err(connection, "fence-peer helper broken, returned %d\n", (r>>8)&0xff); + return false; /* Eventually leave IO frozen */ + } + + drbd_info(connection, "fence-peer helper returned %d (%s)\n", + (r>>8) & 0xff, ex_to_string); + + out: + + /* Not using + conn_request_state(connection, mask, val, CS_VERBOSE); + here, because we might were able to re-establish the connection in the + meantime. 
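+	   In that case the helper verdict would refer to a stale incarnation of
+	   the connection; connect_cnt is compared below to detect and ignore it.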
*/ + spin_lock_irq(&connection->resource->req_lock); + if (connection->cstate < C_WF_REPORT_PARAMS && !test_bit(STATE_SENT, &connection->flags)) { + if (connection->connect_cnt != connect_cnt) + /* In case the connection was established and droped + while the fence-peer handler was running, ignore it */ + drbd_info(connection, "Ignoring fence-peer exit code\n"); + else + _conn_request_state(connection, mask, val, CS_VERBOSE); + } + spin_unlock_irq(&connection->resource->req_lock); + + return conn_highest_pdsk(connection) <= D_OUTDATED; +} + +static int _try_outdate_peer_async(void *data) +{ + struct drbd_connection *connection = (struct drbd_connection *)data; + + conn_try_outdate_peer(connection); + + kref_put(&connection->kref, drbd_destroy_connection); + return 0; +} + +void conn_try_outdate_peer_async(struct drbd_connection *connection) +{ + struct task_struct *opa; + + kref_get(&connection->kref); + /* We may just have force_sig()'ed this thread + * to get it out of some blocking network function. + * Clear signals; otherwise kthread_run(), which internally uses + * wait_on_completion_killable(), will mistake our pending signal + * for a new fatal signal and fail. */ + flush_signals(current); + opa = kthread_run(_try_outdate_peer_async, connection, "drbd_async_h"); + if (IS_ERR(opa)) { + drbd_err(connection, "out of mem, failed to invoke fence-peer helper\n"); + kref_put(&connection->kref, drbd_destroy_connection); + } +} + +enum drbd_state_rv +drbd_set_role(struct drbd_device *const device, enum drbd_role new_role, int force) +{ + struct drbd_peer_device *const peer_device = first_peer_device(device); + struct drbd_connection *const connection = peer_device ? peer_device->connection : NULL; + const int max_tries = 4; + enum drbd_state_rv rv = SS_UNKNOWN_ERROR; + struct net_conf *nc; + int try = 0; + int forced = 0; + union drbd_state mask, val; + + if (new_role == R_PRIMARY) { + struct drbd_connection *connection; + + /* Detect dead peers as soon as possible. */ + + rcu_read_lock(); + for_each_connection(connection, device->resource) + request_ping(connection); + rcu_read_unlock(); + } + + mutex_lock(device->state_mutex); + + mask.i = 0; mask.role = R_MASK; + val.i = 0; val.role = new_role; + + while (try++ < max_tries) { + rv = _drbd_request_state_holding_state_mutex(device, mask, val, CS_WAIT_COMPLETE); + + /* in case we first succeeded to outdate, + * but now suddenly could establish a connection */ + if (rv == SS_CW_FAILED_BY_PEER && mask.pdsk != 0) { + val.pdsk = 0; + mask.pdsk = 0; + continue; + } + + if (rv == SS_NO_UP_TO_DATE_DISK && force && + (device->state.disk < D_UP_TO_DATE && + device->state.disk >= D_INCONSISTENT)) { + mask.disk = D_MASK; + val.disk = D_UP_TO_DATE; + forced = 1; + continue; + } + + if (rv == SS_NO_UP_TO_DATE_DISK && + device->state.disk == D_CONSISTENT && mask.pdsk == 0) { + D_ASSERT(device, device->state.pdsk == D_UNKNOWN); + + if (conn_try_outdate_peer(connection)) { + val.disk = D_UP_TO_DATE; + mask.disk = D_MASK; + } + continue; + } + + if (rv == SS_NOTHING_TO_DO) + goto out; + if (rv == SS_PRIMARY_NOP && mask.pdsk == 0) { + if (!conn_try_outdate_peer(connection) && force) { + drbd_warn(device, "Forced into split brain situation!\n"); + mask.pdsk = D_MASK; + val.pdsk = D_OUTDATED; + + } + continue; + } + if (rv == SS_TWO_PRIMARIES) { + /* Maybe the peer is detected as dead very soon... + retry at most once more in this case. */ + int timeo; + rcu_read_lock(); + nc = rcu_dereference(connection->net_conf); + timeo = nc ? 
(nc->ping_timeo + 1) * HZ / 10 : 1; + rcu_read_unlock(); + schedule_timeout_interruptible(timeo); + if (try < max_tries) + try = max_tries - 1; + continue; + } + if (rv < SS_SUCCESS) { + rv = _drbd_request_state(device, mask, val, + CS_VERBOSE + CS_WAIT_COMPLETE); + if (rv < SS_SUCCESS) + goto out; + } + break; + } + + if (rv < SS_SUCCESS) + goto out; + + if (forced) + drbd_warn(device, "Forced to consider local data as UpToDate!\n"); + + /* Wait until nothing is on the fly :) */ + wait_event(device->misc_wait, atomic_read(&device->ap_pending_cnt) == 0); + + /* FIXME also wait for all pending P_BARRIER_ACK? */ + + if (new_role == R_SECONDARY) { + if (get_ldev(device)) { + device->ldev->md.uuid[UI_CURRENT] &= ~(u64)1; + put_ldev(device); + } + } else { + mutex_lock(&device->resource->conf_update); + nc = connection->net_conf; + if (nc) + nc->discard_my_data = 0; /* without copy; single bit op is atomic */ + mutex_unlock(&device->resource->conf_update); + + if (get_ldev(device)) { + if (((device->state.conn < C_CONNECTED || + device->state.pdsk <= D_FAILED) + && device->ldev->md.uuid[UI_BITMAP] == 0) || forced) + drbd_uuid_new_current(device); + + device->ldev->md.uuid[UI_CURRENT] |= (u64)1; + put_ldev(device); + } + } + + /* writeout of activity log covered areas of the bitmap + * to stable storage done in after state change already */ + + if (device->state.conn >= C_WF_REPORT_PARAMS) { + /* if this was forced, we should consider sync */ + if (forced) + drbd_send_uuids(peer_device); + drbd_send_current_state(peer_device); + } + + drbd_md_sync(device); + set_disk_ro(device->vdisk, new_role == R_SECONDARY); + kobject_uevent(&disk_to_dev(device->vdisk)->kobj, KOBJ_CHANGE); +out: + mutex_unlock(device->state_mutex); + return rv; +} + +static const char *from_attrs_err_to_txt(int err) +{ + return err == -ENOMSG ? "required attribute missing" : + err == -EOPNOTSUPP ? "unknown mandatory attribute" : + err == -EEXIST ? "can not change invariant setting" : + "invalid attribute value"; +} + +int drbd_adm_set_role(struct sk_buff *skb, struct genl_info *info) +{ + struct drbd_config_context adm_ctx; + struct set_role_parms parms; + int err; + enum drbd_ret_code retcode; + + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); + if (!adm_ctx.reply_skb) + return retcode; + if (retcode != NO_ERROR) + goto out; + + memset(&parms, 0, sizeof(parms)); + if (info->attrs[DRBD_NLA_SET_ROLE_PARMS]) { + err = set_role_parms_from_attrs(&parms, info); + if (err) { + retcode = ERR_MANDATORY_TAG; + drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err)); + goto out; + } + } + genl_unlock(); + mutex_lock(&adm_ctx.resource->adm_mutex); + + if (info->genlhdr->cmd == DRBD_ADM_PRIMARY) + retcode = drbd_set_role(adm_ctx.device, R_PRIMARY, parms.assume_uptodate); + else + retcode = drbd_set_role(adm_ctx.device, R_SECONDARY, 0); + + mutex_unlock(&adm_ctx.resource->adm_mutex); + genl_lock(); +out: + drbd_adm_finish(&adm_ctx, info, retcode); + return 0; +} + +/* Initializes the md.*_offset members, so we are able to find + * the on disk meta data. 
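+ * md_offset is the absolute sector of the 4k superblock (drbd_md_ss()),
+ * while al_offset and bm_offset are sector offsets relative to that
+ * superblock.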
+ * + * We currently have two possible layouts: + * external: + * |----------- md_size_sect ------------------| + * [ 4k superblock ][ activity log ][ Bitmap ] + * | al_offset == 8 | + * | bm_offset = al_offset + X | + * ==> bitmap sectors = md_size_sect - bm_offset + * + * internal: + * |----------- md_size_sect ------------------| + * [data.....][ Bitmap ][ activity log ][ 4k superblock ] + * | al_offset < 0 | + * | bm_offset = al_offset - Y | + * ==> bitmap sectors = Y = al_offset - bm_offset + * + * Activity log size used to be fixed 32kB, + * but is about to become configurable. + */ +static void drbd_md_set_sector_offsets(struct drbd_device *device, + struct drbd_backing_dev *bdev) +{ + sector_t md_size_sect = 0; + unsigned int al_size_sect = bdev->md.al_size_4k * 8; + + bdev->md.md_offset = drbd_md_ss(bdev); + + switch (bdev->md.meta_dev_idx) { + default: + /* v07 style fixed size indexed meta data */ + bdev->md.md_size_sect = MD_128MB_SECT; + bdev->md.al_offset = MD_4kB_SECT; + bdev->md.bm_offset = MD_4kB_SECT + al_size_sect; + break; + case DRBD_MD_INDEX_FLEX_EXT: + /* just occupy the full device; unit: sectors */ + bdev->md.md_size_sect = drbd_get_capacity(bdev->md_bdev); + bdev->md.al_offset = MD_4kB_SECT; + bdev->md.bm_offset = MD_4kB_SECT + al_size_sect; + break; + case DRBD_MD_INDEX_INTERNAL: + case DRBD_MD_INDEX_FLEX_INT: + /* al size is still fixed */ + bdev->md.al_offset = -al_size_sect; + /* we need (slightly less than) ~ this much bitmap sectors: */ + md_size_sect = drbd_get_capacity(bdev->backing_bdev); + md_size_sect = ALIGN(md_size_sect, BM_SECT_PER_EXT); + md_size_sect = BM_SECT_TO_EXT(md_size_sect); + md_size_sect = ALIGN(md_size_sect, 8); + + /* plus the "drbd meta data super block", + * and the activity log; */ + md_size_sect += MD_4kB_SECT + al_size_sect; + + bdev->md.md_size_sect = md_size_sect; + /* bitmap offset is adjusted by 'super' block size */ + bdev->md.bm_offset = -md_size_sect + MD_4kB_SECT; + break; + } +} + +/* input size is expected to be in KB */ +char *ppsize(char *buf, unsigned long long size) +{ + /* Needs 9 bytes at max including trailing NUL: + * -1ULL ==> "16384 EB" */ + static char units[] = { 'K', 'M', 'G', 'T', 'P', 'E' }; + int base = 0; + while (size >= 10000 && base < sizeof(units)-1) { + /* shift + round */ + size = (size >> 10) + !!(size & (1<<9)); + base++; + } + sprintf(buf, "%u %cB", (unsigned)size, units[base]); + + return buf; +} + +/* there is still a theoretical deadlock when called from receiver + * on an D_INCONSISTENT R_PRIMARY: + * remote READ does inc_ap_bio, receiver would need to receive answer + * packet from remote to dec_ap_bio again. + * receiver receive_sizes(), comes here, + * waits for ap_bio_cnt == 0. -> deadlock. + * but this cannot happen, actually, because: + * R_PRIMARY D_INCONSISTENT, and peer's disk is unreachable + * (not connected, or bad/no disk on peer): + * see drbd_fail_request_early, ap_bio_cnt is zero. + * R_PRIMARY D_INCONSISTENT, and C_SYNC_TARGET: + * peer may not initiate a resize. + */ +/* Note these are not to be confused with + * drbd_adm_suspend_io/drbd_adm_resume_io, + * which are (sub) state changes triggered by admin (drbdsetup), + * and can be long lived. + * This changes an device->flag, is triggered by drbd internals, + * and should be short-lived. 
*/ +void drbd_suspend_io(struct drbd_device *device) +{ + set_bit(SUSPEND_IO, &device->flags); + if (drbd_suspended(device)) + return; + wait_event(device->misc_wait, !atomic_read(&device->ap_bio_cnt)); +} + +void drbd_resume_io(struct drbd_device *device) +{ + clear_bit(SUSPEND_IO, &device->flags); + wake_up(&device->misc_wait); +} + +/** + * drbd_determine_dev_size() - Sets the right device size obeying all constraints + * @device: DRBD device. + * + * Returns 0 on success, negative return values indicate errors. + * You should call drbd_md_sync() after calling this function. + */ +enum determine_dev_size +drbd_determine_dev_size(struct drbd_device *device, enum dds_flags flags, struct resize_parms *rs) __must_hold(local) +{ + sector_t prev_first_sect, prev_size; /* previous meta location */ + sector_t la_size_sect, u_size; + struct drbd_md *md = &device->ldev->md; + u32 prev_al_stripe_size_4k; + u32 prev_al_stripes; + sector_t size; + char ppb[10]; + void *buffer; + + int md_moved, la_size_changed; + enum determine_dev_size rv = DS_UNCHANGED; + + /* race: + * application request passes inc_ap_bio, + * but then cannot get an AL-reference. + * this function later may wait on ap_bio_cnt == 0. -> deadlock. + * + * to avoid that: + * Suspend IO right here. + * still lock the act_log to not trigger ASSERTs there. + */ + drbd_suspend_io(device); + buffer = drbd_md_get_buffer(device, __func__); /* Lock meta-data IO */ + if (!buffer) { + drbd_resume_io(device); + return DS_ERROR; + } + + /* no wait necessary anymore, actually we could assert that */ + wait_event(device->al_wait, lc_try_lock(device->act_log)); + + prev_first_sect = drbd_md_first_sector(device->ldev); + prev_size = device->ldev->md.md_size_sect; + la_size_sect = device->ldev->md.la_size_sect; + + if (rs) { + /* rs is non NULL if we should change the AL layout only */ + + prev_al_stripes = md->al_stripes; + prev_al_stripe_size_4k = md->al_stripe_size_4k; + + md->al_stripes = rs->al_stripes; + md->al_stripe_size_4k = rs->al_stripe_size / 4; + md->al_size_4k = (u64)rs->al_stripes * rs->al_stripe_size / 4; + } + + drbd_md_set_sector_offsets(device, device->ldev); + + rcu_read_lock(); + u_size = rcu_dereference(device->ldev->disk_conf)->disk_size; + rcu_read_unlock(); + size = drbd_new_dev_size(device, device->ldev, u_size, flags & DDSF_FORCED); + + if (size < la_size_sect) { + if (rs && u_size == 0) { + /* Remove "rs &&" later. This check should always be active, but + right now the receiver expects the permissive behavior */ + drbd_warn(device, "Implicit shrink not allowed. " + "Use --size=%llus for explicit shrink.\n", + (unsigned long long)size); + rv = DS_ERROR_SHRINK; + } + if (u_size > size) + rv = DS_ERROR_SPACE_MD; + if (rv != DS_UNCHANGED) + goto err_out; + } + + if (drbd_get_capacity(device->this_bdev) != size || + drbd_bm_capacity(device) != size) { + int err; + err = drbd_bm_resize(device, size, !(flags & DDSF_NO_RESYNC)); + if (unlikely(err)) { + /* currently there is only one error: ENOMEM! */ + size = drbd_bm_capacity(device)>>1; + if (size == 0) { + drbd_err(device, "OUT OF MEMORY! " + "Could not allocate bitmap!\n"); + } else { + drbd_err(device, "BM resizing failed. " + "Leaving size unchanged at size = %lu KB\n", + (unsigned long)size); + } + rv = DS_ERROR; + } + /* racy, see comments above. 
*/ + drbd_set_my_capacity(device, size); + device->ldev->md.la_size_sect = size; + drbd_info(device, "size = %s (%llu KB)\n", ppsize(ppb, size>>1), + (unsigned long long)size>>1); + } + if (rv <= DS_ERROR) + goto err_out; + + la_size_changed = (la_size_sect != device->ldev->md.la_size_sect); + + md_moved = prev_first_sect != drbd_md_first_sector(device->ldev) + || prev_size != device->ldev->md.md_size_sect; + + if (la_size_changed || md_moved || rs) { + u32 prev_flags; + + /* We do some synchronous IO below, which may take some time. + * Clear the timer, to avoid scary "timer expired!" messages, + * "Superblock" is written out at least twice below, anyways. */ + del_timer(&device->md_sync_timer); + drbd_al_shrink(device); /* All extents inactive. */ + + prev_flags = md->flags; + md->flags &= ~MDF_PRIMARY_IND; + drbd_md_write(device, buffer); + + drbd_info(device, "Writing the whole bitmap, %s\n", + la_size_changed && md_moved ? "size changed and md moved" : + la_size_changed ? "size changed" : "md moved"); + /* next line implicitly does drbd_suspend_io()+drbd_resume_io() */ + drbd_bitmap_io(device, md_moved ? &drbd_bm_write_all : &drbd_bm_write, + "size changed", BM_LOCKED_MASK); + drbd_initialize_al(device, buffer); + + md->flags = prev_flags; + drbd_md_write(device, buffer); + + if (rs) + drbd_info(device, "Changed AL layout to al-stripes = %d, al-stripe-size-kB = %d\n", + md->al_stripes, md->al_stripe_size_4k * 4); + } + + if (size > la_size_sect) + rv = la_size_sect ? DS_GREW : DS_GREW_FROM_ZERO; + if (size < la_size_sect) + rv = DS_SHRUNK; + + if (0) { + err_out: + if (rs) { + md->al_stripes = prev_al_stripes; + md->al_stripe_size_4k = prev_al_stripe_size_4k; + md->al_size_4k = (u64)prev_al_stripes * prev_al_stripe_size_4k; + + drbd_md_set_sector_offsets(device, device->ldev); + } + } + lc_unlock(device->act_log); + wake_up(&device->al_wait); + drbd_md_put_buffer(device); + drbd_resume_io(device); + + return rv; +} + +sector_t +drbd_new_dev_size(struct drbd_device *device, struct drbd_backing_dev *bdev, + sector_t u_size, int assume_peer_has_space) +{ + sector_t p_size = device->p_size; /* partner's disk size. */ + sector_t la_size_sect = bdev->md.la_size_sect; /* last agreed size. */ + sector_t m_size; /* my size */ + sector_t size = 0; + + m_size = drbd_get_max_capacity(bdev); + + if (device->state.conn < C_CONNECTED && assume_peer_has_space) { + drbd_warn(device, "Resize while not connected was forced by the user!\n"); + p_size = m_size; + } + + if (p_size && m_size) { + size = min_t(sector_t, p_size, m_size); + } else { + if (la_size_sect) { + size = la_size_sect; + if (m_size && m_size < size) + size = m_size; + if (p_size && p_size < size) + size = p_size; + } else { + if (m_size) + size = m_size; + if (p_size) + size = p_size; + } + } + + if (size == 0) + drbd_err(device, "Both nodes diskless!\n"); + + if (u_size) { + if (u_size > size) + drbd_err(device, "Requested disk size is too big (%lu > %lu)\n", + (unsigned long)u_size>>1, (unsigned long)size>>1); + else + size = u_size; + } + + return size; +} + +/** + * drbd_check_al_size() - Ensures that the AL is of the right size + * @device: DRBD device. + * + * Returns -EBUSY if current al lru is still used, -ENOMEM when allocation + * failed, and 0 on success. You should call drbd_md_sync() after you called + * this function. 
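+ *
+ * For reference, drbd_adm_disk_opts() below changes the AL size with
+ * roughly this pattern:
+ *
+ *	drbd_suspend_io(device);
+ *	wait_event(device->al_wait, lc_try_lock(device->act_log));
+ *	drbd_al_shrink(device);
+ *	err = drbd_check_al_size(device, new_disk_conf);
+ *	lc_unlock(device->act_log);
+ *	wake_up(&device->al_wait);
+ *	drbd_resume_io(device);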
+ */ +static int drbd_check_al_size(struct drbd_device *device, struct disk_conf *dc) +{ + struct lru_cache *n, *t; + struct lc_element *e; + unsigned int in_use; + int i; + + if (device->act_log && + device->act_log->nr_elements == dc->al_extents) + return 0; + + in_use = 0; + t = device->act_log; + n = lc_create("act_log", drbd_al_ext_cache, AL_UPDATES_PER_TRANSACTION, + dc->al_extents, sizeof(struct lc_element), 0); + + if (n == NULL) { + drbd_err(device, "Cannot allocate act_log lru!\n"); + return -ENOMEM; + } + spin_lock_irq(&device->al_lock); + if (t) { + for (i = 0; i < t->nr_elements; i++) { + e = lc_element_by_index(t, i); + if (e->refcnt) + drbd_err(device, "refcnt(%d)==%d\n", + e->lc_number, e->refcnt); + in_use += e->refcnt; + } + } + if (!in_use) + device->act_log = n; + spin_unlock_irq(&device->al_lock); + if (in_use) { + drbd_err(device, "Activity log still in use!\n"); + lc_destroy(n); + return -EBUSY; + } else { + if (t) + lc_destroy(t); + } + drbd_md_mark_dirty(device); /* we changed device->act_log->nr_elemens */ + return 0; +} + +static void drbd_setup_queue_param(struct drbd_device *device, struct drbd_backing_dev *bdev, + unsigned int max_bio_size) +{ + struct request_queue * const q = device->rq_queue; + unsigned int max_hw_sectors = max_bio_size >> 9; + unsigned int max_segments = 0; + struct request_queue *b = NULL; + + if (bdev) { + b = bdev->backing_bdev->bd_disk->queue; + + max_hw_sectors = min(queue_max_hw_sectors(b), max_bio_size >> 9); + rcu_read_lock(); + max_segments = rcu_dereference(device->ldev->disk_conf)->max_bio_bvecs; + rcu_read_unlock(); + + blk_set_stacking_limits(&q->limits); + blk_queue_max_write_same_sectors(q, 0); + } + + blk_queue_logical_block_size(q, 512); + blk_queue_max_hw_sectors(q, max_hw_sectors); + /* This is the workaround for "bio would need to, but cannot, be split" */ + blk_queue_max_segments(q, max_segments ? max_segments : BLK_MAX_SEGMENTS); + blk_queue_segment_boundary(q, PAGE_CACHE_SIZE-1); + + if (b) { + struct drbd_connection *connection = first_peer_device(device)->connection; + + if (blk_queue_discard(b) && + (connection->cstate < C_CONNECTED || connection->agreed_features & FF_TRIM)) { + /* For now, don't allow more than one activity log extent worth of data + * to be discarded in one go. We may need to rework drbd_al_begin_io() + * to allow for even larger discard ranges */ + q->limits.max_discard_sectors = DRBD_MAX_DISCARD_SECTORS; + + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q); + /* REALLY? Is stacking secdiscard "legal"? 
*/ + if (blk_queue_secdiscard(b)) + queue_flag_set_unlocked(QUEUE_FLAG_SECDISCARD, q); + } else { + q->limits.max_discard_sectors = 0; + queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, q); + queue_flag_clear_unlocked(QUEUE_FLAG_SECDISCARD, q); + } + + blk_queue_stack_limits(q, b); + + if (q->backing_dev_info.ra_pages != b->backing_dev_info.ra_pages) { + drbd_info(device, "Adjusting my ra_pages to backing device's (%lu -> %lu)\n", + q->backing_dev_info.ra_pages, + b->backing_dev_info.ra_pages); + q->backing_dev_info.ra_pages = b->backing_dev_info.ra_pages; + } + } +} + +void drbd_reconsider_max_bio_size(struct drbd_device *device, struct drbd_backing_dev *bdev) +{ + unsigned int now, new, local, peer; + + now = queue_max_hw_sectors(device->rq_queue) << 9; + local = device->local_max_bio_size; /* Eventually last known value, from volatile memory */ + peer = device->peer_max_bio_size; /* Eventually last known value, from meta data */ + + if (bdev) { + local = queue_max_hw_sectors(bdev->backing_bdev->bd_disk->queue) << 9; + device->local_max_bio_size = local; + } + local = min(local, DRBD_MAX_BIO_SIZE); + + /* We may ignore peer limits if the peer is modern enough. + Because new from 8.3.8 onwards the peer can use multiple + BIOs for a single peer_request */ + if (device->state.conn >= C_WF_REPORT_PARAMS) { + if (first_peer_device(device)->connection->agreed_pro_version < 94) + peer = min(device->peer_max_bio_size, DRBD_MAX_SIZE_H80_PACKET); + /* Correct old drbd (up to 8.3.7) if it believes it can do more than 32KiB */ + else if (first_peer_device(device)->connection->agreed_pro_version == 94) + peer = DRBD_MAX_SIZE_H80_PACKET; + else if (first_peer_device(device)->connection->agreed_pro_version < 100) + peer = DRBD_MAX_BIO_SIZE_P95; /* drbd 8.3.8 onwards, before 8.4.0 */ + else + peer = DRBD_MAX_BIO_SIZE; + + /* We may later detach and re-attach on a disconnected Primary. + * Avoid this setting to jump back in that case. + * We want to store what we know the peer DRBD can handle, + * not what the peer IO backend can handle. */ + if (peer > device->peer_max_bio_size) + device->peer_max_bio_size = peer; + } + new = min(local, peer); + + if (device->state.role == R_PRIMARY && new < now) + drbd_err(device, "ASSERT FAILED new < now; (%u < %u)\n", new, now); + + if (new != now) + drbd_info(device, "max BIO size = %u\n", new); + + drbd_setup_queue_param(device, bdev, new); +} + +/* Starts the worker thread */ +static void conn_reconfig_start(struct drbd_connection *connection) +{ + drbd_thread_start(&connection->worker); + drbd_flush_workqueue(&connection->sender_work); +} + +/* if still unconfigured, stops worker again. */ +static void conn_reconfig_done(struct drbd_connection *connection) +{ + bool stop_threads; + spin_lock_irq(&connection->resource->req_lock); + stop_threads = conn_all_vols_unconf(connection) && + connection->cstate == C_STANDALONE; + spin_unlock_irq(&connection->resource->req_lock); + if (stop_threads) { + /* asender is implicitly stopped by receiver + * in conn_disconnect() */ + drbd_thread_stop(&connection->receiver); + drbd_thread_stop(&connection->worker); + } +} + +/* Make sure IO is suspended before calling this function(). 
*/ +static void drbd_suspend_al(struct drbd_device *device) +{ + int s = 0; + + if (!lc_try_lock(device->act_log)) { + drbd_warn(device, "Failed to lock al in drbd_suspend_al()\n"); + return; + } + + drbd_al_shrink(device); + spin_lock_irq(&device->resource->req_lock); + if (device->state.conn < C_CONNECTED) + s = !test_and_set_bit(AL_SUSPENDED, &device->flags); + spin_unlock_irq(&device->resource->req_lock); + lc_unlock(device->act_log); + + if (s) + drbd_info(device, "Suspended AL updates\n"); +} + + +static bool should_set_defaults(struct genl_info *info) +{ + unsigned flags = ((struct drbd_genlmsghdr*)info->userhdr)->flags; + return 0 != (flags & DRBD_GENL_F_SET_DEFAULTS); +} + +static unsigned int drbd_al_extents_max(struct drbd_backing_dev *bdev) +{ + /* This is limited by 16 bit "slot" numbers, + * and by available on-disk context storage. + * + * Also (u16)~0 is special (denotes a "free" extent). + * + * One transaction occupies one 4kB on-disk block, + * we have n such blocks in the on disk ring buffer, + * the "current" transaction may fail (n-1), + * and there is 919 slot numbers context information per transaction. + * + * 72 transaction blocks amounts to more than 2**16 context slots, + * so cap there first. + */ + const unsigned int max_al_nr = DRBD_AL_EXTENTS_MAX; + const unsigned int sufficient_on_disk = + (max_al_nr + AL_CONTEXT_PER_TRANSACTION -1) + /AL_CONTEXT_PER_TRANSACTION; + + unsigned int al_size_4k = bdev->md.al_size_4k; + + if (al_size_4k > sufficient_on_disk) + return max_al_nr; + + return (al_size_4k - 1) * AL_CONTEXT_PER_TRANSACTION; +} + +static bool write_ordering_changed(struct disk_conf *a, struct disk_conf *b) +{ + return a->disk_barrier != b->disk_barrier || + a->disk_flushes != b->disk_flushes || + a->disk_drain != b->disk_drain; +} + +int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info) +{ + struct drbd_config_context adm_ctx; + enum drbd_ret_code retcode; + struct drbd_device *device; + struct disk_conf *new_disk_conf, *old_disk_conf; + struct fifo_buffer *old_plan = NULL, *new_plan = NULL; + int err, fifo_size; + + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); + if (!adm_ctx.reply_skb) + return retcode; + if (retcode != NO_ERROR) + goto finish; + + device = adm_ctx.device; + mutex_lock(&adm_ctx.resource->adm_mutex); + + /* we also need a disk + * to change the options on */ + if (!get_ldev(device)) { + retcode = ERR_NO_DISK; + goto out; + } + + new_disk_conf = kmalloc(sizeof(struct disk_conf), GFP_KERNEL); + if (!new_disk_conf) { + retcode = ERR_NOMEM; + goto fail; + } + + mutex_lock(&device->resource->conf_update); + old_disk_conf = device->ldev->disk_conf; + *new_disk_conf = *old_disk_conf; + if (should_set_defaults(info)) + set_disk_conf_defaults(new_disk_conf); + + err = disk_conf_from_attrs_for_change(new_disk_conf, info); + if (err && err != -ENOMSG) { + retcode = ERR_MANDATORY_TAG; + drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err)); + goto fail_unlock; + } + + if (!expect(new_disk_conf->resync_rate >= 1)) + new_disk_conf->resync_rate = 1; + + if (new_disk_conf->al_extents < DRBD_AL_EXTENTS_MIN) + new_disk_conf->al_extents = DRBD_AL_EXTENTS_MIN; + if (new_disk_conf->al_extents > drbd_al_extents_max(device->ldev)) + new_disk_conf->al_extents = drbd_al_extents_max(device->ldev); + + if (new_disk_conf->c_plan_ahead > DRBD_C_PLAN_AHEAD_MAX) + new_disk_conf->c_plan_ahead = DRBD_C_PLAN_AHEAD_MAX; + + fifo_size = (new_disk_conf->c_plan_ahead * 10 * SLEEP_TIME) / HZ; + if (fifo_size != 
device->rs_plan_s->size) { + new_plan = fifo_alloc(fifo_size); + if (!new_plan) { + drbd_err(device, "kmalloc of fifo_buffer failed"); + retcode = ERR_NOMEM; + goto fail_unlock; + } + } + + drbd_suspend_io(device); + wait_event(device->al_wait, lc_try_lock(device->act_log)); + drbd_al_shrink(device); + err = drbd_check_al_size(device, new_disk_conf); + lc_unlock(device->act_log); + wake_up(&device->al_wait); + drbd_resume_io(device); + + if (err) { + retcode = ERR_NOMEM; + goto fail_unlock; + } + + write_lock_irq(&global_state_lock); + retcode = drbd_resync_after_valid(device, new_disk_conf->resync_after); + if (retcode == NO_ERROR) { + rcu_assign_pointer(device->ldev->disk_conf, new_disk_conf); + drbd_resync_after_changed(device); + } + write_unlock_irq(&global_state_lock); + + if (retcode != NO_ERROR) + goto fail_unlock; + + if (new_plan) { + old_plan = device->rs_plan_s; + rcu_assign_pointer(device->rs_plan_s, new_plan); + } + + mutex_unlock(&device->resource->conf_update); + + if (new_disk_conf->al_updates) + device->ldev->md.flags &= ~MDF_AL_DISABLED; + else + device->ldev->md.flags |= MDF_AL_DISABLED; + + if (new_disk_conf->md_flushes) + clear_bit(MD_NO_FUA, &device->flags); + else + set_bit(MD_NO_FUA, &device->flags); + + if (write_ordering_changed(old_disk_conf, new_disk_conf)) + drbd_bump_write_ordering(device->resource, NULL, WO_bdev_flush); + + drbd_md_sync(device); + + if (device->state.conn >= C_CONNECTED) { + struct drbd_peer_device *peer_device; + + for_each_peer_device(peer_device, device) + drbd_send_sync_param(peer_device); + } + + synchronize_rcu(); + kfree(old_disk_conf); + kfree(old_plan); + mod_timer(&device->request_timer, jiffies + HZ); + goto success; + +fail_unlock: + mutex_unlock(&device->resource->conf_update); + fail: + kfree(new_disk_conf); + kfree(new_plan); +success: + put_ldev(device); + out: + mutex_unlock(&adm_ctx.resource->adm_mutex); + finish: + drbd_adm_finish(&adm_ctx, info, retcode); + return 0; +} + +int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) +{ + struct drbd_config_context adm_ctx; + struct drbd_device *device; + struct drbd_peer_device *peer_device; + struct drbd_connection *connection; + int err; + enum drbd_ret_code retcode; + enum determine_dev_size dd; + sector_t max_possible_sectors; + sector_t min_md_device_sectors; + struct drbd_backing_dev *nbc = NULL; /* new_backing_conf */ + struct disk_conf *new_disk_conf = NULL; + struct block_device *bdev; + struct lru_cache *resync_lru = NULL; + struct fifo_buffer *new_plan = NULL; + union drbd_state ns, os; + enum drbd_state_rv rv; + struct net_conf *nc; + + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); + if (!adm_ctx.reply_skb) + return retcode; + if (retcode != NO_ERROR) + goto finish; + + device = adm_ctx.device; + mutex_lock(&adm_ctx.resource->adm_mutex); + peer_device = first_peer_device(device); + connection = peer_device ? peer_device->connection : NULL; + conn_reconfig_start(connection); + + /* if you want to reconfigure, please tear down first */ + if (device->state.disk > D_DISKLESS) { + retcode = ERR_DISK_CONFIGURED; + goto fail; + } + /* It may just now have detached because of IO error. Make sure + * drbd_ldev_destroy is done already, we may end up here very fast, + * e.g. 
if someone calls attach from the on-io-error handler, + * to realize a "hot spare" feature (not that I'd recommend that) */ + wait_event(device->misc_wait, !test_bit(GOING_DISKLESS, &device->flags)); + + /* make sure there is no leftover from previous force-detach attempts */ + clear_bit(FORCE_DETACH, &device->flags); + clear_bit(WAS_IO_ERROR, &device->flags); + clear_bit(WAS_READ_ERROR, &device->flags); + + /* and no leftover from previously aborted resync or verify, either */ + device->rs_total = 0; + device->rs_failed = 0; + atomic_set(&device->rs_pending_cnt, 0); + + /* allocation not in the IO path, drbdsetup context */ + nbc = kzalloc(sizeof(struct drbd_backing_dev), GFP_KERNEL); + if (!nbc) { + retcode = ERR_NOMEM; + goto fail; + } + spin_lock_init(&nbc->md.uuid_lock); + + new_disk_conf = kzalloc(sizeof(struct disk_conf), GFP_KERNEL); + if (!new_disk_conf) { + retcode = ERR_NOMEM; + goto fail; + } + nbc->disk_conf = new_disk_conf; + + set_disk_conf_defaults(new_disk_conf); + err = disk_conf_from_attrs(new_disk_conf, info); + if (err) { + retcode = ERR_MANDATORY_TAG; + drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err)); + goto fail; + } + + if (new_disk_conf->c_plan_ahead > DRBD_C_PLAN_AHEAD_MAX) + new_disk_conf->c_plan_ahead = DRBD_C_PLAN_AHEAD_MAX; + + new_plan = fifo_alloc((new_disk_conf->c_plan_ahead * 10 * SLEEP_TIME) / HZ); + if (!new_plan) { + retcode = ERR_NOMEM; + goto fail; + } + + if (new_disk_conf->meta_dev_idx < DRBD_MD_INDEX_FLEX_INT) { + retcode = ERR_MD_IDX_INVALID; + goto fail; + } + + write_lock_irq(&global_state_lock); + retcode = drbd_resync_after_valid(device, new_disk_conf->resync_after); + write_unlock_irq(&global_state_lock); + if (retcode != NO_ERROR) + goto fail; + + rcu_read_lock(); + nc = rcu_dereference(connection->net_conf); + if (nc) { + if (new_disk_conf->fencing == FP_STONITH && nc->wire_protocol == DRBD_PROT_A) { + rcu_read_unlock(); + retcode = ERR_STONITH_AND_PROT_A; + goto fail; + } + } + rcu_read_unlock(); + + bdev = blkdev_get_by_path(new_disk_conf->backing_dev, + FMODE_READ | FMODE_WRITE | FMODE_EXCL, device); + if (IS_ERR(bdev)) { + drbd_err(device, "open(\"%s\") failed with %ld\n", new_disk_conf->backing_dev, + PTR_ERR(bdev)); + retcode = ERR_OPEN_DISK; + goto fail; + } + nbc->backing_bdev = bdev; + + /* + * meta_dev_idx >= 0: external fixed size, possibly multiple + * drbd sharing one meta device. TODO in that case, paranoia + * check that [md_bdev, meta_dev_idx] is not yet used by some + * other drbd minor! (if you use drbd.conf + drbdadm, that + * should check it for you already; but if you don't, or + * someone fooled it, we need to double check here) + */ + bdev = blkdev_get_by_path(new_disk_conf->meta_dev, + FMODE_READ | FMODE_WRITE | FMODE_EXCL, + (new_disk_conf->meta_dev_idx < 0) ? + (void *)device : (void *)drbd_m_holder); + if (IS_ERR(bdev)) { + drbd_err(device, "open(\"%s\") failed with %ld\n", new_disk_conf->meta_dev, + PTR_ERR(bdev)); + retcode = ERR_OPEN_MD_DISK; + goto fail; + } + nbc->md_bdev = bdev; + + if ((nbc->backing_bdev == nbc->md_bdev) != + (new_disk_conf->meta_dev_idx == DRBD_MD_INDEX_INTERNAL || + new_disk_conf->meta_dev_idx == DRBD_MD_INDEX_FLEX_INT)) { + retcode = ERR_MD_IDX_INVALID; + goto fail; + } + + resync_lru = lc_create("resync", drbd_bm_ext_cache, + 1, 61, sizeof(struct bm_extent), + offsetof(struct bm_extent, lce)); + if (!resync_lru) { + retcode = ERR_NOMEM; + goto fail; + } + + /* Read our meta data super block early. + * This also sets other on-disk offsets. 
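+ *
+ * For internal meta data the resulting layout is the one sketched above
+ * drbd_md_set_sector_offsets(): bitmap, activity log and 4k super block at
+ * the end of the backing device.  As a rough rule of thumb that is one bit
+ * of bitmap per 4kB data block (about 32kB of bitmap per GB of data), plus
+ * 8 sectors of super block and, with the traditional 32kB activity log,
+ * 64 AL sectors.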
*/ + retcode = drbd_md_read(device, nbc); + if (retcode != NO_ERROR) + goto fail; + + if (new_disk_conf->al_extents < DRBD_AL_EXTENTS_MIN) + new_disk_conf->al_extents = DRBD_AL_EXTENTS_MIN; + if (new_disk_conf->al_extents > drbd_al_extents_max(nbc)) + new_disk_conf->al_extents = drbd_al_extents_max(nbc); + + if (drbd_get_max_capacity(nbc) < new_disk_conf->disk_size) { + drbd_err(device, "max capacity %llu smaller than disk size %llu\n", + (unsigned long long) drbd_get_max_capacity(nbc), + (unsigned long long) new_disk_conf->disk_size); + retcode = ERR_DISK_TOO_SMALL; + goto fail; + } + + if (new_disk_conf->meta_dev_idx < 0) { + max_possible_sectors = DRBD_MAX_SECTORS_FLEX; + /* at least one MB, otherwise it does not make sense */ + min_md_device_sectors = (2<<10); + } else { + max_possible_sectors = DRBD_MAX_SECTORS; + min_md_device_sectors = MD_128MB_SECT * (new_disk_conf->meta_dev_idx + 1); + } + + if (drbd_get_capacity(nbc->md_bdev) < min_md_device_sectors) { + retcode = ERR_MD_DISK_TOO_SMALL; + drbd_warn(device, "refusing attach: md-device too small, " + "at least %llu sectors needed for this meta-disk type\n", + (unsigned long long) min_md_device_sectors); + goto fail; + } + + /* Make sure the new disk is big enough + * (we may currently be R_PRIMARY with no local disk...) */ + if (drbd_get_max_capacity(nbc) < + drbd_get_capacity(device->this_bdev)) { + retcode = ERR_DISK_TOO_SMALL; + goto fail; + } + + nbc->known_size = drbd_get_capacity(nbc->backing_bdev); + + if (nbc->known_size > max_possible_sectors) { + drbd_warn(device, "==> truncating very big lower level device " + "to currently maximum possible %llu sectors <==\n", + (unsigned long long) max_possible_sectors); + if (new_disk_conf->meta_dev_idx >= 0) + drbd_warn(device, "==>> using internal or flexible " + "meta data may help <<==\n"); + } + + drbd_suspend_io(device); + /* also wait for the last barrier ack. */ + /* FIXME see also https://daiquiri.linbit/cgi-bin/bugzilla/show_bug.cgi?id=171 + * We need a way to either ignore barrier acks for barriers sent before a device + * was attached, or a way to wait for all pending barrier acks to come in. + * As barriers are counted per resource, + * we'd need to suspend io on all devices of a resource. + */ + wait_event(device->misc_wait, !atomic_read(&device->ap_pending_cnt) || drbd_suspended(device)); + /* and for any other previously queued work */ + drbd_flush_workqueue(&connection->sender_work); + + rv = _drbd_request_state(device, NS(disk, D_ATTACHING), CS_VERBOSE); + retcode = rv; /* FIXME: Type mismatch. */ + drbd_resume_io(device); + if (rv < SS_SUCCESS) + goto fail; + + if (!get_ldev_if_state(device, D_ATTACHING)) + goto force_diskless; + + if (!device->bitmap) { + if (drbd_bm_init(device)) { + retcode = ERR_NOMEM; + goto force_diskless_dec; + } + } + + if (device->state.conn < C_CONNECTED && + device->state.role == R_PRIMARY && device->ed_uuid && + (device->ed_uuid & ~((u64)1)) != (nbc->md.uuid[UI_CURRENT] & ~((u64)1))) { + drbd_err(device, "Can only attach to data with current UUID=%016llX\n", + (unsigned long long)device->ed_uuid); + retcode = ERR_DATA_NOT_CURRENT; + goto force_diskless_dec; + } + + /* Since we are diskless, fix the activity log first... */ + if (drbd_check_al_size(device, new_disk_conf)) { + retcode = ERR_NOMEM; + goto force_diskless_dec; + } + + /* Prevent shrinking of consistent devices ! 
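+ * That is: if the meta data claims MDF_CONSISTENT, refuse to attach when the
+ * effective new size (derived from the configured disk_size and the peer's
+ * size) would be smaller than the last agreed size recorded in the meta data.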
*/ + if (drbd_md_test_flag(nbc, MDF_CONSISTENT) && + drbd_new_dev_size(device, nbc, nbc->disk_conf->disk_size, 0) < nbc->md.la_size_sect) { + drbd_warn(device, "refusing to truncate a consistent device\n"); + retcode = ERR_DISK_TOO_SMALL; + goto force_diskless_dec; + } + + /* Reset the "barriers don't work" bits here, then force meta data to + * be written, to ensure we determine if barriers are supported. */ + if (new_disk_conf->md_flushes) + clear_bit(MD_NO_FUA, &device->flags); + else + set_bit(MD_NO_FUA, &device->flags); + + /* Point of no return reached. + * Devices and memory are no longer released by error cleanup below. + * now device takes over responsibility, and the state engine should + * clean it up somewhere. */ + D_ASSERT(device, device->ldev == NULL); + device->ldev = nbc; + device->resync = resync_lru; + device->rs_plan_s = new_plan; + nbc = NULL; + resync_lru = NULL; + new_disk_conf = NULL; + new_plan = NULL; + + drbd_bump_write_ordering(device->resource, device->ldev, WO_bdev_flush); + + if (drbd_md_test_flag(device->ldev, MDF_CRASHED_PRIMARY)) + set_bit(CRASHED_PRIMARY, &device->flags); + else + clear_bit(CRASHED_PRIMARY, &device->flags); + + if (drbd_md_test_flag(device->ldev, MDF_PRIMARY_IND) && + !(device->state.role == R_PRIMARY && device->resource->susp_nod)) + set_bit(CRASHED_PRIMARY, &device->flags); + + device->send_cnt = 0; + device->recv_cnt = 0; + device->read_cnt = 0; + device->writ_cnt = 0; + + drbd_reconsider_max_bio_size(device, device->ldev); + + /* If I am currently not R_PRIMARY, + * but meta data primary indicator is set, + * I just now recover from a hard crash, + * and have been R_PRIMARY before that crash. + * + * Now, if I had no connection before that crash + * (have been degraded R_PRIMARY), chances are that + * I won't find my peer now either. + * + * In that case, and _only_ in that case, + * we use the degr-wfc-timeout instead of the default, + * so we can automatically recover from a crash of a + * degraded but active "cluster" after a certain timeout. + */ + clear_bit(USE_DEGR_WFC_T, &device->flags); + if (device->state.role != R_PRIMARY && + drbd_md_test_flag(device->ldev, MDF_PRIMARY_IND) && + !drbd_md_test_flag(device->ldev, MDF_CONNECTED_IND)) + set_bit(USE_DEGR_WFC_T, &device->flags); + + dd = drbd_determine_dev_size(device, 0, NULL); + if (dd <= DS_ERROR) { + retcode = ERR_NOMEM_BITMAP; + goto force_diskless_dec; + } else if (dd == DS_GREW) + set_bit(RESYNC_AFTER_NEG, &device->flags); + + if (drbd_md_test_flag(device->ldev, MDF_FULL_SYNC) || + (test_bit(CRASHED_PRIMARY, &device->flags) && + drbd_md_test_flag(device->ldev, MDF_AL_DISABLED))) { + drbd_info(device, "Assuming that all blocks are out of sync " + "(aka FullSync)\n"); + if (drbd_bitmap_io(device, &drbd_bmio_set_n_write, + "set_n_write from attaching", BM_LOCKED_MASK)) { + retcode = ERR_IO_MD_DISK; + goto force_diskless_dec; + } + } else { + if (drbd_bitmap_io(device, &drbd_bm_read, + "read from attaching", BM_LOCKED_MASK)) { + retcode = ERR_IO_MD_DISK; + goto force_diskless_dec; + } + } + + if (_drbd_bm_total_weight(device) == drbd_bm_bits(device)) + drbd_suspend_al(device); /* IO is still suspended here... */ + + spin_lock_irq(&device->resource->req_lock); + os = drbd_read_state(device); + ns = os; + /* If MDF_CONSISTENT is not set go into inconsistent state, + otherwise investigate MDF_WasUpToDate... + If MDF_WAS_UP_TO_DATE is not set go into D_OUTDATED disk state, + otherwise into D_CONSISTENT state. 
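+
+	   In short:
+	     !MDF_CONSISTENT                         -> D_INCONSISTENT
+	      MDF_CONSISTENT && !MDF_WAS_UP_TO_DATE  -> D_OUTDATED
+	      MDF_CONSISTENT &&  MDF_WAS_UP_TO_DATE  -> D_CONSISTENT
+	   (possibly promoted to D_UP_TO_DATE right below, if the peer is
+	   known to be outdated or fencing is FP_DONT_CARE).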
+ */ + if (drbd_md_test_flag(device->ldev, MDF_CONSISTENT)) { + if (drbd_md_test_flag(device->ldev, MDF_WAS_UP_TO_DATE)) + ns.disk = D_CONSISTENT; + else + ns.disk = D_OUTDATED; + } else { + ns.disk = D_INCONSISTENT; + } + + if (drbd_md_test_flag(device->ldev, MDF_PEER_OUT_DATED)) + ns.pdsk = D_OUTDATED; + + rcu_read_lock(); + if (ns.disk == D_CONSISTENT && + (ns.pdsk == D_OUTDATED || rcu_dereference(device->ldev->disk_conf)->fencing == FP_DONT_CARE)) + ns.disk = D_UP_TO_DATE; + + /* All tests on MDF_PRIMARY_IND, MDF_CONNECTED_IND, + MDF_CONSISTENT and MDF_WAS_UP_TO_DATE must happen before + this point, because drbd_request_state() modifies these + flags. */ + + if (rcu_dereference(device->ldev->disk_conf)->al_updates) + device->ldev->md.flags &= ~MDF_AL_DISABLED; + else + device->ldev->md.flags |= MDF_AL_DISABLED; + + rcu_read_unlock(); + + /* In case we are C_CONNECTED postpone any decision on the new disk + state after the negotiation phase. */ + if (device->state.conn == C_CONNECTED) { + device->new_state_tmp.i = ns.i; + ns.i = os.i; + ns.disk = D_NEGOTIATING; + + /* We expect to receive up-to-date UUIDs soon. + To avoid a race in receive_state, free p_uuid while + holding req_lock. I.e. atomic with the state change */ + kfree(device->p_uuid); + device->p_uuid = NULL; + } + + rv = _drbd_set_state(device, ns, CS_VERBOSE, NULL); + spin_unlock_irq(&device->resource->req_lock); + + if (rv < SS_SUCCESS) + goto force_diskless_dec; + + mod_timer(&device->request_timer, jiffies + HZ); + + if (device->state.role == R_PRIMARY) + device->ldev->md.uuid[UI_CURRENT] |= (u64)1; + else + device->ldev->md.uuid[UI_CURRENT] &= ~(u64)1; + + drbd_md_mark_dirty(device); + drbd_md_sync(device); + + kobject_uevent(&disk_to_dev(device->vdisk)->kobj, KOBJ_CHANGE); + put_ldev(device); + conn_reconfig_done(connection); + mutex_unlock(&adm_ctx.resource->adm_mutex); + drbd_adm_finish(&adm_ctx, info, retcode); + return 0; + + force_diskless_dec: + put_ldev(device); + force_diskless: + drbd_force_state(device, NS(disk, D_DISKLESS)); + drbd_md_sync(device); + fail: + conn_reconfig_done(connection); + if (nbc) { + if (nbc->backing_bdev) + blkdev_put(nbc->backing_bdev, + FMODE_READ | FMODE_WRITE | FMODE_EXCL); + if (nbc->md_bdev) + blkdev_put(nbc->md_bdev, + FMODE_READ | FMODE_WRITE | FMODE_EXCL); + kfree(nbc); + } + kfree(new_disk_conf); + lc_destroy(resync_lru); + kfree(new_plan); + mutex_unlock(&adm_ctx.resource->adm_mutex); + finish: + drbd_adm_finish(&adm_ctx, info, retcode); + return 0; +} + +static int adm_detach(struct drbd_device *device, int force) +{ + enum drbd_state_rv retcode; + int ret; + + if (force) { + set_bit(FORCE_DETACH, &device->flags); + drbd_force_state(device, NS(disk, D_FAILED)); + retcode = SS_SUCCESS; + goto out; + } + + drbd_suspend_io(device); /* so no-one is stuck in drbd_al_begin_io */ + drbd_md_get_buffer(device, __func__); /* make sure there is no in-flight meta-data IO */ + retcode = drbd_request_state(device, NS(disk, D_FAILED)); + drbd_md_put_buffer(device); + /* D_FAILED will transition to DISKLESS. */ + ret = wait_event_interruptible(device->misc_wait, + device->state.disk != D_FAILED); + drbd_resume_io(device); + if ((int)retcode == (int)SS_IS_DISKLESS) + retcode = SS_NOTHING_TO_DO; + if (ret) + retcode = ERR_INTR; +out: + return retcode; +} + +/* Detaching the disk is a process in multiple stages. First we need to lock + * out application IO, in-flight IO, IO stuck in drbd_al_begin_io. 
+ * Then we transition to D_DISKLESS, and wait for put_ldev() to return all + * internal references as well. + * Only then we have finally detached. */ +int drbd_adm_detach(struct sk_buff *skb, struct genl_info *info) +{ + struct drbd_config_context adm_ctx; + enum drbd_ret_code retcode; + struct detach_parms parms = { }; + int err; + + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); + if (!adm_ctx.reply_skb) + return retcode; + if (retcode != NO_ERROR) + goto out; + + if (info->attrs[DRBD_NLA_DETACH_PARMS]) { + err = detach_parms_from_attrs(&parms, info); + if (err) { + retcode = ERR_MANDATORY_TAG; + drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err)); + goto out; + } + } + + mutex_lock(&adm_ctx.resource->adm_mutex); + retcode = adm_detach(adm_ctx.device, parms.force_detach); + mutex_unlock(&adm_ctx.resource->adm_mutex); +out: + drbd_adm_finish(&adm_ctx, info, retcode); + return 0; +} + +static bool conn_resync_running(struct drbd_connection *connection) +{ + struct drbd_peer_device *peer_device; + bool rv = false; + int vnr; + + rcu_read_lock(); + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { + struct drbd_device *device = peer_device->device; + if (device->state.conn == C_SYNC_SOURCE || + device->state.conn == C_SYNC_TARGET || + device->state.conn == C_PAUSED_SYNC_S || + device->state.conn == C_PAUSED_SYNC_T) { + rv = true; + break; + } + } + rcu_read_unlock(); + + return rv; +} + +static bool conn_ov_running(struct drbd_connection *connection) +{ + struct drbd_peer_device *peer_device; + bool rv = false; + int vnr; + + rcu_read_lock(); + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { + struct drbd_device *device = peer_device->device; + if (device->state.conn == C_VERIFY_S || + device->state.conn == C_VERIFY_T) { + rv = true; + break; + } + } + rcu_read_unlock(); + + return rv; +} + +static enum drbd_ret_code +_check_net_options(struct drbd_connection *connection, struct net_conf *old_net_conf, struct net_conf *new_net_conf) +{ + struct drbd_peer_device *peer_device; + int i; + + if (old_net_conf && connection->cstate == C_WF_REPORT_PARAMS && connection->agreed_pro_version < 100) { + if (new_net_conf->wire_protocol != old_net_conf->wire_protocol) + return ERR_NEED_APV_100; + + if (new_net_conf->two_primaries != old_net_conf->two_primaries) + return ERR_NEED_APV_100; + + if (strcmp(new_net_conf->integrity_alg, old_net_conf->integrity_alg)) + return ERR_NEED_APV_100; + } + + if (!new_net_conf->two_primaries && + conn_highest_role(connection) == R_PRIMARY && + conn_highest_peer(connection) == R_PRIMARY) + return ERR_NEED_ALLOW_TWO_PRI; + + if (new_net_conf->two_primaries && + (new_net_conf->wire_protocol != DRBD_PROT_C)) + return ERR_NOT_PROTO_C; + + idr_for_each_entry(&connection->peer_devices, peer_device, i) { + struct drbd_device *device = peer_device->device; + if (get_ldev(device)) { + enum drbd_fencing_p fp = rcu_dereference(device->ldev->disk_conf)->fencing; + put_ldev(device); + if (new_net_conf->wire_protocol == DRBD_PROT_A && fp == FP_STONITH) + return ERR_STONITH_AND_PROT_A; + } + if (device->state.role == R_PRIMARY && new_net_conf->discard_my_data) + return ERR_DISCARD_IMPOSSIBLE; + } + + if (new_net_conf->on_congestion != OC_BLOCK && new_net_conf->wire_protocol != DRBD_PROT_A) + return ERR_CONG_NOT_PROTO_A; + + return NO_ERROR; +} + +static enum drbd_ret_code +check_net_options(struct drbd_connection *connection, struct net_conf *new_net_conf) +{ + static enum drbd_ret_code rv; + struct 
drbd_peer_device *peer_device; + int i; + + rcu_read_lock(); + rv = _check_net_options(connection, rcu_dereference(connection->net_conf), new_net_conf); + rcu_read_unlock(); + + /* connection->peer_devices protected by genl_lock() here */ + idr_for_each_entry(&connection->peer_devices, peer_device, i) { + struct drbd_device *device = peer_device->device; + if (!device->bitmap) { + if (drbd_bm_init(device)) + return ERR_NOMEM; + } + } + + return rv; +} + +struct crypto { + struct crypto_hash *verify_tfm; + struct crypto_hash *csums_tfm; + struct crypto_hash *cram_hmac_tfm; + struct crypto_hash *integrity_tfm; +}; + +static int +alloc_hash(struct crypto_hash **tfm, char *tfm_name, int err_alg) +{ + if (!tfm_name[0]) + return NO_ERROR; + + *tfm = crypto_alloc_hash(tfm_name, 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(*tfm)) { + *tfm = NULL; + return err_alg; + } + + return NO_ERROR; +} + +static enum drbd_ret_code +alloc_crypto(struct crypto *crypto, struct net_conf *new_net_conf) +{ + char hmac_name[CRYPTO_MAX_ALG_NAME]; + enum drbd_ret_code rv; + + rv = alloc_hash(&crypto->csums_tfm, new_net_conf->csums_alg, + ERR_CSUMS_ALG); + if (rv != NO_ERROR) + return rv; + rv = alloc_hash(&crypto->verify_tfm, new_net_conf->verify_alg, + ERR_VERIFY_ALG); + if (rv != NO_ERROR) + return rv; + rv = alloc_hash(&crypto->integrity_tfm, new_net_conf->integrity_alg, + ERR_INTEGRITY_ALG); + if (rv != NO_ERROR) + return rv; + if (new_net_conf->cram_hmac_alg[0] != 0) { + snprintf(hmac_name, CRYPTO_MAX_ALG_NAME, "hmac(%s)", + new_net_conf->cram_hmac_alg); + + rv = alloc_hash(&crypto->cram_hmac_tfm, hmac_name, + ERR_AUTH_ALG); + } + + return rv; +} + +static void free_crypto(struct crypto *crypto) +{ + crypto_free_hash(crypto->cram_hmac_tfm); + crypto_free_hash(crypto->integrity_tfm); + crypto_free_hash(crypto->csums_tfm); + crypto_free_hash(crypto->verify_tfm); +} + +int drbd_adm_net_opts(struct sk_buff *skb, struct genl_info *info) +{ + struct drbd_config_context adm_ctx; + enum drbd_ret_code retcode; + struct drbd_connection *connection; + struct net_conf *old_net_conf, *new_net_conf = NULL; + int err; + int ovr; /* online verify running */ + int rsr; /* re-sync running */ + struct crypto crypto = { }; + + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_CONNECTION); + if (!adm_ctx.reply_skb) + return retcode; + if (retcode != NO_ERROR) + goto finish; + + connection = adm_ctx.connection; + mutex_lock(&adm_ctx.resource->adm_mutex); + + new_net_conf = kzalloc(sizeof(struct net_conf), GFP_KERNEL); + if (!new_net_conf) { + retcode = ERR_NOMEM; + goto out; + } + + conn_reconfig_start(connection); + + mutex_lock(&connection->data.mutex); + mutex_lock(&connection->resource->conf_update); + old_net_conf = connection->net_conf; + + if (!old_net_conf) { + drbd_msg_put_info(adm_ctx.reply_skb, "net conf missing, try connect"); + retcode = ERR_INVALID_REQUEST; + goto fail; + } + + *new_net_conf = *old_net_conf; + if (should_set_defaults(info)) + set_net_conf_defaults(new_net_conf); + + err = net_conf_from_attrs_for_change(new_net_conf, info); + if (err && err != -ENOMSG) { + retcode = ERR_MANDATORY_TAG; + drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err)); + goto fail; + } + + retcode = check_net_options(connection, new_net_conf); + if (retcode != NO_ERROR) + goto fail; + + /* re-sync running */ + rsr = conn_resync_running(connection); + if (rsr && strcmp(new_net_conf->csums_alg, old_net_conf->csums_alg)) { + retcode = ERR_CSUMS_RESYNC_RUNNING; + goto fail; + } + + /* online verify running */ + ovr = 
conn_ov_running(connection); + if (ovr && strcmp(new_net_conf->verify_alg, old_net_conf->verify_alg)) { + retcode = ERR_VERIFY_RUNNING; + goto fail; + } + + retcode = alloc_crypto(&crypto, new_net_conf); + if (retcode != NO_ERROR) + goto fail; + + rcu_assign_pointer(connection->net_conf, new_net_conf); + + if (!rsr) { + crypto_free_hash(connection->csums_tfm); + connection->csums_tfm = crypto.csums_tfm; + crypto.csums_tfm = NULL; + } + if (!ovr) { + crypto_free_hash(connection->verify_tfm); + connection->verify_tfm = crypto.verify_tfm; + crypto.verify_tfm = NULL; + } + + crypto_free_hash(connection->integrity_tfm); + connection->integrity_tfm = crypto.integrity_tfm; + if (connection->cstate >= C_WF_REPORT_PARAMS && connection->agreed_pro_version >= 100) + /* Do this without trying to take connection->data.mutex again. */ + __drbd_send_protocol(connection, P_PROTOCOL_UPDATE); + + crypto_free_hash(connection->cram_hmac_tfm); + connection->cram_hmac_tfm = crypto.cram_hmac_tfm; + + mutex_unlock(&connection->resource->conf_update); + mutex_unlock(&connection->data.mutex); + synchronize_rcu(); + kfree(old_net_conf); + + if (connection->cstate >= C_WF_REPORT_PARAMS) { + struct drbd_peer_device *peer_device; + int vnr; + + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) + drbd_send_sync_param(peer_device); + } + + goto done; + + fail: + mutex_unlock(&connection->resource->conf_update); + mutex_unlock(&connection->data.mutex); + free_crypto(&crypto); + kfree(new_net_conf); + done: + conn_reconfig_done(connection); + out: + mutex_unlock(&adm_ctx.resource->adm_mutex); + finish: + drbd_adm_finish(&adm_ctx, info, retcode); + return 0; +} + +int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info) +{ + struct drbd_config_context adm_ctx; + struct drbd_peer_device *peer_device; + struct net_conf *old_net_conf, *new_net_conf = NULL; + struct crypto crypto = { }; + struct drbd_resource *resource; + struct drbd_connection *connection; + enum drbd_ret_code retcode; + int i; + int err; + + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_RESOURCE); + + if (!adm_ctx.reply_skb) + return retcode; + if (retcode != NO_ERROR) + goto out; + if (!(adm_ctx.my_addr && adm_ctx.peer_addr)) { + drbd_msg_put_info(adm_ctx.reply_skb, "connection endpoint(s) missing"); + retcode = ERR_INVALID_REQUEST; + goto out; + } + + /* No need for _rcu here. All reconfiguration is + * strictly serialized on genl_lock(). 
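+ * The loop below rejects the new connection if either endpoint is already
+ * in use by an existing connection of any resource: a matching local
+ * address yields ERR_LOCAL_ADDR, a matching peer address ERR_PEER_ADDR.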
We are protected against + * concurrent reconfiguration/addition/deletion */ + for_each_resource(resource, &drbd_resources) { + for_each_connection(connection, resource) { + if (nla_len(adm_ctx.my_addr) == connection->my_addr_len && + !memcmp(nla_data(adm_ctx.my_addr), &connection->my_addr, + connection->my_addr_len)) { + retcode = ERR_LOCAL_ADDR; + goto out; + } + + if (nla_len(adm_ctx.peer_addr) == connection->peer_addr_len && + !memcmp(nla_data(adm_ctx.peer_addr), &connection->peer_addr, + connection->peer_addr_len)) { + retcode = ERR_PEER_ADDR; + goto out; + } + } + } + + mutex_lock(&adm_ctx.resource->adm_mutex); + connection = first_connection(adm_ctx.resource); + conn_reconfig_start(connection); + + if (connection->cstate > C_STANDALONE) { + retcode = ERR_NET_CONFIGURED; + goto fail; + } + + /* allocation not in the IO path, drbdsetup / netlink process context */ + new_net_conf = kzalloc(sizeof(*new_net_conf), GFP_KERNEL); + if (!new_net_conf) { + retcode = ERR_NOMEM; + goto fail; + } + + set_net_conf_defaults(new_net_conf); + + err = net_conf_from_attrs(new_net_conf, info); + if (err && err != -ENOMSG) { + retcode = ERR_MANDATORY_TAG; + drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err)); + goto fail; + } + + retcode = check_net_options(connection, new_net_conf); + if (retcode != NO_ERROR) + goto fail; + + retcode = alloc_crypto(&crypto, new_net_conf); + if (retcode != NO_ERROR) + goto fail; + + ((char *)new_net_conf->shared_secret)[SHARED_SECRET_MAX-1] = 0; + + drbd_flush_workqueue(&connection->sender_work); + + mutex_lock(&adm_ctx.resource->conf_update); + old_net_conf = connection->net_conf; + if (old_net_conf) { + retcode = ERR_NET_CONFIGURED; + mutex_unlock(&adm_ctx.resource->conf_update); + goto fail; + } + rcu_assign_pointer(connection->net_conf, new_net_conf); + + conn_free_crypto(connection); + connection->cram_hmac_tfm = crypto.cram_hmac_tfm; + connection->integrity_tfm = crypto.integrity_tfm; + connection->csums_tfm = crypto.csums_tfm; + connection->verify_tfm = crypto.verify_tfm; + + connection->my_addr_len = nla_len(adm_ctx.my_addr); + memcpy(&connection->my_addr, nla_data(adm_ctx.my_addr), connection->my_addr_len); + connection->peer_addr_len = nla_len(adm_ctx.peer_addr); + memcpy(&connection->peer_addr, nla_data(adm_ctx.peer_addr), connection->peer_addr_len); + + mutex_unlock(&adm_ctx.resource->conf_update); + + rcu_read_lock(); + idr_for_each_entry(&connection->peer_devices, peer_device, i) { + struct drbd_device *device = peer_device->device; + device->send_cnt = 0; + device->recv_cnt = 0; + } + rcu_read_unlock(); + + retcode = conn_request_state(connection, NS(conn, C_UNCONNECTED), CS_VERBOSE); + + conn_reconfig_done(connection); + mutex_unlock(&adm_ctx.resource->adm_mutex); + drbd_adm_finish(&adm_ctx, info, retcode); + return 0; + +fail: + free_crypto(&crypto); + kfree(new_net_conf); + + conn_reconfig_done(connection); + mutex_unlock(&adm_ctx.resource->adm_mutex); +out: + drbd_adm_finish(&adm_ctx, info, retcode); + return 0; +} + +static enum drbd_state_rv conn_try_disconnect(struct drbd_connection *connection, bool force) +{ + enum drbd_state_rv rv; + + rv = conn_request_state(connection, NS(conn, C_DISCONNECTING), + force ? CS_HARD : 0); + + switch (rv) { + case SS_NOTHING_TO_DO: + break; + case SS_ALREADY_STANDALONE: + return SS_SUCCESS; + case SS_PRIMARY_NOP: + /* Our state checking code wants to see the peer outdated. 
*/ + rv = conn_request_state(connection, NS2(conn, C_DISCONNECTING, pdsk, D_OUTDATED), 0); + + if (rv == SS_OUTDATE_WO_CONN) /* lost connection before graceful disconnect succeeded */ + rv = conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_VERBOSE); + + break; + case SS_CW_FAILED_BY_PEER: + /* The peer probably wants to see us outdated. */ + rv = conn_request_state(connection, NS2(conn, C_DISCONNECTING, + disk, D_OUTDATED), 0); + if (rv == SS_IS_DISKLESS || rv == SS_LOWER_THAN_OUTDATED) { + rv = conn_request_state(connection, NS(conn, C_DISCONNECTING), + CS_HARD); + } + break; + default:; + /* no special handling necessary */ + } + + if (rv >= SS_SUCCESS) { + enum drbd_state_rv rv2; + /* No one else can reconfigure the network while I am here. + * The state handling only uses drbd_thread_stop_nowait(), + * we want to really wait here until the receiver is no more. + */ + drbd_thread_stop(&connection->receiver); + + /* Race breaker. This additional state change request may be + * necessary, if this was a forced disconnect during a receiver + * restart. We may have "killed" the receiver thread just + * after drbd_receiver() returned. Typically, we should be + * C_STANDALONE already, now, and this becomes a no-op. + */ + rv2 = conn_request_state(connection, NS(conn, C_STANDALONE), + CS_VERBOSE | CS_HARD); + if (rv2 < SS_SUCCESS) + drbd_err(connection, + "unexpected rv2=%d in conn_try_disconnect()\n", + rv2); + } + return rv; +} + +int drbd_adm_disconnect(struct sk_buff *skb, struct genl_info *info) +{ + struct drbd_config_context adm_ctx; + struct disconnect_parms parms; + struct drbd_connection *connection; + enum drbd_state_rv rv; + enum drbd_ret_code retcode; + int err; + + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_CONNECTION); + if (!adm_ctx.reply_skb) + return retcode; + if (retcode != NO_ERROR) + goto fail; + + connection = adm_ctx.connection; + memset(&parms, 0, sizeof(parms)); + if (info->attrs[DRBD_NLA_DISCONNECT_PARMS]) { + err = disconnect_parms_from_attrs(&parms, info); + if (err) { + retcode = ERR_MANDATORY_TAG; + drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err)); + goto fail; + } + } + + mutex_lock(&adm_ctx.resource->adm_mutex); + rv = conn_try_disconnect(connection, parms.force_disconnect); + if (rv < SS_SUCCESS) + retcode = rv; /* FIXME: Type mismatch. 
*/ + else + retcode = NO_ERROR; + mutex_unlock(&adm_ctx.resource->adm_mutex); + fail: + drbd_adm_finish(&adm_ctx, info, retcode); + return 0; +} + +void resync_after_online_grow(struct drbd_device *device) +{ + int iass; /* I am sync source */ + + drbd_info(device, "Resync of new storage after online grow\n"); + if (device->state.role != device->state.peer) + iass = (device->state.role == R_PRIMARY); + else + iass = test_bit(RESOLVE_CONFLICTS, &first_peer_device(device)->connection->flags); + + if (iass) + drbd_start_resync(device, C_SYNC_SOURCE); + else + _drbd_request_state(device, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE + CS_SERIALIZE); +} + +int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info) +{ + struct drbd_config_context adm_ctx; + struct disk_conf *old_disk_conf, *new_disk_conf = NULL; + struct resize_parms rs; + struct drbd_device *device; + enum drbd_ret_code retcode; + enum determine_dev_size dd; + bool change_al_layout = false; + enum dds_flags ddsf; + sector_t u_size; + int err; + + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); + if (!adm_ctx.reply_skb) + return retcode; + if (retcode != NO_ERROR) + goto finish; + + mutex_lock(&adm_ctx.resource->adm_mutex); + device = adm_ctx.device; + if (!get_ldev(device)) { + retcode = ERR_NO_DISK; + goto fail; + } + + memset(&rs, 0, sizeof(struct resize_parms)); + rs.al_stripes = device->ldev->md.al_stripes; + rs.al_stripe_size = device->ldev->md.al_stripe_size_4k * 4; + if (info->attrs[DRBD_NLA_RESIZE_PARMS]) { + err = resize_parms_from_attrs(&rs, info); + if (err) { + retcode = ERR_MANDATORY_TAG; + drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err)); + goto fail_ldev; + } + } + + if (device->state.conn > C_CONNECTED) { + retcode = ERR_RESIZE_RESYNC; + goto fail_ldev; + } + + if (device->state.role == R_SECONDARY && + device->state.peer == R_SECONDARY) { + retcode = ERR_NO_PRIMARY; + goto fail_ldev; + } + + if (rs.no_resync && first_peer_device(device)->connection->agreed_pro_version < 93) { + retcode = ERR_NEED_APV_93; + goto fail_ldev; + } + + rcu_read_lock(); + u_size = rcu_dereference(device->ldev->disk_conf)->disk_size; + rcu_read_unlock(); + if (u_size != (sector_t)rs.resize_size) { + new_disk_conf = kmalloc(sizeof(struct disk_conf), GFP_KERNEL); + if (!new_disk_conf) { + retcode = ERR_NOMEM; + goto fail_ldev; + } + } + + if (device->ldev->md.al_stripes != rs.al_stripes || + device->ldev->md.al_stripe_size_4k != rs.al_stripe_size / 4) { + u32 al_size_k = rs.al_stripes * rs.al_stripe_size; + + if (al_size_k > (16 * 1024 * 1024)) { + retcode = ERR_MD_LAYOUT_TOO_BIG; + goto fail_ldev; + } + + if (al_size_k < MD_32kB_SECT/2) { + retcode = ERR_MD_LAYOUT_TOO_SMALL; + goto fail_ldev; + } + + if (device->state.conn != C_CONNECTED && !rs.resize_force) { + retcode = ERR_MD_LAYOUT_CONNECTED; + goto fail_ldev; + } + + change_al_layout = true; + } + + if (device->ldev->known_size != drbd_get_capacity(device->ldev->backing_bdev)) + device->ldev->known_size = drbd_get_capacity(device->ldev->backing_bdev); + + if (new_disk_conf) { + mutex_lock(&device->resource->conf_update); + old_disk_conf = device->ldev->disk_conf; + *new_disk_conf = *old_disk_conf; + new_disk_conf->disk_size = (sector_t)rs.resize_size; + rcu_assign_pointer(device->ldev->disk_conf, new_disk_conf); + mutex_unlock(&device->resource->conf_update); + synchronize_rcu(); + kfree(old_disk_conf); + } + + ddsf = (rs.resize_force ? DDSF_FORCED : 0) | (rs.no_resync ? 
DDSF_NO_RESYNC : 0); + dd = drbd_determine_dev_size(device, ddsf, change_al_layout ? &rs : NULL); + drbd_md_sync(device); + put_ldev(device); + if (dd == DS_ERROR) { + retcode = ERR_NOMEM_BITMAP; + goto fail; + } else if (dd == DS_ERROR_SPACE_MD) { + retcode = ERR_MD_LAYOUT_NO_FIT; + goto fail; + } else if (dd == DS_ERROR_SHRINK) { + retcode = ERR_IMPLICIT_SHRINK; + goto fail; + } + + if (device->state.conn == C_CONNECTED) { + if (dd == DS_GREW) + set_bit(RESIZE_PENDING, &device->flags); + + drbd_send_uuids(first_peer_device(device)); + drbd_send_sizes(first_peer_device(device), 1, ddsf); + } + + fail: + mutex_unlock(&adm_ctx.resource->adm_mutex); + finish: + drbd_adm_finish(&adm_ctx, info, retcode); + return 0; + + fail_ldev: + put_ldev(device); + goto fail; +} + +int drbd_adm_resource_opts(struct sk_buff *skb, struct genl_info *info) +{ + struct drbd_config_context adm_ctx; + enum drbd_ret_code retcode; + struct res_opts res_opts; + int err; + + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_RESOURCE); + if (!adm_ctx.reply_skb) + return retcode; + if (retcode != NO_ERROR) + goto fail; + + res_opts = adm_ctx.resource->res_opts; + if (should_set_defaults(info)) + set_res_opts_defaults(&res_opts); + + err = res_opts_from_attrs(&res_opts, info); + if (err && err != -ENOMSG) { + retcode = ERR_MANDATORY_TAG; + drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err)); + goto fail; + } + + mutex_lock(&adm_ctx.resource->adm_mutex); + err = set_resource_options(adm_ctx.resource, &res_opts); + if (err) { + retcode = ERR_INVALID_REQUEST; + if (err == -ENOMEM) + retcode = ERR_NOMEM; + } + mutex_unlock(&adm_ctx.resource->adm_mutex); + +fail: + drbd_adm_finish(&adm_ctx, info, retcode); + return 0; +} + +int drbd_adm_invalidate(struct sk_buff *skb, struct genl_info *info) +{ + struct drbd_config_context adm_ctx; + struct drbd_device *device; + int retcode; /* enum drbd_ret_code rsp. enum drbd_state_rv */ + + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); + if (!adm_ctx.reply_skb) + return retcode; + if (retcode != NO_ERROR) + goto out; + + device = adm_ctx.device; + if (!get_ldev(device)) { + retcode = ERR_NO_DISK; + goto out; + } + + mutex_lock(&adm_ctx.resource->adm_mutex); + + /* If there is still bitmap IO pending, probably because of a previous + * resync just being finished, wait for it before requesting a new resync. + * Also wait for it's after_state_ch(). */ + drbd_suspend_io(device); + wait_event(device->misc_wait, !test_bit(BITMAP_IO, &device->flags)); + drbd_flush_workqueue(&first_peer_device(device)->connection->sender_work); + + /* If we happen to be C_STANDALONE R_SECONDARY, just change to + * D_INCONSISTENT, and set all bits in the bitmap. Otherwise, + * try to start a resync handshake as sync target for full sync. 
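+ *
+ * (The mirror image of this, forcing a full sync towards the peer with the
+ *  local node as sync source, is drbd_adm_invalidate_peer() below.)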
+ */ + if (device->state.conn == C_STANDALONE && device->state.role == R_SECONDARY) { + retcode = drbd_request_state(device, NS(disk, D_INCONSISTENT)); + if (retcode >= SS_SUCCESS) { + if (drbd_bitmap_io(device, &drbd_bmio_set_n_write, + "set_n_write from invalidate", BM_LOCKED_MASK)) + retcode = ERR_IO_MD_DISK; + } + } else + retcode = drbd_request_state(device, NS(conn, C_STARTING_SYNC_T)); + drbd_resume_io(device); + mutex_unlock(&adm_ctx.resource->adm_mutex); + put_ldev(device); +out: + drbd_adm_finish(&adm_ctx, info, retcode); + return 0; +} + +static int drbd_adm_simple_request_state(struct sk_buff *skb, struct genl_info *info, + union drbd_state mask, union drbd_state val) +{ + struct drbd_config_context adm_ctx; + enum drbd_ret_code retcode; + + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); + if (!adm_ctx.reply_skb) + return retcode; + if (retcode != NO_ERROR) + goto out; + + mutex_lock(&adm_ctx.resource->adm_mutex); + retcode = drbd_request_state(adm_ctx.device, mask, val); + mutex_unlock(&adm_ctx.resource->adm_mutex); +out: + drbd_adm_finish(&adm_ctx, info, retcode); + return 0; +} + +static int drbd_bmio_set_susp_al(struct drbd_device *device) __must_hold(local) +{ + int rv; + + rv = drbd_bmio_set_n_write(device); + drbd_suspend_al(device); + return rv; +} + +int drbd_adm_invalidate_peer(struct sk_buff *skb, struct genl_info *info) +{ + struct drbd_config_context adm_ctx; + int retcode; /* drbd_ret_code, drbd_state_rv */ + struct drbd_device *device; + + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); + if (!adm_ctx.reply_skb) + return retcode; + if (retcode != NO_ERROR) + goto out; + + device = adm_ctx.device; + if (!get_ldev(device)) { + retcode = ERR_NO_DISK; + goto out; + } + + mutex_lock(&adm_ctx.resource->adm_mutex); + + /* If there is still bitmap IO pending, probably because of a previous + * resync just being finished, wait for it before requesting a new resync. + * Also wait for it's after_state_ch(). */ + drbd_suspend_io(device); + wait_event(device->misc_wait, !test_bit(BITMAP_IO, &device->flags)); + drbd_flush_workqueue(&first_peer_device(device)->connection->sender_work); + + /* If we happen to be C_STANDALONE R_PRIMARY, just set all bits + * in the bitmap. Otherwise, try to start a resync handshake + * as sync source for full sync. + */ + if (device->state.conn == C_STANDALONE && device->state.role == R_PRIMARY) { + /* The peer will get a resync upon connect anyways. Just make that + into a full resync. 
*/ + retcode = drbd_request_state(device, NS(pdsk, D_INCONSISTENT)); + if (retcode >= SS_SUCCESS) { + if (drbd_bitmap_io(device, &drbd_bmio_set_susp_al, + "set_n_write from invalidate_peer", + BM_LOCKED_SET_ALLOWED)) + retcode = ERR_IO_MD_DISK; + } + } else + retcode = drbd_request_state(device, NS(conn, C_STARTING_SYNC_S)); + drbd_resume_io(device); + mutex_unlock(&adm_ctx.resource->adm_mutex); + put_ldev(device); +out: + drbd_adm_finish(&adm_ctx, info, retcode); + return 0; +} + +int drbd_adm_pause_sync(struct sk_buff *skb, struct genl_info *info) +{ + struct drbd_config_context adm_ctx; + enum drbd_ret_code retcode; + + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); + if (!adm_ctx.reply_skb) + return retcode; + if (retcode != NO_ERROR) + goto out; + + mutex_lock(&adm_ctx.resource->adm_mutex); + if (drbd_request_state(adm_ctx.device, NS(user_isp, 1)) == SS_NOTHING_TO_DO) + retcode = ERR_PAUSE_IS_SET; + mutex_unlock(&adm_ctx.resource->adm_mutex); +out: + drbd_adm_finish(&adm_ctx, info, retcode); + return 0; +} + +int drbd_adm_resume_sync(struct sk_buff *skb, struct genl_info *info) +{ + struct drbd_config_context adm_ctx; + union drbd_dev_state s; + enum drbd_ret_code retcode; + + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); + if (!adm_ctx.reply_skb) + return retcode; + if (retcode != NO_ERROR) + goto out; + + mutex_lock(&adm_ctx.resource->adm_mutex); + if (drbd_request_state(adm_ctx.device, NS(user_isp, 0)) == SS_NOTHING_TO_DO) { + s = adm_ctx.device->state; + if (s.conn == C_PAUSED_SYNC_S || s.conn == C_PAUSED_SYNC_T) { + retcode = s.aftr_isp ? ERR_PIC_AFTER_DEP : + s.peer_isp ? ERR_PIC_PEER_DEP : ERR_PAUSE_IS_CLEAR; + } else { + retcode = ERR_PAUSE_IS_CLEAR; + } + } + mutex_unlock(&adm_ctx.resource->adm_mutex); +out: + drbd_adm_finish(&adm_ctx, info, retcode); + return 0; +} + +int drbd_adm_suspend_io(struct sk_buff *skb, struct genl_info *info) +{ + return drbd_adm_simple_request_state(skb, info, NS(susp, 1)); +} + +int drbd_adm_resume_io(struct sk_buff *skb, struct genl_info *info) +{ + struct drbd_config_context adm_ctx; + struct drbd_device *device; + int retcode; /* enum drbd_ret_code rsp. 
enum drbd_state_rv */ + + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); + if (!adm_ctx.reply_skb) + return retcode; + if (retcode != NO_ERROR) + goto out; + + mutex_lock(&adm_ctx.resource->adm_mutex); + device = adm_ctx.device; + if (test_bit(NEW_CUR_UUID, &device->flags)) { + drbd_uuid_new_current(device); + clear_bit(NEW_CUR_UUID, &device->flags); + } + drbd_suspend_io(device); + retcode = drbd_request_state(device, NS3(susp, 0, susp_nod, 0, susp_fen, 0)); + if (retcode == SS_SUCCESS) { + if (device->state.conn < C_CONNECTED) + tl_clear(first_peer_device(device)->connection); + if (device->state.disk == D_DISKLESS || device->state.disk == D_FAILED) + tl_restart(first_peer_device(device)->connection, FAIL_FROZEN_DISK_IO); + } + drbd_resume_io(device); + mutex_unlock(&adm_ctx.resource->adm_mutex); +out: + drbd_adm_finish(&adm_ctx, info, retcode); + return 0; +} + +int drbd_adm_outdate(struct sk_buff *skb, struct genl_info *info) +{ + return drbd_adm_simple_request_state(skb, info, NS(disk, D_OUTDATED)); +} + +static int nla_put_drbd_cfg_context(struct sk_buff *skb, + struct drbd_resource *resource, + struct drbd_connection *connection, + struct drbd_device *device) +{ + struct nlattr *nla; + nla = nla_nest_start(skb, DRBD_NLA_CFG_CONTEXT); + if (!nla) + goto nla_put_failure; + if (device && + nla_put_u32(skb, T_ctx_volume, device->vnr)) + goto nla_put_failure; + if (nla_put_string(skb, T_ctx_resource_name, resource->name)) + goto nla_put_failure; + if (connection) { + if (connection->my_addr_len && + nla_put(skb, T_ctx_my_addr, connection->my_addr_len, &connection->my_addr)) + goto nla_put_failure; + if (connection->peer_addr_len && + nla_put(skb, T_ctx_peer_addr, connection->peer_addr_len, &connection->peer_addr)) + goto nla_put_failure; + } + nla_nest_end(skb, nla); + return 0; + +nla_put_failure: + if (nla) + nla_nest_cancel(skb, nla); + return -EMSGSIZE; +} + +/* + * Return the connection of @resource if @resource has exactly one connection. + */ +static struct drbd_connection *the_only_connection(struct drbd_resource *resource) +{ + struct list_head *connections = &resource->connections; + + if (list_empty(connections) || connections->next->next != connections) + return NULL; + return list_first_entry(&resource->connections, struct drbd_connection, connections); +} + +static int nla_put_status_info(struct sk_buff *skb, struct drbd_device *device, + const struct sib_info *sib) +{ + struct drbd_resource *resource = device->resource; + struct state_info *si = NULL; /* for sizeof(si->member); */ + struct nlattr *nla; + int got_ldev; + int err = 0; + int exclude_sensitive; + + /* If sib != NULL, this is drbd_bcast_event, which anyone can listen + * to. So we better exclude_sensitive information. + * + * If sib == NULL, this is drbd_adm_get_status, executed synchronously + * in the context of the requesting user process. Exclude sensitive + * information, unless current has superuser. + * + * NOTE: for drbd_adm_get_status_all(), this is a netlink dump, and + * relies on the current implementation of netlink_dump(), which + * executes the dump callback successively from netlink_recvmsg(), + * always in the context of the receiving process */ + exclude_sensitive = sib || !capable(CAP_SYS_ADMIN); + + got_ldev = get_ldev(device); + + /* We need to add connection name and volume number information still. + * Minor number is in drbd_genlmsghdr. 
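+ *
+ * The reply skb assembled below roughly looks like:
+ *
+ *	drbd_genlmsghdr (minor, ret_code)
+ *	DRBD_NLA_CFG_CONTEXT
+ *		T_ctx_volume, T_ctx_resource_name,
+ *		[T_ctx_my_addr, T_ctx_peer_addr]
+ *	resource options, disk options, net options (as far as configured,
+ *	sensitive fields possibly excluded)
+ *	DRBD_NLA_STATE_INFO
+ *		T_current_state, T_capacity, IO counters, uuids, ...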
*/ + if (nla_put_drbd_cfg_context(skb, resource, the_only_connection(resource), device)) + goto nla_put_failure; + + if (res_opts_to_skb(skb, &device->resource->res_opts, exclude_sensitive)) + goto nla_put_failure; + + rcu_read_lock(); + if (got_ldev) { + struct disk_conf *disk_conf; + + disk_conf = rcu_dereference(device->ldev->disk_conf); + err = disk_conf_to_skb(skb, disk_conf, exclude_sensitive); + } + if (!err) { + struct net_conf *nc; + + nc = rcu_dereference(first_peer_device(device)->connection->net_conf); + if (nc) + err = net_conf_to_skb(skb, nc, exclude_sensitive); + } + rcu_read_unlock(); + if (err) + goto nla_put_failure; + + nla = nla_nest_start(skb, DRBD_NLA_STATE_INFO); + if (!nla) + goto nla_put_failure; + if (nla_put_u32(skb, T_sib_reason, sib ? sib->sib_reason : SIB_GET_STATUS_REPLY) || + nla_put_u32(skb, T_current_state, device->state.i) || + nla_put_u64(skb, T_ed_uuid, device->ed_uuid) || + nla_put_u64(skb, T_capacity, drbd_get_capacity(device->this_bdev)) || + nla_put_u64(skb, T_send_cnt, device->send_cnt) || + nla_put_u64(skb, T_recv_cnt, device->recv_cnt) || + nla_put_u64(skb, T_read_cnt, device->read_cnt) || + nla_put_u64(skb, T_writ_cnt, device->writ_cnt) || + nla_put_u64(skb, T_al_writ_cnt, device->al_writ_cnt) || + nla_put_u64(skb, T_bm_writ_cnt, device->bm_writ_cnt) || + nla_put_u32(skb, T_ap_bio_cnt, atomic_read(&device->ap_bio_cnt)) || + nla_put_u32(skb, T_ap_pending_cnt, atomic_read(&device->ap_pending_cnt)) || + nla_put_u32(skb, T_rs_pending_cnt, atomic_read(&device->rs_pending_cnt))) + goto nla_put_failure; + + if (got_ldev) { + int err; + + spin_lock_irq(&device->ldev->md.uuid_lock); + err = nla_put(skb, T_uuids, sizeof(si->uuids), device->ldev->md.uuid); + spin_unlock_irq(&device->ldev->md.uuid_lock); + + if (err) + goto nla_put_failure; + + if (nla_put_u32(skb, T_disk_flags, device->ldev->md.flags) || + nla_put_u64(skb, T_bits_total, drbd_bm_bits(device)) || + nla_put_u64(skb, T_bits_oos, drbd_bm_total_weight(device))) + goto nla_put_failure; + if (C_SYNC_SOURCE <= device->state.conn && + C_PAUSED_SYNC_T >= device->state.conn) { + if (nla_put_u64(skb, T_bits_rs_total, device->rs_total) || + nla_put_u64(skb, T_bits_rs_failed, device->rs_failed)) + goto nla_put_failure; + } + } + + if (sib) { + switch(sib->sib_reason) { + case SIB_SYNC_PROGRESS: + case SIB_GET_STATUS_REPLY: + break; + case SIB_STATE_CHANGE: + if (nla_put_u32(skb, T_prev_state, sib->os.i) || + nla_put_u32(skb, T_new_state, sib->ns.i)) + goto nla_put_failure; + break; + case SIB_HELPER_POST: + if (nla_put_u32(skb, T_helper_exit_code, + sib->helper_exit_code)) + goto nla_put_failure; + /* fall through */ + case SIB_HELPER_PRE: + if (nla_put_string(skb, T_helper, sib->helper_name)) + goto nla_put_failure; + break; + } + } + nla_nest_end(skb, nla); + + if (0) +nla_put_failure: + err = -EMSGSIZE; + if (got_ldev) + put_ldev(device); + return err; +} + +int drbd_adm_get_status(struct sk_buff *skb, struct genl_info *info) +{ + struct drbd_config_context adm_ctx; + enum drbd_ret_code retcode; + int err; + + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); + if (!adm_ctx.reply_skb) + return retcode; + if (retcode != NO_ERROR) + goto out; + + err = nla_put_status_info(adm_ctx.reply_skb, adm_ctx.device, NULL); + if (err) { + nlmsg_free(adm_ctx.reply_skb); + return err; + } +out: + drbd_adm_finish(&adm_ctx, info, retcode); + return 0; +} + +static int get_one_status(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct drbd_device *device; + struct drbd_genlmsghdr *dh; + 
struct drbd_resource *pos = (struct drbd_resource *)cb->args[0]; + struct drbd_resource *resource = NULL; + struct drbd_resource *tmp; + unsigned volume = cb->args[1]; + + /* Open coded, deferred, iteration: + * for_each_resource_safe(resource, tmp, &drbd_resources) { + * connection = "first connection of resource or undefined"; + * idr_for_each_entry(&resource->devices, device, i) { + * ... + * } + * } + * where resource is cb->args[0]; + * and i is cb->args[1]; + * + * cb->args[2] indicates if we shall loop over all resources, + * or just dump all volumes of a single resource. + * + * This may miss entries inserted after this dump started, + * or entries deleted before they are reached. + * + * We need to make sure the device won't disappear while + * we are looking at it, and revalidate our iterators + * on each iteration. + */ + + /* synchronize with conn_create()/drbd_destroy_connection() */ + rcu_read_lock(); + /* revalidate iterator position */ + for_each_resource_rcu(tmp, &drbd_resources) { + if (pos == NULL) { + /* first iteration */ + pos = tmp; + resource = pos; + break; + } + if (tmp == pos) { + resource = pos; + break; + } + } + if (resource) { +next_resource: + device = idr_get_next(&resource->devices, &volume); + if (!device) { + /* No more volumes to dump on this resource. + * Advance resource iterator. */ + pos = list_entry_rcu(resource->resources.next, + struct drbd_resource, resources); + /* Did we dump any volume of this resource yet? */ + if (volume != 0) { + /* If we reached the end of the list, + * or only a single resource dump was requested, + * we are done. */ + if (&pos->resources == &drbd_resources || cb->args[2]) + goto out; + volume = 0; + resource = pos; + goto next_resource; + } + } + + dh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, &drbd_genl_family, + NLM_F_MULTI, DRBD_ADM_GET_STATUS); + if (!dh) + goto out; + + if (!device) { + /* This is a connection without a single volume. + * Suprisingly enough, it may have a network + * configuration. */ + struct drbd_connection *connection; + + dh->minor = -1U; + dh->ret_code = NO_ERROR; + connection = the_only_connection(resource); + if (nla_put_drbd_cfg_context(skb, resource, connection, NULL)) + goto cancel; + if (connection) { + struct net_conf *nc; + + nc = rcu_dereference(connection->net_conf); + if (nc && net_conf_to_skb(skb, nc, 1) != 0) + goto cancel; + } + goto done; + } + + D_ASSERT(device, device->vnr == volume); + D_ASSERT(device, device->resource == resource); + + dh->minor = device_to_minor(device); + dh->ret_code = NO_ERROR; + + if (nla_put_status_info(skb, device, NULL)) { +cancel: + genlmsg_cancel(skb, dh); + goto out; + } +done: + genlmsg_end(skb, dh); + } + +out: + rcu_read_unlock(); + /* where to start the next iteration */ + cb->args[0] = (long)pos; + cb->args[1] = (pos == resource) ? volume + 1 : 0; + + /* No more resources/volumes/minors found results in an empty skb. + * Which will terminate the dump. */ + return skb->len; +} + +/* + * Request status of all resources, or of all volumes within a single resource. + * + * This is a dump, as the answer may not fit in a single reply skb otherwise. + * Which means we cannot use the family->attrbuf or other such members, because + * dump is NOT protected by the genl_lock(). During dump, we only have access + * to the incoming skb, and need to opencode "parsing" of the nlattr payload. + * + * Once things are setup properly, we call into get_one_status(). 
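+ *
+ * As a rough sketch, the iterator state carried in cb->args[] between
+ * successive dump calls (and consumed by get_one_status() above) is:
+ *   cb->args[0]  current resource position (a struct drbd_resource *)
+ *   cb->args[1]  next volume number to dump within that resource
+ *   cb->args[2]  if set, dump only this one resource ("filter" mode)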
+ */ +int drbd_adm_get_status_all(struct sk_buff *skb, struct netlink_callback *cb) +{ + const unsigned hdrlen = GENL_HDRLEN + GENL_MAGIC_FAMILY_HDRSZ; + struct nlattr *nla; + const char *resource_name; + struct drbd_resource *resource; + int maxtype; + + /* Is this a followup call? */ + if (cb->args[0]) { + /* ... of a single resource dump, + * and the resource iterator has been advanced already? */ + if (cb->args[2] && cb->args[2] != cb->args[0]) + return 0; /* DONE. */ + goto dump; + } + + /* First call (from netlink_dump_start). We need to figure out + * which resource(s) the user wants us to dump. */ + nla = nla_find(nlmsg_attrdata(cb->nlh, hdrlen), + nlmsg_attrlen(cb->nlh, hdrlen), + DRBD_NLA_CFG_CONTEXT); + + /* No explicit context given. Dump all. */ + if (!nla) + goto dump; + maxtype = ARRAY_SIZE(drbd_cfg_context_nl_policy) - 1; + nla = drbd_nla_find_nested(maxtype, nla, __nla_type(T_ctx_resource_name)); + if (IS_ERR(nla)) + return PTR_ERR(nla); + /* context given, but no name present? */ + if (!nla) + return -EINVAL; + resource_name = nla_data(nla); + if (!*resource_name) + return -ENODEV; + resource = drbd_find_resource(resource_name); + if (!resource) + return -ENODEV; + + kref_put(&resource->kref, drbd_destroy_resource); /* get_one_status() revalidates the resource */ + + /* prime iterators, and set "filter" mode mark: + * only dump this connection. */ + cb->args[0] = (long)resource; + /* cb->args[1] = 0; passed in this way. */ + cb->args[2] = (long)resource; + +dump: + return get_one_status(skb, cb); +} + +int drbd_adm_get_timeout_type(struct sk_buff *skb, struct genl_info *info) +{ + struct drbd_config_context adm_ctx; + enum drbd_ret_code retcode; + struct timeout_parms tp; + int err; + + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); + if (!adm_ctx.reply_skb) + return retcode; + if (retcode != NO_ERROR) + goto out; + + tp.timeout_type = + adm_ctx.device->state.pdsk == D_OUTDATED ? UT_PEER_OUTDATED : + test_bit(USE_DEGR_WFC_T, &adm_ctx.device->flags) ? UT_DEGRADED : + UT_DEFAULT; + + err = timeout_parms_to_priv_skb(adm_ctx.reply_skb, &tp); + if (err) { + nlmsg_free(adm_ctx.reply_skb); + return err; + } +out: + drbd_adm_finish(&adm_ctx, info, retcode); + return 0; +} + +int drbd_adm_start_ov(struct sk_buff *skb, struct genl_info *info) +{ + struct drbd_config_context adm_ctx; + struct drbd_device *device; + enum drbd_ret_code retcode; + struct start_ov_parms parms; + + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); + if (!adm_ctx.reply_skb) + return retcode; + if (retcode != NO_ERROR) + goto out; + + device = adm_ctx.device; + + /* resume from last known position, if possible */ + parms.ov_start_sector = device->ov_start_sector; + parms.ov_stop_sector = ULLONG_MAX; + if (info->attrs[DRBD_NLA_START_OV_PARMS]) { + int err = start_ov_parms_from_attrs(&parms, info); + if (err) { + retcode = ERR_MANDATORY_TAG; + drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err)); + goto out; + } + } + mutex_lock(&adm_ctx.resource->adm_mutex); + + /* w_make_ov_request expects position to be aligned */ + device->ov_start_sector = parms.ov_start_sector & ~(BM_SECT_PER_BIT-1); + device->ov_stop_sector = parms.ov_stop_sector; + + /* If there is still bitmap IO pending, e.g. previous resync or verify + * just being finished, wait for it before requesting a new resync. 
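+ *
+ * Suspend application IO first so nothing new gets submitted while we
+ * wait for BITMAP_IO to clear, then request C_VERIFY_S and resume.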
*/ + drbd_suspend_io(device); + wait_event(device->misc_wait, !test_bit(BITMAP_IO, &device->flags)); + retcode = drbd_request_state(device, NS(conn, C_VERIFY_S)); + drbd_resume_io(device); + + mutex_unlock(&adm_ctx.resource->adm_mutex); +out: + drbd_adm_finish(&adm_ctx, info, retcode); + return 0; +} + + +int drbd_adm_new_c_uuid(struct sk_buff *skb, struct genl_info *info) +{ + struct drbd_config_context adm_ctx; + struct drbd_device *device; + enum drbd_ret_code retcode; + int skip_initial_sync = 0; + int err; + struct new_c_uuid_parms args; + + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); + if (!adm_ctx.reply_skb) + return retcode; + if (retcode != NO_ERROR) + goto out_nolock; + + device = adm_ctx.device; + memset(&args, 0, sizeof(args)); + if (info->attrs[DRBD_NLA_NEW_C_UUID_PARMS]) { + err = new_c_uuid_parms_from_attrs(&args, info); + if (err) { + retcode = ERR_MANDATORY_TAG; + drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err)); + goto out_nolock; + } + } + + mutex_lock(&adm_ctx.resource->adm_mutex); + mutex_lock(device->state_mutex); /* Protects us against serialized state changes. */ + + if (!get_ldev(device)) { + retcode = ERR_NO_DISK; + goto out; + } + + /* this is "skip initial sync", assume to be clean */ + if (device->state.conn == C_CONNECTED && + first_peer_device(device)->connection->agreed_pro_version >= 90 && + device->ldev->md.uuid[UI_CURRENT] == UUID_JUST_CREATED && args.clear_bm) { + drbd_info(device, "Preparing to skip initial sync\n"); + skip_initial_sync = 1; + } else if (device->state.conn != C_STANDALONE) { + retcode = ERR_CONNECTED; + goto out_dec; + } + + drbd_uuid_set(device, UI_BITMAP, 0); /* Rotate UI_BITMAP to History 1, etc... */ + drbd_uuid_new_current(device); /* New current, previous to UI_BITMAP */ + + if (args.clear_bm) { + err = drbd_bitmap_io(device, &drbd_bmio_clear_n_write, + "clear_n_write from new_c_uuid", BM_LOCKED_MASK); + if (err) { + drbd_err(device, "Writing bitmap failed with %d\n", err); + retcode = ERR_IO_MD_DISK; + } + if (skip_initial_sync) { + drbd_send_uuids_skip_initial_sync(first_peer_device(device)); + _drbd_uuid_set(device, UI_BITMAP, 0); + drbd_print_uuids(device, "cleared bitmap UUID"); + spin_lock_irq(&device->resource->req_lock); + _drbd_set_state(_NS2(device, disk, D_UP_TO_DATE, pdsk, D_UP_TO_DATE), + CS_VERBOSE, NULL); + spin_unlock_irq(&device->resource->req_lock); + } + } + + drbd_md_sync(device); +out_dec: + put_ldev(device); +out: + mutex_unlock(device->state_mutex); + mutex_unlock(&adm_ctx.resource->adm_mutex); +out_nolock: + drbd_adm_finish(&adm_ctx, info, retcode); + return 0; +} + +static enum drbd_ret_code +drbd_check_resource_name(struct drbd_config_context *adm_ctx) +{ + const char *name = adm_ctx->resource_name; + if (!name || !name[0]) { + drbd_msg_put_info(adm_ctx->reply_skb, "resource name missing"); + return ERR_MANDATORY_TAG; + } + /* if we want to use these in sysfs/configfs/debugfs some day, + * we must not allow slashes */ + if (strchr(name, '/')) { + drbd_msg_put_info(adm_ctx->reply_skb, "invalid resource name"); + return ERR_INVALID_REQUEST; + } + return NO_ERROR; +} + +int drbd_adm_new_resource(struct sk_buff *skb, struct genl_info *info) +{ + struct drbd_config_context adm_ctx; + enum drbd_ret_code retcode; + struct res_opts res_opts; + int err; + + retcode = drbd_adm_prepare(&adm_ctx, skb, info, 0); + if (!adm_ctx.reply_skb) + return retcode; + if (retcode != NO_ERROR) + goto out; + + set_res_opts_defaults(&res_opts); + err = res_opts_from_attrs(&res_opts, info); 
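+ /* -ENOMSG: no resource options were given, keep the defaults set above */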
+ if (err && err != -ENOMSG) { + retcode = ERR_MANDATORY_TAG; + drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err)); + goto out; + } + + retcode = drbd_check_resource_name(&adm_ctx); + if (retcode != NO_ERROR) + goto out; + + if (adm_ctx.resource) { + if (info->nlhdr->nlmsg_flags & NLM_F_EXCL) { + retcode = ERR_INVALID_REQUEST; + drbd_msg_put_info(adm_ctx.reply_skb, "resource exists"); + } + /* else: still NO_ERROR */ + goto out; + } + + /* not yet safe for genl_family.parallel_ops */ + if (!conn_create(adm_ctx.resource_name, &res_opts)) + retcode = ERR_NOMEM; +out: + drbd_adm_finish(&adm_ctx, info, retcode); + return 0; +} + +int drbd_adm_new_minor(struct sk_buff *skb, struct genl_info *info) +{ + struct drbd_config_context adm_ctx; + struct drbd_genlmsghdr *dh = info->userhdr; + enum drbd_ret_code retcode; + + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_RESOURCE); + if (!adm_ctx.reply_skb) + return retcode; + if (retcode != NO_ERROR) + goto out; + + if (dh->minor > MINORMASK) { + drbd_msg_put_info(adm_ctx.reply_skb, "requested minor out of range"); + retcode = ERR_INVALID_REQUEST; + goto out; + } + if (adm_ctx.volume > DRBD_VOLUME_MAX) { + drbd_msg_put_info(adm_ctx.reply_skb, "requested volume id out of range"); + retcode = ERR_INVALID_REQUEST; + goto out; + } + + /* drbd_adm_prepare made sure already + * that first_peer_device(device)->connection and device->vnr match the request. */ + if (adm_ctx.device) { + if (info->nlhdr->nlmsg_flags & NLM_F_EXCL) + retcode = ERR_MINOR_OR_VOLUME_EXISTS; + /* else: still NO_ERROR */ + goto out; + } + + mutex_lock(&adm_ctx.resource->adm_mutex); + retcode = drbd_create_device(&adm_ctx, dh->minor); + mutex_unlock(&adm_ctx.resource->adm_mutex); +out: + drbd_adm_finish(&adm_ctx, info, retcode); + return 0; +} + +static enum drbd_ret_code adm_del_minor(struct drbd_device *device) +{ + if (device->state.disk == D_DISKLESS && + /* no need to be device->state.conn == C_STANDALONE && + * we may want to delete a minor from a live replication group. + */ + device->state.role == R_SECONDARY) { + _drbd_request_state(device, NS(conn, C_WF_REPORT_PARAMS), + CS_VERBOSE + CS_WAIT_COMPLETE); + drbd_delete_device(device); + return NO_ERROR; + } else + return ERR_MINOR_CONFIGURED; +} + +int drbd_adm_del_minor(struct sk_buff *skb, struct genl_info *info) +{ + struct drbd_config_context adm_ctx; + enum drbd_ret_code retcode; + + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); + if (!adm_ctx.reply_skb) + return retcode; + if (retcode != NO_ERROR) + goto out; + + mutex_lock(&adm_ctx.resource->adm_mutex); + retcode = adm_del_minor(adm_ctx.device); + mutex_unlock(&adm_ctx.resource->adm_mutex); +out: + drbd_adm_finish(&adm_ctx, info, retcode); + return 0; +} + +static int adm_del_resource(struct drbd_resource *resource) +{ + struct drbd_connection *connection; + + for_each_connection(connection, resource) { + if (connection->cstate > C_STANDALONE) + return ERR_NET_CONFIGURED; + } + if (!idr_is_empty(&resource->devices)) + return ERR_RES_IN_USE; + + list_del_rcu(&resource->resources); + /* Make sure all threads have actually stopped: state handling only + * does drbd_thread_stop_nowait(). 
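+ * After the list_del_rcu() above, the synchronize_rcu() below ensures
+ * that concurrent for_each_resource_rcu() walkers are done with this
+ * resource before it is freed.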
*/ + list_for_each_entry(connection, &resource->connections, connections) + drbd_thread_stop(&connection->worker); + synchronize_rcu(); + drbd_free_resource(resource); + return NO_ERROR; +} + +int drbd_adm_down(struct sk_buff *skb, struct genl_info *info) +{ + struct drbd_config_context adm_ctx; + struct drbd_resource *resource; + struct drbd_connection *connection; + struct drbd_device *device; + int retcode; /* enum drbd_ret_code rsp. enum drbd_state_rv */ + unsigned i; + + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_RESOURCE); + if (!adm_ctx.reply_skb) + return retcode; + if (retcode != NO_ERROR) + goto finish; + + resource = adm_ctx.resource; + mutex_lock(&resource->adm_mutex); + /* demote */ + for_each_connection(connection, resource) { + struct drbd_peer_device *peer_device; + + idr_for_each_entry(&connection->peer_devices, peer_device, i) { + retcode = drbd_set_role(peer_device->device, R_SECONDARY, 0); + if (retcode < SS_SUCCESS) { + drbd_msg_put_info(adm_ctx.reply_skb, "failed to demote"); + goto out; + } + } + + retcode = conn_try_disconnect(connection, 0); + if (retcode < SS_SUCCESS) { + drbd_msg_put_info(adm_ctx.reply_skb, "failed to disconnect"); + goto out; + } + } + + /* detach */ + idr_for_each_entry(&resource->devices, device, i) { + retcode = adm_detach(device, 0); + if (retcode < SS_SUCCESS || retcode > NO_ERROR) { + drbd_msg_put_info(adm_ctx.reply_skb, "failed to detach"); + goto out; + } + } + + /* delete volumes */ + idr_for_each_entry(&resource->devices, device, i) { + retcode = adm_del_minor(device); + if (retcode != NO_ERROR) { + /* "can not happen" */ + drbd_msg_put_info(adm_ctx.reply_skb, "failed to delete volume"); + goto out; + } + } + + retcode = adm_del_resource(resource); +out: + mutex_unlock(&resource->adm_mutex); +finish: + drbd_adm_finish(&adm_ctx, info, retcode); + return 0; +} + +int drbd_adm_del_resource(struct sk_buff *skb, struct genl_info *info) +{ + struct drbd_config_context adm_ctx; + struct drbd_resource *resource; + enum drbd_ret_code retcode; + + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_RESOURCE); + if (!adm_ctx.reply_skb) + return retcode; + if (retcode != NO_ERROR) + goto finish; + resource = adm_ctx.resource; + + mutex_lock(&resource->adm_mutex); + retcode = adm_del_resource(resource); + mutex_unlock(&resource->adm_mutex); +finish: + drbd_adm_finish(&adm_ctx, info, retcode); + return 0; +} + +void drbd_bcast_event(struct drbd_device *device, const struct sib_info *sib) +{ + static atomic_t drbd_genl_seq = ATOMIC_INIT(2); /* two. */ + struct sk_buff *msg; + struct drbd_genlmsghdr *d_out; + unsigned seq; + int err = -ENOMEM; + + seq = atomic_inc_return(&drbd_genl_seq); + msg = genlmsg_new(NLMSG_GOODSIZE, GFP_NOIO); + if (!msg) + goto failed; + + err = -EMSGSIZE; + d_out = genlmsg_put(msg, 0, seq, &drbd_genl_family, 0, DRBD_EVENT); + if (!d_out) /* cannot happen, but anyways. */ + goto nla_put_failure; + d_out->minor = device_to_minor(device); + d_out->ret_code = NO_ERROR; + + if (nla_put_status_info(msg, device, sib)) + goto nla_put_failure; + genlmsg_end(msg, d_out); + err = drbd_genl_multicast_events(msg, 0); + /* msg has been consumed or freed in netlink_broadcast() */ + if (err && err != -ESRCH) + goto failed; + + return; + +nla_put_failure: + nlmsg_free(msg); +failed: + drbd_err(device, "Error %d while broadcasting event. 
" + "Event seq:%u sib_reason:%u\n", + err, seq, sib->sib_reason); +} diff --git a/drivers/block/drbd/drbd_nla.c b/drivers/block/drbd/drbd_nla.c new file mode 100644 index 000000000..b2d479149 --- /dev/null +++ b/drivers/block/drbd/drbd_nla.c @@ -0,0 +1,54 @@ +#include <linux/kernel.h> +#include <net/netlink.h> +#include <linux/drbd_genl_api.h> +#include "drbd_nla.h" + +static int drbd_nla_check_mandatory(int maxtype, struct nlattr *nla) +{ + struct nlattr *head = nla_data(nla); + int len = nla_len(nla); + int rem; + + /* + * validate_nla (called from nla_parse_nested) ignores attributes + * beyond maxtype, and does not understand the DRBD_GENLA_F_MANDATORY flag. + * In order to have it validate attributes with the DRBD_GENLA_F_MANDATORY + * flag set also, check and remove that flag before calling + * nla_parse_nested. + */ + + nla_for_each_attr(nla, head, len, rem) { + if (nla->nla_type & DRBD_GENLA_F_MANDATORY) { + nla->nla_type &= ~DRBD_GENLA_F_MANDATORY; + if (nla_type(nla) > maxtype) + return -EOPNOTSUPP; + } + } + return 0; +} + +int drbd_nla_parse_nested(struct nlattr *tb[], int maxtype, struct nlattr *nla, + const struct nla_policy *policy) +{ + int err; + + err = drbd_nla_check_mandatory(maxtype, nla); + if (!err) + err = nla_parse_nested(tb, maxtype, nla, policy); + + return err; +} + +struct nlattr *drbd_nla_find_nested(int maxtype, struct nlattr *nla, int attrtype) +{ + int err; + /* + * If any nested attribute has the DRBD_GENLA_F_MANDATORY flag set and + * we don't know about that attribute, reject all the nested + * attributes. + */ + err = drbd_nla_check_mandatory(maxtype, nla); + if (err) + return ERR_PTR(err); + return nla_find_nested(nla, attrtype); +} diff --git a/drivers/block/drbd/drbd_nla.h b/drivers/block/drbd/drbd_nla.h new file mode 100644 index 000000000..679c2d5b4 --- /dev/null +++ b/drivers/block/drbd/drbd_nla.h @@ -0,0 +1,8 @@ +#ifndef __DRBD_NLA_H +#define __DRBD_NLA_H + +extern int drbd_nla_parse_nested(struct nlattr *tb[], int maxtype, struct nlattr *nla, + const struct nla_policy *policy); +extern struct nlattr *drbd_nla_find_nested(int maxtype, struct nlattr *nla, int attrtype); + +#endif /* __DRBD_NLA_H */ diff --git a/drivers/block/drbd/drbd_proc.c b/drivers/block/drbd/drbd_proc.c new file mode 100644 index 000000000..3b10fa6cb --- /dev/null +++ b/drivers/block/drbd/drbd_proc.c @@ -0,0 +1,368 @@ +/* + drbd_proc.c + + This file is part of DRBD by Philipp Reisner and Lars Ellenberg. + + Copyright (C) 2001-2008, LINBIT Information Technologies GmbH. + Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>. + Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. + + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 
+ + */ + +#include <linux/module.h> + +#include <asm/uaccess.h> +#include <linux/fs.h> +#include <linux/file.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <linux/drbd.h> +#include "drbd_int.h" + +static int drbd_proc_open(struct inode *inode, struct file *file); +static int drbd_proc_release(struct inode *inode, struct file *file); + + +struct proc_dir_entry *drbd_proc; +const struct file_operations drbd_proc_fops = { + .owner = THIS_MODULE, + .open = drbd_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = drbd_proc_release, +}; + +static void seq_printf_with_thousands_grouping(struct seq_file *seq, long v) +{ + /* v is in kB/sec. We don't expect TiByte/sec yet. */ + if (unlikely(v >= 1000000)) { + /* cool: > GiByte/s */ + seq_printf(seq, "%ld,", v / 1000000); + v %= 1000000; + seq_printf(seq, "%03ld,%03ld", v/1000, v % 1000); + } else if (likely(v >= 1000)) + seq_printf(seq, "%ld,%03ld", v/1000, v % 1000); + else + seq_printf(seq, "%ld", v); +} + +static void drbd_get_syncer_progress(struct drbd_device *device, + union drbd_dev_state state, unsigned long *rs_total, + unsigned long *bits_left, unsigned int *per_mil_done) +{ + /* this is to break it at compile time when we change that, in case we + * want to support more than (1<<32) bits on a 32bit arch. */ + typecheck(unsigned long, device->rs_total); + *rs_total = device->rs_total; + + /* note: both rs_total and rs_left are in bits, i.e. in + * units of BM_BLOCK_SIZE. + * for the percentage, we don't care. */ + + if (state.conn == C_VERIFY_S || state.conn == C_VERIFY_T) + *bits_left = device->ov_left; + else + *bits_left = drbd_bm_total_weight(device) - device->rs_failed; + /* >> 10 to prevent overflow, + * +1 to prevent division by zero */ + if (*bits_left > *rs_total) { + /* D'oh. Maybe a logic bug somewhere. More likely just a race + * between state change and reset of rs_total. + */ + *bits_left = *rs_total; + *per_mil_done = *rs_total ? 0 : 1000; + } else { + /* Make sure the division happens in long context. + * We allow up to one petabyte storage right now, + * at a granularity of 4k per bit that is 2**38 bits. + * After shift right and multiplication by 1000, + * this should still fit easily into a 32bit long, + * so we don't need a 64bit division on 32bit arch. + * Note: currently we don't support such large bitmaps on 32bit + * arch anyways, but no harm done to be prepared for it here. + */ + unsigned int shift = *rs_total > UINT_MAX ? 16 : 10; + unsigned long left = *bits_left >> shift; + unsigned long total = 1UL + (*rs_total >> shift); + unsigned long tmp = 1000UL - left * 1000UL/total; + *per_mil_done = tmp; + } +} + + +/*lge + * progress bars shamelessly adapted from driver/md/md.c + * output looks like + * [=====>..............] 
33.5% (23456/123456) + * finish: 2:20:20 speed: 6,345 (6,456) K/sec + */ +static void drbd_syncer_progress(struct drbd_device *device, struct seq_file *seq, + union drbd_dev_state state) +{ + unsigned long db, dt, dbdt, rt, rs_total, rs_left; + unsigned int res; + int i, x, y; + int stalled = 0; + + drbd_get_syncer_progress(device, state, &rs_total, &rs_left, &res); + + x = res/50; + y = 20-x; + seq_printf(seq, "\t["); + for (i = 1; i < x; i++) + seq_printf(seq, "="); + seq_printf(seq, ">"); + for (i = 0; i < y; i++) + seq_printf(seq, "."); + seq_printf(seq, "] "); + + if (state.conn == C_VERIFY_S || state.conn == C_VERIFY_T) + seq_printf(seq, "verified:"); + else + seq_printf(seq, "sync'ed:"); + seq_printf(seq, "%3u.%u%% ", res / 10, res % 10); + + /* if more than a few GB, display in MB */ + if (rs_total > (4UL << (30 - BM_BLOCK_SHIFT))) + seq_printf(seq, "(%lu/%lu)M", + (unsigned long) Bit2KB(rs_left >> 10), + (unsigned long) Bit2KB(rs_total >> 10)); + else + seq_printf(seq, "(%lu/%lu)K", + (unsigned long) Bit2KB(rs_left), + (unsigned long) Bit2KB(rs_total)); + + seq_printf(seq, "\n\t"); + + /* see drivers/md/md.c + * We do not want to overflow, so the order of operands and + * the * 100 / 100 trick are important. We do a +1 to be + * safe against division by zero. We only estimate anyway. + * + * dt: time from mark until now + * db: blocks written from mark until now + * rt: remaining time + */ + /* Rolling marks. last_mark+1 may just now be modified. last_mark+2 is + * at least (DRBD_SYNC_MARKS-2)*DRBD_SYNC_MARK_STEP old, and has at + * least DRBD_SYNC_MARK_STEP time before it will be modified. */ + /* ------------------------ ~18s average ------------------------ */ + i = (device->rs_last_mark + 2) % DRBD_SYNC_MARKS; + dt = (jiffies - device->rs_mark_time[i]) / HZ; + if (dt > 180) + stalled = 1; + + if (!dt) + dt++; + db = device->rs_mark_left[i] - rs_left; + rt = (dt * (rs_left / (db/100+1)))/100; /* seconds */ + + seq_printf(seq, "finish: %lu:%02lu:%02lu", + rt / 3600, (rt % 3600) / 60, rt % 60); + + dbdt = Bit2KB(db/dt); + seq_printf(seq, " speed: "); + seq_printf_with_thousands_grouping(seq, dbdt); + seq_printf(seq, " ("); + /* ------------------------- ~3s average ------------------------ */ + if (proc_details >= 1) { + /* this is what drbd_rs_should_slow_down() uses */ + i = (device->rs_last_mark + DRBD_SYNC_MARKS-1) % DRBD_SYNC_MARKS; + dt = (jiffies - device->rs_mark_time[i]) / HZ; + if (!dt) + dt++; + db = device->rs_mark_left[i] - rs_left; + dbdt = Bit2KB(db/dt); + seq_printf_with_thousands_grouping(seq, dbdt); + seq_printf(seq, " -- "); + } + + /* --------------------- long term average ---------------------- */ + /* mean speed since syncer started + * we do account for PausedSync periods */ + dt = (jiffies - device->rs_start - device->rs_paused) / HZ; + if (dt == 0) + dt = 1; + db = rs_total - rs_left; + dbdt = Bit2KB(db/dt); + seq_printf_with_thousands_grouping(seq, dbdt); + seq_printf(seq, ")"); + + if (state.conn == C_SYNC_TARGET || + state.conn == C_VERIFY_S) { + seq_printf(seq, " want: "); + seq_printf_with_thousands_grouping(seq, device->c_sync_rate); + } + seq_printf(seq, " K/sec%s\n", stalled ? " (stalled)" : ""); + + if (proc_details >= 1) { + /* 64 bit: + * we convert to sectors in the display below. 
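+ * One bitmap bit covers BM_BLOCK_SIZE of storage, i.e. BM_SECT_PER_BIT
+ * sectors, which is what the conversion below multiplies by.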
*/ + unsigned long bm_bits = drbd_bm_bits(device); + unsigned long bit_pos; + unsigned long long stop_sector = 0; + if (state.conn == C_VERIFY_S || + state.conn == C_VERIFY_T) { + bit_pos = bm_bits - device->ov_left; + if (verify_can_do_stop_sector(device)) + stop_sector = device->ov_stop_sector; + } else + bit_pos = device->bm_resync_fo; + /* Total sectors may be slightly off for oddly + * sized devices. So what. */ + seq_printf(seq, + "\t%3d%% sector pos: %llu/%llu", + (int)(bit_pos / (bm_bits/100+1)), + (unsigned long long)bit_pos * BM_SECT_PER_BIT, + (unsigned long long)bm_bits * BM_SECT_PER_BIT); + if (stop_sector != 0 && stop_sector != ULLONG_MAX) + seq_printf(seq, " stop sector: %llu", stop_sector); + seq_printf(seq, "\n"); + } +} + +static int drbd_seq_show(struct seq_file *seq, void *v) +{ + int i, prev_i = -1; + const char *sn; + struct drbd_device *device; + struct net_conf *nc; + union drbd_dev_state state; + char wp; + + static char write_ordering_chars[] = { + [WO_none] = 'n', + [WO_drain_io] = 'd', + [WO_bdev_flush] = 'f', + }; + + seq_printf(seq, "version: " REL_VERSION " (api:%d/proto:%d-%d)\n%s\n", + API_VERSION, PRO_VERSION_MIN, PRO_VERSION_MAX, drbd_buildtag()); + + /* + cs .. connection state + ro .. node role (local/remote) + ds .. disk state (local/remote) + protocol + various flags + ns .. network send + nr .. network receive + dw .. disk write + dr .. disk read + al .. activity log write count + bm .. bitmap update write count + pe .. pending (waiting for ack or data reply) + ua .. unack'd (still need to send ack or data reply) + ap .. application requests accepted, but not yet completed + ep .. number of epochs currently "on the fly", P_BARRIER_ACK pending + wo .. write ordering mode currently in use + oos .. known out-of-sync kB + */ + + rcu_read_lock(); + idr_for_each_entry(&drbd_devices, device, i) { + if (prev_i != i - 1) + seq_printf(seq, "\n"); + prev_i = i; + + state = device->state; + sn = drbd_conn_str(state.conn); + + if (state.conn == C_STANDALONE && + state.disk == D_DISKLESS && + state.role == R_SECONDARY) { + seq_printf(seq, "%2d: cs:Unconfigured\n", i); + } else { + /* reset device->congestion_reason */ + bdi_rw_congested(&device->rq_queue->backing_dev_info); + + nc = rcu_dereference(first_peer_device(device)->connection->net_conf); + wp = nc ? nc->wire_protocol - DRBD_PROT_A + 'A' : ' '; + seq_printf(seq, + "%2d: cs:%s ro:%s/%s ds:%s/%s %c %c%c%c%c%c%c\n" + " ns:%u nr:%u dw:%u dr:%u al:%u bm:%u " + "lo:%d pe:%d ua:%d ap:%d ep:%d wo:%c", + i, sn, + drbd_role_str(state.role), + drbd_role_str(state.peer), + drbd_disk_str(state.disk), + drbd_disk_str(state.pdsk), + wp, + drbd_suspended(device) ? 's' : 'r', + state.aftr_isp ? 'a' : '-', + state.peer_isp ? 'p' : '-', + state.user_isp ? 'u' : '-', + device->congestion_reason ?: '-', + test_bit(AL_SUSPENDED, &device->flags) ? 
's' : '-', + device->send_cnt/2, + device->recv_cnt/2, + device->writ_cnt/2, + device->read_cnt/2, + device->al_writ_cnt, + device->bm_writ_cnt, + atomic_read(&device->local_cnt), + atomic_read(&device->ap_pending_cnt) + + atomic_read(&device->rs_pending_cnt), + atomic_read(&device->unacked_cnt), + atomic_read(&device->ap_bio_cnt), + first_peer_device(device)->connection->epochs, + write_ordering_chars[device->resource->write_ordering] + ); + seq_printf(seq, " oos:%llu\n", + Bit2KB((unsigned long long) + drbd_bm_total_weight(device))); + } + if (state.conn == C_SYNC_SOURCE || + state.conn == C_SYNC_TARGET || + state.conn == C_VERIFY_S || + state.conn == C_VERIFY_T) + drbd_syncer_progress(device, seq, state); + + if (proc_details >= 1 && get_ldev_if_state(device, D_FAILED)) { + lc_seq_printf_stats(seq, device->resync); + lc_seq_printf_stats(seq, device->act_log); + put_ldev(device); + } + + if (proc_details >= 2) + seq_printf(seq, "\tblocked on activity log: %d\n", atomic_read(&device->ap_actlog_cnt)); + } + rcu_read_unlock(); + + return 0; +} + +static int drbd_proc_open(struct inode *inode, struct file *file) +{ + int err; + + if (try_module_get(THIS_MODULE)) { + err = single_open(file, drbd_seq_show, NULL); + if (err) + module_put(THIS_MODULE); + return err; + } + return -ENODEV; +} + +static int drbd_proc_release(struct inode *inode, struct file *file) +{ + module_put(THIS_MODULE); + return single_release(inode, file); +} + +/* PROC FS stuff end */ diff --git a/drivers/block/drbd/drbd_protocol.h b/drivers/block/drbd/drbd_protocol.h new file mode 100644 index 000000000..2da9104a3 --- /dev/null +++ b/drivers/block/drbd/drbd_protocol.h @@ -0,0 +1,307 @@ +#ifndef __DRBD_PROTOCOL_H +#define __DRBD_PROTOCOL_H + +enum drbd_packet { + /* receiver (data socket) */ + P_DATA = 0x00, + P_DATA_REPLY = 0x01, /* Response to P_DATA_REQUEST */ + P_RS_DATA_REPLY = 0x02, /* Response to P_RS_DATA_REQUEST */ + P_BARRIER = 0x03, + P_BITMAP = 0x04, + P_BECOME_SYNC_TARGET = 0x05, + P_BECOME_SYNC_SOURCE = 0x06, + P_UNPLUG_REMOTE = 0x07, /* Used at various times to hint the peer */ + P_DATA_REQUEST = 0x08, /* Used to ask for a data block */ + P_RS_DATA_REQUEST = 0x09, /* Used to ask for a data block for resync */ + P_SYNC_PARAM = 0x0a, + P_PROTOCOL = 0x0b, + P_UUIDS = 0x0c, + P_SIZES = 0x0d, + P_STATE = 0x0e, + P_SYNC_UUID = 0x0f, + P_AUTH_CHALLENGE = 0x10, + P_AUTH_RESPONSE = 0x11, + P_STATE_CHG_REQ = 0x12, + + /* asender (meta socket */ + P_PING = 0x13, + P_PING_ACK = 0x14, + P_RECV_ACK = 0x15, /* Used in protocol B */ + P_WRITE_ACK = 0x16, /* Used in protocol C */ + P_RS_WRITE_ACK = 0x17, /* Is a P_WRITE_ACK, additionally call set_in_sync(). */ + P_SUPERSEDED = 0x18, /* Used in proto C, two-primaries conflict detection */ + P_NEG_ACK = 0x19, /* Sent if local disk is unusable */ + P_NEG_DREPLY = 0x1a, /* Local disk is broken... */ + P_NEG_RS_DREPLY = 0x1b, /* Local disk is broken... 
*/ + P_BARRIER_ACK = 0x1c, + P_STATE_CHG_REPLY = 0x1d, + + /* "new" commands, no longer fitting into the ordering scheme above */ + + P_OV_REQUEST = 0x1e, /* data socket */ + P_OV_REPLY = 0x1f, + P_OV_RESULT = 0x20, /* meta socket */ + P_CSUM_RS_REQUEST = 0x21, /* data socket */ + P_RS_IS_IN_SYNC = 0x22, /* meta socket */ + P_SYNC_PARAM89 = 0x23, /* data socket, protocol version 89 replacement for P_SYNC_PARAM */ + P_COMPRESSED_BITMAP = 0x24, /* compressed or otherwise encoded bitmap transfer */ + /* P_CKPT_FENCE_REQ = 0x25, * currently reserved for protocol D */ + /* P_CKPT_DISABLE_REQ = 0x26, * currently reserved for protocol D */ + P_DELAY_PROBE = 0x27, /* is used on BOTH sockets */ + P_OUT_OF_SYNC = 0x28, /* Mark as out of sync (Outrunning), data socket */ + P_RS_CANCEL = 0x29, /* meta: Used to cancel RS_DATA_REQUEST packet by SyncSource */ + P_CONN_ST_CHG_REQ = 0x2a, /* data sock: Connection wide state request */ + P_CONN_ST_CHG_REPLY = 0x2b, /* meta sock: Connection side state req reply */ + P_RETRY_WRITE = 0x2c, /* Protocol C: retry conflicting write request */ + P_PROTOCOL_UPDATE = 0x2d, /* data sock: is used in established connections */ + /* 0x2e to 0x30 reserved, used in drbd 9 */ + + /* REQ_DISCARD. We used "discard" in different contexts before, + * which is why I chose TRIM here, to disambiguate. */ + P_TRIM = 0x31, + + P_MAY_IGNORE = 0x100, /* Flag to test if (cmd > P_MAY_IGNORE) ... */ + P_MAX_OPT_CMD = 0x101, + + /* special command ids for handshake */ + + P_INITIAL_META = 0xfff1, /* First Packet on the MetaSock */ + P_INITIAL_DATA = 0xfff2, /* First Packet on the Socket */ + + P_CONNECTION_FEATURES = 0xfffe /* FIXED for the next century! */ +}; + +#ifndef __packed +#define __packed __attribute__((packed)) +#endif + +/* This is the layout for a packet on the wire. + * The byteorder is the network byte order. + * (except block_id and barrier fields. + * these are pointers to local structs + * and have no relevance for the partner, + * which just echoes them as received.) + * + * NOTE that the payload starts at a long aligned offset, + * regardless of 32 or 64 bit arch! 
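+ *
+ * For example, the classic 8 byte header (struct p_header80 below) is
+ *   u32 magic | u16 command | u16 length
+ * followed by `length` bytes of payload, while struct p_header100 extends
+ * this to 16 bytes (magic, volume, command, length, pad), so the payload
+ * starts 8-byte aligned in either case.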
+ */ +struct p_header80 { + u32 magic; + u16 command; + u16 length; /* bytes of data after this header */ +} __packed; + +/* Header for big packets, Used for data packets exceeding 64kB */ +struct p_header95 { + u16 magic; /* use DRBD_MAGIC_BIG here */ + u16 command; + u32 length; +} __packed; + +struct p_header100 { + u32 magic; + u16 volume; + u16 command; + u32 length; + u32 pad; +} __packed; + +/* these defines must not be changed without changing the protocol version */ +#define DP_HARDBARRIER 1 /* depricated */ +#define DP_RW_SYNC 2 /* equals REQ_SYNC */ +#define DP_MAY_SET_IN_SYNC 4 +#define DP_UNPLUG 8 /* not used anymore */ +#define DP_FUA 16 /* equals REQ_FUA */ +#define DP_FLUSH 32 /* equals REQ_FLUSH */ +#define DP_DISCARD 64 /* equals REQ_DISCARD */ +#define DP_SEND_RECEIVE_ACK 128 /* This is a proto B write request */ +#define DP_SEND_WRITE_ACK 256 /* This is a proto C write request */ + +struct p_data { + u64 sector; /* 64 bits sector number */ + u64 block_id; /* to identify the request in protocol B&C */ + u32 seq_num; + u32 dp_flags; +} __packed; + +struct p_trim { + struct p_data p_data; + u32 size; /* == bio->bi_size */ +} __packed; + +/* + * commands which share a struct: + * p_block_ack: + * P_RECV_ACK (proto B), P_WRITE_ACK (proto C), + * P_SUPERSEDED (proto C, two-primaries conflict detection) + * p_block_req: + * P_DATA_REQUEST, P_RS_DATA_REQUEST + */ +struct p_block_ack { + u64 sector; + u64 block_id; + u32 blksize; + u32 seq_num; +} __packed; + +struct p_block_req { + u64 sector; + u64 block_id; + u32 blksize; + u32 pad; /* to multiple of 8 Byte */ +} __packed; + +/* + * commands with their own struct for additional fields: + * P_CONNECTION_FEATURES + * P_BARRIER + * P_BARRIER_ACK + * P_SYNC_PARAM + * ReportParams + */ + +#define FF_TRIM 1 + +struct p_connection_features { + u32 protocol_min; + u32 feature_flags; + u32 protocol_max; + + /* should be more than enough for future enhancements + * for now, feature_flags and the reserved array shall be zero. + */ + + u32 _pad; + u64 reserved[7]; +} __packed; + +struct p_barrier { + u32 barrier; /* barrier number _handle_ only */ + u32 pad; /* to multiple of 8 Byte */ +} __packed; + +struct p_barrier_ack { + u32 barrier; + u32 set_size; +} __packed; + +struct p_rs_param { + u32 resync_rate; + + /* Since protocol version 88 and higher. */ + char verify_alg[0]; +} __packed; + +struct p_rs_param_89 { + u32 resync_rate; + /* protocol version 89: */ + char verify_alg[SHARED_SECRET_MAX]; + char csums_alg[SHARED_SECRET_MAX]; +} __packed; + +struct p_rs_param_95 { + u32 resync_rate; + char verify_alg[SHARED_SECRET_MAX]; + char csums_alg[SHARED_SECRET_MAX]; + u32 c_plan_ahead; + u32 c_delay_target; + u32 c_fill_target; + u32 c_max_rate; +} __packed; + +enum drbd_conn_flags { + CF_DISCARD_MY_DATA = 1, + CF_DRY_RUN = 2, +}; + +struct p_protocol { + u32 protocol; + u32 after_sb_0p; + u32 after_sb_1p; + u32 after_sb_2p; + u32 conn_flags; + u32 two_primaries; + + /* Since protocol version 87 and higher. */ + char integrity_alg[0]; + +} __packed; + +struct p_uuids { + u64 uuid[UI_EXTENDED_SIZE]; +} __packed; + +struct p_rs_uuid { + u64 uuid; +} __packed; + +struct p_sizes { + u64 d_size; /* size of disk */ + u64 u_size; /* user requested size */ + u64 c_size; /* current exported size */ + u32 max_bio_size; /* Maximal size of a BIO */ + u16 queue_order_type; /* not yet implemented in DRBD*/ + u16 dds_flags; /* use enum dds_flags here. 
*/ +} __packed; + +struct p_state { + u32 state; +} __packed; + +struct p_req_state { + u32 mask; + u32 val; +} __packed; + +struct p_req_state_reply { + u32 retcode; +} __packed; + +struct p_drbd06_param { + u64 size; + u32 state; + u32 blksize; + u32 protocol; + u32 version; + u32 gen_cnt[5]; + u32 bit_map_gen[5]; +} __packed; + +struct p_block_desc { + u64 sector; + u32 blksize; + u32 pad; /* to multiple of 8 Byte */ +} __packed; + +/* Valid values for the encoding field. + * Bump proto version when changing this. */ +enum drbd_bitmap_code { + /* RLE_VLI_Bytes = 0, + * and other bit variants had been defined during + * algorithm evaluation. */ + RLE_VLI_Bits = 2, +}; + +struct p_compressed_bm { + /* (encoding & 0x0f): actual encoding, see enum drbd_bitmap_code + * (encoding & 0x80): polarity (set/unset) of first runlength + * ((encoding >> 4) & 0x07): pad_bits, number of trailing zero bits + * used to pad up to head.length bytes + */ + u8 encoding; + + u8 code[0]; +} __packed; + +struct p_delay_probe93 { + u32 seq_num; /* sequence number to match the two probe packets */ + u32 offset; /* usecs the probe got sent after the reference time point */ +} __packed; + +/* + * Bitmap packets need to fit within a single page on the sender and receiver, + * so we are limited to 4 KiB (and not to PAGE_SIZE, which can be bigger). + */ +#define DRBD_SOCKET_BUFFER_SIZE 4096 + +#endif /* __DRBD_PROTOCOL_H */ diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c new file mode 100644 index 000000000..cee20354a --- /dev/null +++ b/drivers/block/drbd/drbd_receiver.c @@ -0,0 +1,5661 @@ +/* + drbd_receiver.c + + This file is part of DRBD by Philipp Reisner and Lars Ellenberg. + + Copyright (C) 2001-2008, LINBIT Information Technologies GmbH. + Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>. + Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. + + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + + +#include <linux/module.h> + +#include <asm/uaccess.h> +#include <net/sock.h> + +#include <linux/drbd.h> +#include <linux/fs.h> +#include <linux/file.h> +#include <linux/in.h> +#include <linux/mm.h> +#include <linux/memcontrol.h> +#include <linux/mm_inline.h> +#include <linux/slab.h> +#include <linux/pkt_sched.h> +#define __KERNEL_SYSCALLS__ +#include <linux/unistd.h> +#include <linux/vmalloc.h> +#include <linux/random.h> +#include <linux/string.h> +#include <linux/scatterlist.h> +#include "drbd_int.h" +#include "drbd_protocol.h" +#include "drbd_req.h" +#include "drbd_vli.h" + +#define PRO_FEATURES (FF_TRIM) + +struct packet_info { + enum drbd_packet cmd; + unsigned int size; + unsigned int vnr; + void *data; +}; + +enum finish_epoch { + FE_STILL_LIVE, + FE_DESTROYED, + FE_RECYCLED, +}; + +static int drbd_do_features(struct drbd_connection *connection); +static int drbd_do_auth(struct drbd_connection *connection); +static int drbd_disconnected(struct drbd_peer_device *); +static void conn_wait_active_ee_empty(struct drbd_connection *connection); +static enum finish_epoch drbd_may_finish_epoch(struct drbd_connection *, struct drbd_epoch *, enum epoch_event); +static int e_end_block(struct drbd_work *, int); + + +#define GFP_TRY (__GFP_HIGHMEM | __GFP_NOWARN) + +/* + * some helper functions to deal with single linked page lists, + * page->private being our "next" pointer. + */ + +/* If at least n pages are linked at head, get n pages off. + * Otherwise, don't modify head, and return NULL. + * Locking is the responsibility of the caller. + */ +static struct page *page_chain_del(struct page **head, int n) +{ + struct page *page; + struct page *tmp; + + BUG_ON(!n); + BUG_ON(!head); + + page = *head; + + if (!page) + return NULL; + + while (page) { + tmp = page_chain_next(page); + if (--n == 0) + break; /* found sufficient pages */ + if (tmp == NULL) + /* insufficient pages, don't use any of them. */ + return NULL; + page = tmp; + } + + /* add end of list marker for the returned list */ + set_page_private(page, 0); + /* actual return value, and adjustment of head */ + page = *head; + *head = tmp; + return page; +} + +/* may be used outside of locks to find the tail of a (usually short) + * "private" page chain, before adding it back to a global chain head + * with page_chain_add() under a spinlock. */ +static struct page *page_chain_tail(struct page *page, int *len) +{ + struct page *tmp; + int i = 1; + while ((tmp = page_chain_next(page))) + ++i, page = tmp; + if (len) + *len = i; + return page; +} + +static int page_chain_free(struct page *page) +{ + struct page *tmp; + int i = 0; + page_chain_for_each_safe(page, tmp) { + put_page(page); + ++i; + } + return i; +} + +static void page_chain_add(struct page **head, + struct page *chain_first, struct page *chain_last) +{ +#if 1 + struct page *tmp; + tmp = page_chain_tail(chain_first, NULL); + BUG_ON(tmp != chain_last); +#endif + + /* add chain to head */ + set_page_private(chain_last, (unsigned long)*head); + *head = chain_first; +} + +static struct page *__drbd_alloc_pages(struct drbd_device *device, + unsigned int number) +{ + struct page *page = NULL; + struct page *tmp = NULL; + unsigned int i = 0; + + /* Yes, testing drbd_pp_vacant outside the lock is racy. + * So what. It saves a spin_lock. 
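+ * Worst case we either skip the pool below although pages had just
+ * become available, or we take the lock and find the pool drained
+ * after all; both cases are handled.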
*/ + if (drbd_pp_vacant >= number) { + spin_lock(&drbd_pp_lock); + page = page_chain_del(&drbd_pp_pool, number); + if (page) + drbd_pp_vacant -= number; + spin_unlock(&drbd_pp_lock); + if (page) + return page; + } + + /* GFP_TRY, because we must not cause arbitrary write-out: in a DRBD + * "criss-cross" setup, that might cause write-out on some other DRBD, + * which in turn might block on the other node at this very place. */ + for (i = 0; i < number; i++) { + tmp = alloc_page(GFP_TRY); + if (!tmp) + break; + set_page_private(tmp, (unsigned long)page); + page = tmp; + } + + if (i == number) + return page; + + /* Not enough pages immediately available this time. + * No need to jump around here, drbd_alloc_pages will retry this + * function "soon". */ + if (page) { + tmp = page_chain_tail(page, NULL); + spin_lock(&drbd_pp_lock); + page_chain_add(&drbd_pp_pool, page, tmp); + drbd_pp_vacant += i; + spin_unlock(&drbd_pp_lock); + } + return NULL; +} + +static void reclaim_finished_net_peer_reqs(struct drbd_device *device, + struct list_head *to_be_freed) +{ + struct drbd_peer_request *peer_req, *tmp; + + /* The EEs are always appended to the end of the list. Since + they are sent in order over the wire, they have to finish + in order. As soon as we see the first not finished we can + stop to examine the list... */ + + list_for_each_entry_safe(peer_req, tmp, &device->net_ee, w.list) { + if (drbd_peer_req_has_active_page(peer_req)) + break; + list_move(&peer_req->w.list, to_be_freed); + } +} + +static void drbd_kick_lo_and_reclaim_net(struct drbd_device *device) +{ + LIST_HEAD(reclaimed); + struct drbd_peer_request *peer_req, *t; + + spin_lock_irq(&device->resource->req_lock); + reclaim_finished_net_peer_reqs(device, &reclaimed); + spin_unlock_irq(&device->resource->req_lock); + + list_for_each_entry_safe(peer_req, t, &reclaimed, w.list) + drbd_free_net_peer_req(device, peer_req); +} + +/** + * drbd_alloc_pages() - Returns @number pages, retries forever (or until signalled) + * @device: DRBD device. + * @number: number of pages requested + * @retry: whether to retry, if not enough pages are available right now + * + * Tries to allocate number pages, first from our own page pool, then from + * the kernel. + * Possibly retry until DRBD frees sufficient pages somewhere else. + * + * If this allocation would exceed the max_buffers setting, we throttle + * allocation (schedule_timeout) to give the system some room to breathe. + * + * We do not use max-buffers as hard limit, because it could lead to + * congestion and further to a distributed deadlock during online-verify or + * (checksum based) resync, if the max-buffers, socket buffer sizes and + * resync-rate settings are mis-configured. + * + * Returns a page chain linked via page->private. + */ +struct page *drbd_alloc_pages(struct drbd_peer_device *peer_device, unsigned int number, + bool retry) +{ + struct drbd_device *device = peer_device->device; + struct page *page = NULL; + struct net_conf *nc; + DEFINE_WAIT(wait); + unsigned int mxb; + + rcu_read_lock(); + nc = rcu_dereference(peer_device->connection->net_conf); + mxb = nc ? 
nc->max_buffers : 1000000; + rcu_read_unlock(); + + if (atomic_read(&device->pp_in_use) < mxb) + page = __drbd_alloc_pages(device, number); + + while (page == NULL) { + prepare_to_wait(&drbd_pp_wait, &wait, TASK_INTERRUPTIBLE); + + drbd_kick_lo_and_reclaim_net(device); + + if (atomic_read(&device->pp_in_use) < mxb) { + page = __drbd_alloc_pages(device, number); + if (page) + break; + } + + if (!retry) + break; + + if (signal_pending(current)) { + drbd_warn(device, "drbd_alloc_pages interrupted!\n"); + break; + } + + if (schedule_timeout(HZ/10) == 0) + mxb = UINT_MAX; + } + finish_wait(&drbd_pp_wait, &wait); + + if (page) + atomic_add(number, &device->pp_in_use); + return page; +} + +/* Must not be used from irq, as that may deadlock: see drbd_alloc_pages. + * Is also used from inside an other spin_lock_irq(&resource->req_lock); + * Either links the page chain back to the global pool, + * or returns all pages to the system. */ +static void drbd_free_pages(struct drbd_device *device, struct page *page, int is_net) +{ + atomic_t *a = is_net ? &device->pp_in_use_by_net : &device->pp_in_use; + int i; + + if (page == NULL) + return; + + if (drbd_pp_vacant > (DRBD_MAX_BIO_SIZE/PAGE_SIZE) * minor_count) + i = page_chain_free(page); + else { + struct page *tmp; + tmp = page_chain_tail(page, &i); + spin_lock(&drbd_pp_lock); + page_chain_add(&drbd_pp_pool, page, tmp); + drbd_pp_vacant += i; + spin_unlock(&drbd_pp_lock); + } + i = atomic_sub_return(i, a); + if (i < 0) + drbd_warn(device, "ASSERTION FAILED: %s: %d < 0\n", + is_net ? "pp_in_use_by_net" : "pp_in_use", i); + wake_up(&drbd_pp_wait); +} + +/* +You need to hold the req_lock: + _drbd_wait_ee_list_empty() + +You must not have the req_lock: + drbd_free_peer_req() + drbd_alloc_peer_req() + drbd_free_peer_reqs() + drbd_ee_fix_bhs() + drbd_finish_peer_reqs() + drbd_clear_done_ee() + drbd_wait_ee_list_empty() +*/ + +struct drbd_peer_request * +drbd_alloc_peer_req(struct drbd_peer_device *peer_device, u64 id, sector_t sector, + unsigned int data_size, bool has_payload, gfp_t gfp_mask) __must_hold(local) +{ + struct drbd_device *device = peer_device->device; + struct drbd_peer_request *peer_req; + struct page *page = NULL; + unsigned nr_pages = (data_size + PAGE_SIZE -1) >> PAGE_SHIFT; + + if (drbd_insert_fault(device, DRBD_FAULT_AL_EE)) + return NULL; + + peer_req = mempool_alloc(drbd_ee_mempool, gfp_mask & ~__GFP_HIGHMEM); + if (!peer_req) { + if (!(gfp_mask & __GFP_NOWARN)) + drbd_err(device, "%s: allocation failed\n", __func__); + return NULL; + } + + if (has_payload && data_size) { + page = drbd_alloc_pages(peer_device, nr_pages, (gfp_mask & __GFP_WAIT)); + if (!page) + goto fail; + } + + memset(peer_req, 0, sizeof(*peer_req)); + INIT_LIST_HEAD(&peer_req->w.list); + drbd_clear_interval(&peer_req->i); + peer_req->i.size = data_size; + peer_req->i.sector = sector; + peer_req->submit_jif = jiffies; + peer_req->peer_device = peer_device; + peer_req->pages = page; + /* + * The block_id is opaque to the receiver. It is not endianness + * converted, and sent back to the sender unchanged. 
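+ * (See the wire format note in drbd_protocol.h: block_id is a pointer
+ * to a local struct, which the peer just echoes back as received.)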
+ */ + peer_req->block_id = id; + + return peer_req; + + fail: + mempool_free(peer_req, drbd_ee_mempool); + return NULL; +} + +void __drbd_free_peer_req(struct drbd_device *device, struct drbd_peer_request *peer_req, + int is_net) +{ + might_sleep(); + if (peer_req->flags & EE_HAS_DIGEST) + kfree(peer_req->digest); + drbd_free_pages(device, peer_req->pages, is_net); + D_ASSERT(device, atomic_read(&peer_req->pending_bios) == 0); + D_ASSERT(device, drbd_interval_empty(&peer_req->i)); + if (!expect(!(peer_req->flags & EE_CALL_AL_COMPLETE_IO))) { + peer_req->flags &= ~EE_CALL_AL_COMPLETE_IO; + drbd_al_complete_io(device, &peer_req->i); + } + mempool_free(peer_req, drbd_ee_mempool); +} + +int drbd_free_peer_reqs(struct drbd_device *device, struct list_head *list) +{ + LIST_HEAD(work_list); + struct drbd_peer_request *peer_req, *t; + int count = 0; + int is_net = list == &device->net_ee; + + spin_lock_irq(&device->resource->req_lock); + list_splice_init(list, &work_list); + spin_unlock_irq(&device->resource->req_lock); + + list_for_each_entry_safe(peer_req, t, &work_list, w.list) { + __drbd_free_peer_req(device, peer_req, is_net); + count++; + } + return count; +} + +/* + * See also comments in _req_mod(,BARRIER_ACKED) and receive_Barrier. + */ +static int drbd_finish_peer_reqs(struct drbd_device *device) +{ + LIST_HEAD(work_list); + LIST_HEAD(reclaimed); + struct drbd_peer_request *peer_req, *t; + int err = 0; + + spin_lock_irq(&device->resource->req_lock); + reclaim_finished_net_peer_reqs(device, &reclaimed); + list_splice_init(&device->done_ee, &work_list); + spin_unlock_irq(&device->resource->req_lock); + + list_for_each_entry_safe(peer_req, t, &reclaimed, w.list) + drbd_free_net_peer_req(device, peer_req); + + /* possible callbacks here: + * e_end_block, and e_end_resync_block, e_send_superseded. + * all ignore the last argument. + */ + list_for_each_entry_safe(peer_req, t, &work_list, w.list) { + int err2; + + /* list_del not necessary, next/prev members not touched */ + err2 = peer_req->w.cb(&peer_req->w, !!err); + if (!err) + err = err2; + drbd_free_peer_req(device, peer_req); + } + wake_up(&device->ee_wait); + + return err; +} + +static void _drbd_wait_ee_list_empty(struct drbd_device *device, + struct list_head *head) +{ + DEFINE_WAIT(wait); + + /* avoids spin_lock/unlock + * and calling prepare_to_wait in the fast path */ + while (!list_empty(head)) { + prepare_to_wait(&device->ee_wait, &wait, TASK_UNINTERRUPTIBLE); + spin_unlock_irq(&device->resource->req_lock); + io_schedule(); + finish_wait(&device->ee_wait, &wait); + spin_lock_irq(&device->resource->req_lock); + } +} + +static void drbd_wait_ee_list_empty(struct drbd_device *device, + struct list_head *head) +{ + spin_lock_irq(&device->resource->req_lock); + _drbd_wait_ee_list_empty(device, head); + spin_unlock_irq(&device->resource->req_lock); +} + +static int drbd_recv_short(struct socket *sock, void *buf, size_t size, int flags) +{ + struct kvec iov = { + .iov_base = buf, + .iov_len = size, + }; + struct msghdr msg = { + .msg_flags = (flags ? 
flags : MSG_WAITALL | MSG_NOSIGNAL) + }; + return kernel_recvmsg(sock, &msg, &iov, 1, size, msg.msg_flags); +} + +static int drbd_recv(struct drbd_connection *connection, void *buf, size_t size) +{ + int rv; + + rv = drbd_recv_short(connection->data.socket, buf, size, 0); + + if (rv < 0) { + if (rv == -ECONNRESET) + drbd_info(connection, "sock was reset by peer\n"); + else if (rv != -ERESTARTSYS) + drbd_err(connection, "sock_recvmsg returned %d\n", rv); + } else if (rv == 0) { + if (test_bit(DISCONNECT_SENT, &connection->flags)) { + long t; + rcu_read_lock(); + t = rcu_dereference(connection->net_conf)->ping_timeo * HZ/10; + rcu_read_unlock(); + + t = wait_event_timeout(connection->ping_wait, connection->cstate < C_WF_REPORT_PARAMS, t); + + if (t) + goto out; + } + drbd_info(connection, "sock was shut down by peer\n"); + } + + if (rv != size) + conn_request_state(connection, NS(conn, C_BROKEN_PIPE), CS_HARD); + +out: + return rv; +} + +static int drbd_recv_all(struct drbd_connection *connection, void *buf, size_t size) +{ + int err; + + err = drbd_recv(connection, buf, size); + if (err != size) { + if (err >= 0) + err = -EIO; + } else + err = 0; + return err; +} + +static int drbd_recv_all_warn(struct drbd_connection *connection, void *buf, size_t size) +{ + int err; + + err = drbd_recv_all(connection, buf, size); + if (err && !signal_pending(current)) + drbd_warn(connection, "short read (expected size %d)\n", (int)size); + return err; +} + +/* quoting tcp(7): + * On individual connections, the socket buffer size must be set prior to the + * listen(2) or connect(2) calls in order to have it take effect. + * This is our wrapper to do so. + */ +static void drbd_setbufsize(struct socket *sock, unsigned int snd, + unsigned int rcv) +{ + /* open coded SO_SNDBUF, SO_RCVBUF */ + if (snd) { + sock->sk->sk_sndbuf = snd; + sock->sk->sk_userlocks |= SOCK_SNDBUF_LOCK; + } + if (rcv) { + sock->sk->sk_rcvbuf = rcv; + sock->sk->sk_userlocks |= SOCK_RCVBUF_LOCK; + } +} + +static struct socket *drbd_try_connect(struct drbd_connection *connection) +{ + const char *what; + struct socket *sock; + struct sockaddr_in6 src_in6; + struct sockaddr_in6 peer_in6; + struct net_conf *nc; + int err, peer_addr_len, my_addr_len; + int sndbuf_size, rcvbuf_size, connect_int; + int disconnect_on_error = 1; + + rcu_read_lock(); + nc = rcu_dereference(connection->net_conf); + if (!nc) { + rcu_read_unlock(); + return NULL; + } + sndbuf_size = nc->sndbuf_size; + rcvbuf_size = nc->rcvbuf_size; + connect_int = nc->connect_int; + rcu_read_unlock(); + + my_addr_len = min_t(int, connection->my_addr_len, sizeof(src_in6)); + memcpy(&src_in6, &connection->my_addr, my_addr_len); + + if (((struct sockaddr *)&connection->my_addr)->sa_family == AF_INET6) + src_in6.sin6_port = 0; + else + ((struct sockaddr_in *)&src_in6)->sin_port = 0; /* AF_INET & AF_SCI */ + + peer_addr_len = min_t(int, connection->peer_addr_len, sizeof(src_in6)); + memcpy(&peer_in6, &connection->peer_addr, peer_addr_len); + + what = "sock_create_kern"; + err = sock_create_kern(((struct sockaddr *)&src_in6)->sa_family, + SOCK_STREAM, IPPROTO_TCP, &sock); + if (err < 0) { + sock = NULL; + goto out; + } + + sock->sk->sk_rcvtimeo = + sock->sk->sk_sndtimeo = connect_int * HZ; + drbd_setbufsize(sock, sndbuf_size, rcvbuf_size); + + /* explicitly bind to the configured IP as source IP + * for the outgoing connections. + * This is needed for multihomed hosts and to be + * able to use lo: interfaces for drbd. 
+ * Make sure to use 0 as port number, so linux selects + * a free one dynamically. + */ + what = "bind before connect"; + err = sock->ops->bind(sock, (struct sockaddr *) &src_in6, my_addr_len); + if (err < 0) + goto out; + + /* connect may fail, peer not yet available. + * stay C_WF_CONNECTION, don't go Disconnecting! */ + disconnect_on_error = 0; + what = "connect"; + err = sock->ops->connect(sock, (struct sockaddr *) &peer_in6, peer_addr_len, 0); + +out: + if (err < 0) { + if (sock) { + sock_release(sock); + sock = NULL; + } + switch (-err) { + /* timeout, busy, signal pending */ + case ETIMEDOUT: case EAGAIN: case EINPROGRESS: + case EINTR: case ERESTARTSYS: + /* peer not (yet) available, network problem */ + case ECONNREFUSED: case ENETUNREACH: + case EHOSTDOWN: case EHOSTUNREACH: + disconnect_on_error = 0; + break; + default: + drbd_err(connection, "%s failed, err = %d\n", what, err); + } + if (disconnect_on_error) + conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD); + } + + return sock; +} + +struct accept_wait_data { + struct drbd_connection *connection; + struct socket *s_listen; + struct completion door_bell; + void (*original_sk_state_change)(struct sock *sk); + +}; + +static void drbd_incoming_connection(struct sock *sk) +{ + struct accept_wait_data *ad = sk->sk_user_data; + void (*state_change)(struct sock *sk); + + state_change = ad->original_sk_state_change; + if (sk->sk_state == TCP_ESTABLISHED) + complete(&ad->door_bell); + state_change(sk); +} + +static int prepare_listen_socket(struct drbd_connection *connection, struct accept_wait_data *ad) +{ + int err, sndbuf_size, rcvbuf_size, my_addr_len; + struct sockaddr_in6 my_addr; + struct socket *s_listen; + struct net_conf *nc; + const char *what; + + rcu_read_lock(); + nc = rcu_dereference(connection->net_conf); + if (!nc) { + rcu_read_unlock(); + return -EIO; + } + sndbuf_size = nc->sndbuf_size; + rcvbuf_size = nc->rcvbuf_size; + rcu_read_unlock(); + + my_addr_len = min_t(int, connection->my_addr_len, sizeof(struct sockaddr_in6)); + memcpy(&my_addr, &connection->my_addr, my_addr_len); + + what = "sock_create_kern"; + err = sock_create_kern(((struct sockaddr *)&my_addr)->sa_family, + SOCK_STREAM, IPPROTO_TCP, &s_listen); + if (err) { + s_listen = NULL; + goto out; + } + + s_listen->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */ + drbd_setbufsize(s_listen, sndbuf_size, rcvbuf_size); + + what = "bind before listen"; + err = s_listen->ops->bind(s_listen, (struct sockaddr *)&my_addr, my_addr_len); + if (err < 0) + goto out; + + ad->s_listen = s_listen; + write_lock_bh(&s_listen->sk->sk_callback_lock); + ad->original_sk_state_change = s_listen->sk->sk_state_change; + s_listen->sk->sk_state_change = drbd_incoming_connection; + s_listen->sk->sk_user_data = ad; + write_unlock_bh(&s_listen->sk->sk_callback_lock); + + what = "listen"; + err = s_listen->ops->listen(s_listen, 5); + if (err < 0) + goto out; + + return 0; +out: + if (s_listen) + sock_release(s_listen); + if (err < 0) { + if (err != -EAGAIN && err != -EINTR && err != -ERESTARTSYS) { + drbd_err(connection, "%s failed, err = %d\n", what, err); + conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD); + } + } + + return -EIO; +} + +static void unregister_state_change(struct sock *sk, struct accept_wait_data *ad) +{ + write_lock_bh(&sk->sk_callback_lock); + sk->sk_state_change = ad->original_sk_state_change; + sk->sk_user_data = NULL; + write_unlock_bh(&sk->sk_callback_lock); +} + +static struct socket *drbd_wait_for_connect(struct drbd_connection 
*connection, struct accept_wait_data *ad) +{ + int timeo, connect_int, err = 0; + struct socket *s_estab = NULL; + struct net_conf *nc; + + rcu_read_lock(); + nc = rcu_dereference(connection->net_conf); + if (!nc) { + rcu_read_unlock(); + return NULL; + } + connect_int = nc->connect_int; + rcu_read_unlock(); + + timeo = connect_int * HZ; + /* 28.5% random jitter */ + timeo += (prandom_u32() & 1) ? timeo / 7 : -timeo / 7; + + err = wait_for_completion_interruptible_timeout(&ad->door_bell, timeo); + if (err <= 0) + return NULL; + + err = kernel_accept(ad->s_listen, &s_estab, 0); + if (err < 0) { + if (err != -EAGAIN && err != -EINTR && err != -ERESTARTSYS) { + drbd_err(connection, "accept failed, err = %d\n", err); + conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD); + } + } + + if (s_estab) + unregister_state_change(s_estab->sk, ad); + + return s_estab; +} + +static int decode_header(struct drbd_connection *, void *, struct packet_info *); + +static int send_first_packet(struct drbd_connection *connection, struct drbd_socket *sock, + enum drbd_packet cmd) +{ + if (!conn_prepare_command(connection, sock)) + return -EIO; + return conn_send_command(connection, sock, cmd, 0, NULL, 0); +} + +static int receive_first_packet(struct drbd_connection *connection, struct socket *sock) +{ + unsigned int header_size = drbd_header_size(connection); + struct packet_info pi; + struct net_conf *nc; + int err; + + rcu_read_lock(); + nc = rcu_dereference(connection->net_conf); + if (!nc) { + rcu_read_unlock(); + return -EIO; + } + sock->sk->sk_rcvtimeo = nc->ping_timeo * 4 * HZ / 10; + rcu_read_unlock(); + + err = drbd_recv_short(sock, connection->data.rbuf, header_size, 0); + if (err != header_size) { + if (err >= 0) + err = -EIO; + return err; + } + err = decode_header(connection, connection->data.rbuf, &pi); + if (err) + return err; + return pi.cmd; +} + +/** + * drbd_socket_okay() - Free the socket if its connection is not okay + * @sock: pointer to the pointer to the socket. + */ +static bool drbd_socket_okay(struct socket **sock) +{ + int rr; + char tb[4]; + + if (!*sock) + return false; + + rr = drbd_recv_short(*sock, tb, 4, MSG_DONTWAIT | MSG_PEEK); + + if (rr > 0 || rr == -EAGAIN) { + return true; + } else { + sock_release(*sock); + *sock = NULL; + return false; + } +} + +static bool connection_established(struct drbd_connection *connection, + struct socket **sock1, + struct socket **sock2) +{ + struct net_conf *nc; + int timeout; + bool ok; + + if (!*sock1 || !*sock2) + return false; + + rcu_read_lock(); + nc = rcu_dereference(connection->net_conf); + timeout = (nc->sock_check_timeo ?: nc->ping_timeo) * HZ / 10; + rcu_read_unlock(); + schedule_timeout_interruptible(timeout); + + ok = drbd_socket_okay(sock1); + ok = drbd_socket_okay(sock2) && ok; + + return ok; +} + +/* Gets called if a connection is established, or if a new minor gets created + in a connection */ +int drbd_connected(struct drbd_peer_device *peer_device) +{ + struct drbd_device *device = peer_device->device; + int err; + + atomic_set(&device->packet_seq, 0); + device->peer_seq = 0; + + device->state_mutex = peer_device->connection->agreed_pro_version < 100 ? 
+ &peer_device->connection->cstate_mutex : + &device->own_state_mutex; + + err = drbd_send_sync_param(peer_device); + if (!err) + err = drbd_send_sizes(peer_device, 0, 0); + if (!err) + err = drbd_send_uuids(peer_device); + if (!err) + err = drbd_send_current_state(peer_device); + clear_bit(USE_DEGR_WFC_T, &device->flags); + clear_bit(RESIZE_PENDING, &device->flags); + atomic_set(&device->ap_in_flight, 0); + mod_timer(&device->request_timer, jiffies + HZ); /* just start it here. */ + return err; +} + +/* + * return values: + * 1 yes, we have a valid connection + * 0 oops, did not work out, please try again + * -1 peer talks different language, + * no point in trying again, please go standalone. + * -2 We do not have a network config... + */ +static int conn_connect(struct drbd_connection *connection) +{ + struct drbd_socket sock, msock; + struct drbd_peer_device *peer_device; + struct net_conf *nc; + int vnr, timeout, h; + bool discard_my_data, ok; + enum drbd_state_rv rv; + struct accept_wait_data ad = { + .connection = connection, + .door_bell = COMPLETION_INITIALIZER_ONSTACK(ad.door_bell), + }; + + clear_bit(DISCONNECT_SENT, &connection->flags); + if (conn_request_state(connection, NS(conn, C_WF_CONNECTION), CS_VERBOSE) < SS_SUCCESS) + return -2; + + mutex_init(&sock.mutex); + sock.sbuf = connection->data.sbuf; + sock.rbuf = connection->data.rbuf; + sock.socket = NULL; + mutex_init(&msock.mutex); + msock.sbuf = connection->meta.sbuf; + msock.rbuf = connection->meta.rbuf; + msock.socket = NULL; + + /* Assume that the peer only understands protocol 80 until we know better. */ + connection->agreed_pro_version = 80; + + if (prepare_listen_socket(connection, &ad)) + return 0; + + do { + struct socket *s; + + s = drbd_try_connect(connection); + if (s) { + if (!sock.socket) { + sock.socket = s; + send_first_packet(connection, &sock, P_INITIAL_DATA); + } else if (!msock.socket) { + clear_bit(RESOLVE_CONFLICTS, &connection->flags); + msock.socket = s; + send_first_packet(connection, &msock, P_INITIAL_META); + } else { + drbd_err(connection, "Logic error in conn_connect()\n"); + goto out_release_sockets; + } + } + + if (connection_established(connection, &sock.socket, &msock.socket)) + break; + +retry: + s = drbd_wait_for_connect(connection, &ad); + if (s) { + int fp = receive_first_packet(connection, s); + drbd_socket_okay(&sock.socket); + drbd_socket_okay(&msock.socket); + switch (fp) { + case P_INITIAL_DATA: + if (sock.socket) { + drbd_warn(connection, "initial packet S crossed\n"); + sock_release(sock.socket); + sock.socket = s; + goto randomize; + } + sock.socket = s; + break; + case P_INITIAL_META: + set_bit(RESOLVE_CONFLICTS, &connection->flags); + if (msock.socket) { + drbd_warn(connection, "initial packet M crossed\n"); + sock_release(msock.socket); + msock.socket = s; + goto randomize; + } + msock.socket = s; + break; + default: + drbd_warn(connection, "Error receiving initial packet\n"); + sock_release(s); +randomize: + if (prandom_u32() & 1) + goto retry; + } + } + + if (connection->cstate <= C_DISCONNECTING) + goto out_release_sockets; + if (signal_pending(current)) { + flush_signals(current); + smp_rmb(); + if (get_t_state(&connection->receiver) == EXITING) + goto out_release_sockets; + } + + ok = connection_established(connection, &sock.socket, &msock.socket); + } while (!ok); + + if (ad.s_listen) + sock_release(ad.s_listen); + + sock.socket->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */ + msock.socket->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */ + + 
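+	/* Both sockets are up: "sock" carries the bulk data stream, "msock"
+	 * the acks and pings.  Use GFP_NOIO for socket memory so that send
+	 * and receive paths cannot recurse into block I/O via memory reclaim,
+	 * and give the latency-critical meta socket the higher priority. */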
sock.socket->sk->sk_allocation = GFP_NOIO; + msock.socket->sk->sk_allocation = GFP_NOIO; + + sock.socket->sk->sk_priority = TC_PRIO_INTERACTIVE_BULK; + msock.socket->sk->sk_priority = TC_PRIO_INTERACTIVE; + + /* NOT YET ... + * sock.socket->sk->sk_sndtimeo = connection->net_conf->timeout*HZ/10; + * sock.socket->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; + * first set it to the P_CONNECTION_FEATURES timeout, + * which we set to 4x the configured ping_timeout. */ + rcu_read_lock(); + nc = rcu_dereference(connection->net_conf); + + sock.socket->sk->sk_sndtimeo = + sock.socket->sk->sk_rcvtimeo = nc->ping_timeo*4*HZ/10; + + msock.socket->sk->sk_rcvtimeo = nc->ping_int*HZ; + timeout = nc->timeout * HZ / 10; + discard_my_data = nc->discard_my_data; + rcu_read_unlock(); + + msock.socket->sk->sk_sndtimeo = timeout; + + /* we don't want delays. + * we use TCP_CORK where appropriate, though */ + drbd_tcp_nodelay(sock.socket); + drbd_tcp_nodelay(msock.socket); + + connection->data.socket = sock.socket; + connection->meta.socket = msock.socket; + connection->last_received = jiffies; + + h = drbd_do_features(connection); + if (h <= 0) + return h; + + if (connection->cram_hmac_tfm) { + /* drbd_request_state(device, NS(conn, WFAuth)); */ + switch (drbd_do_auth(connection)) { + case -1: + drbd_err(connection, "Authentication of peer failed\n"); + return -1; + case 0: + drbd_err(connection, "Authentication of peer failed, trying again.\n"); + return 0; + } + } + + connection->data.socket->sk->sk_sndtimeo = timeout; + connection->data.socket->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; + + if (drbd_send_protocol(connection) == -EOPNOTSUPP) + return -1; + + /* Prevent a race between resync-handshake and + * being promoted to Primary. + * + * Grab and release the state mutex, so we know that any current + * drbd_set_role() is finished, and any incoming drbd_set_role + * will see the STATE_SENT flag, and wait for it to be cleared. + */ + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) + mutex_lock(peer_device->device->state_mutex); + + set_bit(STATE_SENT, &connection->flags); + + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) + mutex_unlock(peer_device->device->state_mutex); + + rcu_read_lock(); + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { + struct drbd_device *device = peer_device->device; + kref_get(&device->kref); + rcu_read_unlock(); + + if (discard_my_data) + set_bit(DISCARD_MY_DATA, &device->flags); + else + clear_bit(DISCARD_MY_DATA, &device->flags); + + drbd_connected(peer_device); + kref_put(&device->kref, drbd_destroy_device); + rcu_read_lock(); + } + rcu_read_unlock(); + + rv = conn_request_state(connection, NS(conn, C_WF_REPORT_PARAMS), CS_VERBOSE); + if (rv < SS_SUCCESS || connection->cstate != C_WF_REPORT_PARAMS) { + clear_bit(STATE_SENT, &connection->flags); + return 0; + } + + drbd_thread_start(&connection->asender); + + mutex_lock(&connection->resource->conf_update); + /* The discard_my_data flag is a single-shot modifier to the next + * connection attempt, the handshake of which is now well underway. + * No need for rcu style copying of the whole struct + * just to clear a single value. 
*/ + connection->net_conf->discard_my_data = 0; + mutex_unlock(&connection->resource->conf_update); + + return h; + +out_release_sockets: + if (ad.s_listen) + sock_release(ad.s_listen); + if (sock.socket) + sock_release(sock.socket); + if (msock.socket) + sock_release(msock.socket); + return -1; +} + +static int decode_header(struct drbd_connection *connection, void *header, struct packet_info *pi) +{ + unsigned int header_size = drbd_header_size(connection); + + if (header_size == sizeof(struct p_header100) && + *(__be32 *)header == cpu_to_be32(DRBD_MAGIC_100)) { + struct p_header100 *h = header; + if (h->pad != 0) { + drbd_err(connection, "Header padding is not zero\n"); + return -EINVAL; + } + pi->vnr = be16_to_cpu(h->volume); + pi->cmd = be16_to_cpu(h->command); + pi->size = be32_to_cpu(h->length); + } else if (header_size == sizeof(struct p_header95) && + *(__be16 *)header == cpu_to_be16(DRBD_MAGIC_BIG)) { + struct p_header95 *h = header; + pi->cmd = be16_to_cpu(h->command); + pi->size = be32_to_cpu(h->length); + pi->vnr = 0; + } else if (header_size == sizeof(struct p_header80) && + *(__be32 *)header == cpu_to_be32(DRBD_MAGIC)) { + struct p_header80 *h = header; + pi->cmd = be16_to_cpu(h->command); + pi->size = be16_to_cpu(h->length); + pi->vnr = 0; + } else { + drbd_err(connection, "Wrong magic value 0x%08x in protocol version %d\n", + be32_to_cpu(*(__be32 *)header), + connection->agreed_pro_version); + return -EINVAL; + } + pi->data = header + header_size; + return 0; +} + +static int drbd_recv_header(struct drbd_connection *connection, struct packet_info *pi) +{ + void *buffer = connection->data.rbuf; + int err; + + err = drbd_recv_all_warn(connection, buffer, drbd_header_size(connection)); + if (err) + return err; + + err = decode_header(connection, buffer, pi); + connection->last_received = jiffies; + + return err; +} + +static void drbd_flush(struct drbd_connection *connection) +{ + int rv; + struct drbd_peer_device *peer_device; + int vnr; + + if (connection->resource->write_ordering >= WO_bdev_flush) { + rcu_read_lock(); + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { + struct drbd_device *device = peer_device->device; + + if (!get_ldev(device)) + continue; + kref_get(&device->kref); + rcu_read_unlock(); + + /* Right now, we have only this one synchronous code path + * for flushes between request epochs. + * We may want to make those asynchronous, + * or at least parallelize the flushes to the volume devices. + */ + device->flush_jif = jiffies; + set_bit(FLUSH_PENDING, &device->flags); + rv = blkdev_issue_flush(device->ldev->backing_bdev, + GFP_NOIO, NULL); + clear_bit(FLUSH_PENDING, &device->flags); + if (rv) { + drbd_info(device, "local disk flush failed with status %d\n", rv); + /* would rather check on EOPNOTSUPP, but that is not reliable. + * don't try again for ANY return value != 0 + * if (rv == -EOPNOTSUPP) */ + drbd_bump_write_ordering(connection->resource, NULL, WO_drain_io); + } + put_ldev(device); + kref_put(&device->kref, drbd_destroy_device); + + rcu_read_lock(); + if (rv) + break; + } + rcu_read_unlock(); + } +} + +/** + * drbd_may_finish_epoch() - Applies an epoch_event to the epoch's state, eventually finishes it. + * @device: DRBD device. + * @epoch: Epoch object. + * @ev: Epoch event. 
+ */ +static enum finish_epoch drbd_may_finish_epoch(struct drbd_connection *connection, + struct drbd_epoch *epoch, + enum epoch_event ev) +{ + int epoch_size; + struct drbd_epoch *next_epoch; + enum finish_epoch rv = FE_STILL_LIVE; + + spin_lock(&connection->epoch_lock); + do { + next_epoch = NULL; + + epoch_size = atomic_read(&epoch->epoch_size); + + switch (ev & ~EV_CLEANUP) { + case EV_PUT: + atomic_dec(&epoch->active); + break; + case EV_GOT_BARRIER_NR: + set_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags); + break; + case EV_BECAME_LAST: + /* nothing to do*/ + break; + } + + if (epoch_size != 0 && + atomic_read(&epoch->active) == 0 && + (test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags) || ev & EV_CLEANUP)) { + if (!(ev & EV_CLEANUP)) { + spin_unlock(&connection->epoch_lock); + drbd_send_b_ack(epoch->connection, epoch->barrier_nr, epoch_size); + spin_lock(&connection->epoch_lock); + } +#if 0 + /* FIXME: dec unacked on connection, once we have + * something to count pending connection packets in. */ + if (test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags)) + dec_unacked(epoch->connection); +#endif + + if (connection->current_epoch != epoch) { + next_epoch = list_entry(epoch->list.next, struct drbd_epoch, list); + list_del(&epoch->list); + ev = EV_BECAME_LAST | (ev & EV_CLEANUP); + connection->epochs--; + kfree(epoch); + + if (rv == FE_STILL_LIVE) + rv = FE_DESTROYED; + } else { + epoch->flags = 0; + atomic_set(&epoch->epoch_size, 0); + /* atomic_set(&epoch->active, 0); is already zero */ + if (rv == FE_STILL_LIVE) + rv = FE_RECYCLED; + } + } + + if (!next_epoch) + break; + + epoch = next_epoch; + } while (1); + + spin_unlock(&connection->epoch_lock); + + return rv; +} + +static enum write_ordering_e +max_allowed_wo(struct drbd_backing_dev *bdev, enum write_ordering_e wo) +{ + struct disk_conf *dc; + + dc = rcu_dereference(bdev->disk_conf); + + if (wo == WO_bdev_flush && !dc->disk_flushes) + wo = WO_drain_io; + if (wo == WO_drain_io && !dc->disk_drain) + wo = WO_none; + + return wo; +} + +/** + * drbd_bump_write_ordering() - Fall back to an other write ordering method + * @connection: DRBD connection. + * @wo: Write ordering method to try. + */ +void drbd_bump_write_ordering(struct drbd_resource *resource, struct drbd_backing_dev *bdev, + enum write_ordering_e wo) +{ + struct drbd_device *device; + enum write_ordering_e pwo; + int vnr; + static char *write_ordering_str[] = { + [WO_none] = "none", + [WO_drain_io] = "drain", + [WO_bdev_flush] = "flush", + }; + + pwo = resource->write_ordering; + if (wo != WO_bdev_flush) + wo = min(pwo, wo); + rcu_read_lock(); + idr_for_each_entry(&resource->devices, device, vnr) { + if (get_ldev(device)) { + wo = max_allowed_wo(device->ldev, wo); + if (device->ldev == bdev) + bdev = NULL; + put_ldev(device); + } + } + + if (bdev) + wo = max_allowed_wo(bdev, wo); + + rcu_read_unlock(); + + resource->write_ordering = wo; + if (pwo != resource->write_ordering || wo == WO_bdev_flush) + drbd_info(resource, "Method to ensure write ordering: %s\n", write_ordering_str[resource->write_ordering]); +} + +/** + * drbd_submit_peer_request() + * @device: DRBD device. + * @peer_req: peer request + * @rw: flag field, see bio->bi_rw + * + * May spread the pages to multiple bios, + * depending on bio_add_page restrictions. + * + * Returns 0 if all bios have been submitted, + * -ENOMEM if we could not allocate enough bios, + * -ENOSPC (any better suggestion?) 
if we have not been able to bio_add_page a + * single page to an empty bio (which should never happen and likely indicates + * that the lower level IO stack is in some way broken). This has been observed + * on certain Xen deployments. + */ +/* TODO allocate from our own bio_set. */ +int drbd_submit_peer_request(struct drbd_device *device, + struct drbd_peer_request *peer_req, + const unsigned rw, const int fault_type) +{ + struct bio *bios = NULL; + struct bio *bio; + struct page *page = peer_req->pages; + sector_t sector = peer_req->i.sector; + unsigned data_size = peer_req->i.size; + unsigned n_bios = 0; + unsigned nr_pages = (data_size + PAGE_SIZE -1) >> PAGE_SHIFT; + int err = -ENOMEM; + + if (peer_req->flags & EE_IS_TRIM_USE_ZEROOUT) { + /* wait for all pending IO completions, before we start + * zeroing things out. */ + conn_wait_active_ee_empty(first_peer_device(device)->connection); + /* add it to the active list now, + * so we can find it to present it in debugfs */ + peer_req->submit_jif = jiffies; + peer_req->flags |= EE_SUBMITTED; + spin_lock_irq(&device->resource->req_lock); + list_add_tail(&peer_req->w.list, &device->active_ee); + spin_unlock_irq(&device->resource->req_lock); + if (blkdev_issue_zeroout(device->ldev->backing_bdev, + sector, data_size >> 9, GFP_NOIO, false)) + peer_req->flags |= EE_WAS_ERROR; + drbd_endio_write_sec_final(peer_req); + return 0; + } + + /* Discards don't have any payload. + * But the scsi layer still expects a bio_vec it can use internally, + * see sd_setup_discard_cmnd() and blk_add_request_payload(). */ + if (peer_req->flags & EE_IS_TRIM) + nr_pages = 1; + + /* In most cases, we will only need one bio. But in case the lower + * level restrictions happen to be different at this offset on this + * side than those of the sending peer, we may need to submit the + * request in more than one bio. + * + * Plain bio_alloc is good enough here, this is no DRBD internally + * generated bio, but a bio allocated on behalf of the peer. + */ +next_bio: + bio = bio_alloc(GFP_NOIO, nr_pages); + if (!bio) { + drbd_err(device, "submit_ee: Allocation of a bio failed (nr_pages=%u)\n", nr_pages); + goto fail; + } + /* > peer_req->i.sector, unless this is the first bio */ + bio->bi_iter.bi_sector = sector; + bio->bi_bdev = device->ldev->backing_bdev; + bio->bi_rw = rw; + bio->bi_private = peer_req; + bio->bi_end_io = drbd_peer_request_endio; + + bio->bi_next = bios; + bios = bio; + ++n_bios; + + if (rw & REQ_DISCARD) { + bio->bi_iter.bi_size = data_size; + goto submit; + } + + page_chain_for_each(page) { + unsigned len = min_t(unsigned, data_size, PAGE_SIZE); + if (!bio_add_page(bio, page, len, 0)) { + /* A single page must always be possible! + * But in case it fails anyways, + * we deal with it, and complain (below). 
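+			 * If this bio already holds at least one page, we simply
+			 * start a new bio (next_bio); only a completely empty bio
+			 * is treated as fatal and mapped to -ENOSPC.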
*/ + if (bio->bi_vcnt == 0) { + drbd_err(device, + "bio_add_page failed for len=%u, " + "bi_vcnt=0 (bi_sector=%llu)\n", + len, (uint64_t)bio->bi_iter.bi_sector); + err = -ENOSPC; + goto fail; + } + goto next_bio; + } + data_size -= len; + sector += len >> 9; + --nr_pages; + } + D_ASSERT(device, data_size == 0); +submit: + D_ASSERT(device, page == NULL); + + atomic_set(&peer_req->pending_bios, n_bios); + /* for debugfs: update timestamp, mark as submitted */ + peer_req->submit_jif = jiffies; + peer_req->flags |= EE_SUBMITTED; + do { + bio = bios; + bios = bios->bi_next; + bio->bi_next = NULL; + + drbd_generic_make_request(device, fault_type, bio); + } while (bios); + return 0; + +fail: + while (bios) { + bio = bios; + bios = bios->bi_next; + bio_put(bio); + } + return err; +} + +static void drbd_remove_epoch_entry_interval(struct drbd_device *device, + struct drbd_peer_request *peer_req) +{ + struct drbd_interval *i = &peer_req->i; + + drbd_remove_interval(&device->write_requests, i); + drbd_clear_interval(i); + + /* Wake up any processes waiting for this peer request to complete. */ + if (i->waiting) + wake_up(&device->misc_wait); +} + +static void conn_wait_active_ee_empty(struct drbd_connection *connection) +{ + struct drbd_peer_device *peer_device; + int vnr; + + rcu_read_lock(); + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { + struct drbd_device *device = peer_device->device; + + kref_get(&device->kref); + rcu_read_unlock(); + drbd_wait_ee_list_empty(device, &device->active_ee); + kref_put(&device->kref, drbd_destroy_device); + rcu_read_lock(); + } + rcu_read_unlock(); +} + +static struct drbd_peer_device * +conn_peer_device(struct drbd_connection *connection, int volume_number) +{ + return idr_find(&connection->peer_devices, volume_number); +} + +static int receive_Barrier(struct drbd_connection *connection, struct packet_info *pi) +{ + int rv; + struct p_barrier *p = pi->data; + struct drbd_epoch *epoch; + + /* FIXME these are unacked on connection, + * not a specific (peer)device. + */ + connection->current_epoch->barrier_nr = p->barrier; + connection->current_epoch->connection = connection; + rv = drbd_may_finish_epoch(connection, connection->current_epoch, EV_GOT_BARRIER_NR); + + /* P_BARRIER_ACK may imply that the corresponding extent is dropped from + * the activity log, which means it would not be resynced in case the + * R_PRIMARY crashes now. + * Therefore we must send the barrier_ack after the barrier request was + * completed. */ + switch (connection->resource->write_ordering) { + case WO_none: + if (rv == FE_RECYCLED) + return 0; + + /* receiver context, in the writeout path of the other node. 
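+		 * Use a GFP_NOIO allocation so we do not recurse into writeout
+		 * ourselves; if even that fails, warn and fall through to the
+		 * drain/flush path below instead of blocking for memory.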
+ * avoid potential distributed deadlock */ + epoch = kmalloc(sizeof(struct drbd_epoch), GFP_NOIO); + if (epoch) + break; + else + drbd_warn(connection, "Allocation of an epoch failed, slowing down\n"); + /* Fall through */ + + case WO_bdev_flush: + case WO_drain_io: + conn_wait_active_ee_empty(connection); + drbd_flush(connection); + + if (atomic_read(&connection->current_epoch->epoch_size)) { + epoch = kmalloc(sizeof(struct drbd_epoch), GFP_NOIO); + if (epoch) + break; + } + + return 0; + default: + drbd_err(connection, "Strangeness in connection->write_ordering %d\n", + connection->resource->write_ordering); + return -EIO; + } + + epoch->flags = 0; + atomic_set(&epoch->epoch_size, 0); + atomic_set(&epoch->active, 0); + + spin_lock(&connection->epoch_lock); + if (atomic_read(&connection->current_epoch->epoch_size)) { + list_add(&epoch->list, &connection->current_epoch->list); + connection->current_epoch = epoch; + connection->epochs++; + } else { + /* The current_epoch got recycled while we allocated this one... */ + kfree(epoch); + } + spin_unlock(&connection->epoch_lock); + + return 0; +} + +/* used from receive_RSDataReply (recv_resync_read) + * and from receive_Data */ +static struct drbd_peer_request * +read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector, + struct packet_info *pi) __must_hold(local) +{ + struct drbd_device *device = peer_device->device; + const sector_t capacity = drbd_get_capacity(device->this_bdev); + struct drbd_peer_request *peer_req; + struct page *page; + int digest_size, err; + unsigned int data_size = pi->size, ds; + void *dig_in = peer_device->connection->int_dig_in; + void *dig_vv = peer_device->connection->int_dig_vv; + unsigned long *data; + struct p_trim *trim = (pi->cmd == P_TRIM) ? pi->data : NULL; + + digest_size = 0; + if (!trim && peer_device->connection->peer_integrity_tfm) { + digest_size = crypto_hash_digestsize(peer_device->connection->peer_integrity_tfm); + /* + * FIXME: Receive the incoming digest into the receive buffer + * here, together with its struct p_data? + */ + err = drbd_recv_all_warn(peer_device->connection, dig_in, digest_size); + if (err) + return NULL; + data_size -= digest_size; + } + + if (trim) { + D_ASSERT(peer_device, data_size == 0); + data_size = be32_to_cpu(trim->size); + } + + if (!expect(IS_ALIGNED(data_size, 512))) + return NULL; + /* prepare for larger trim requests. */ + if (!trim && !expect(data_size <= DRBD_MAX_BIO_SIZE)) + return NULL; + + /* even though we trust out peer, + * we sometimes have to double check. */ + if (sector + (data_size>>9) > capacity) { + drbd_err(device, "request from peer beyond end of local disk: " + "capacity: %llus < sector: %llus + size: %u\n", + (unsigned long long)capacity, + (unsigned long long)sector, data_size); + return NULL; + } + + /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD + * "criss-cross" setup, that might cause write-out on some other DRBD, + * which in turn might block on the other node at this very place. 
*/ + peer_req = drbd_alloc_peer_req(peer_device, id, sector, data_size, trim == NULL, GFP_NOIO); + if (!peer_req) + return NULL; + + peer_req->flags |= EE_WRITE; + if (trim) + return peer_req; + + ds = data_size; + page = peer_req->pages; + page_chain_for_each(page) { + unsigned len = min_t(int, ds, PAGE_SIZE); + data = kmap(page); + err = drbd_recv_all_warn(peer_device->connection, data, len); + if (drbd_insert_fault(device, DRBD_FAULT_RECEIVE)) { + drbd_err(device, "Fault injection: Corrupting data on receive\n"); + data[0] = data[0] ^ (unsigned long)-1; + } + kunmap(page); + if (err) { + drbd_free_peer_req(device, peer_req); + return NULL; + } + ds -= len; + } + + if (digest_size) { + drbd_csum_ee(peer_device->connection->peer_integrity_tfm, peer_req, dig_vv); + if (memcmp(dig_in, dig_vv, digest_size)) { + drbd_err(device, "Digest integrity check FAILED: %llus +%u\n", + (unsigned long long)sector, data_size); + drbd_free_peer_req(device, peer_req); + return NULL; + } + } + device->recv_cnt += data_size >> 9; + return peer_req; +} + +/* drbd_drain_block() just takes a data block + * out of the socket input buffer, and discards it. + */ +static int drbd_drain_block(struct drbd_peer_device *peer_device, int data_size) +{ + struct page *page; + int err = 0; + void *data; + + if (!data_size) + return 0; + + page = drbd_alloc_pages(peer_device, 1, 1); + + data = kmap(page); + while (data_size) { + unsigned int len = min_t(int, data_size, PAGE_SIZE); + + err = drbd_recv_all_warn(peer_device->connection, data, len); + if (err) + break; + data_size -= len; + } + kunmap(page); + drbd_free_pages(peer_device->device, page, 0); + return err; +} + +static int recv_dless_read(struct drbd_peer_device *peer_device, struct drbd_request *req, + sector_t sector, int data_size) +{ + struct bio_vec bvec; + struct bvec_iter iter; + struct bio *bio; + int digest_size, err, expect; + void *dig_in = peer_device->connection->int_dig_in; + void *dig_vv = peer_device->connection->int_dig_vv; + + digest_size = 0; + if (peer_device->connection->peer_integrity_tfm) { + digest_size = crypto_hash_digestsize(peer_device->connection->peer_integrity_tfm); + err = drbd_recv_all_warn(peer_device->connection, dig_in, digest_size); + if (err) + return err; + data_size -= digest_size; + } + + /* optimistically update recv_cnt. if receiving fails below, + * we disconnect anyways, and counters will be reset. */ + peer_device->device->recv_cnt += data_size>>9; + + bio = req->master_bio; + D_ASSERT(peer_device->device, sector == bio->bi_iter.bi_sector); + + bio_for_each_segment(bvec, bio, iter) { + void *mapped = kmap(bvec.bv_page) + bvec.bv_offset; + expect = min_t(int, data_size, bvec.bv_len); + err = drbd_recv_all_warn(peer_device->connection, mapped, expect); + kunmap(bvec.bv_page); + if (err) + return err; + data_size -= expect; + } + + if (digest_size) { + drbd_csum_bio(peer_device->connection->peer_integrity_tfm, bio, dig_vv); + if (memcmp(dig_in, dig_vv, digest_size)) { + drbd_err(peer_device, "Digest integrity check FAILED. Broken NICs?\n"); + return -EINVAL; + } + } + + D_ASSERT(peer_device->device, data_size == 0); + return 0; +} + +/* + * e_end_resync_block() is called in asender context via + * drbd_finish_peer_reqs(). 
+ */ +static int e_end_resync_block(struct drbd_work *w, int unused) +{ + struct drbd_peer_request *peer_req = + container_of(w, struct drbd_peer_request, w); + struct drbd_peer_device *peer_device = peer_req->peer_device; + struct drbd_device *device = peer_device->device; + sector_t sector = peer_req->i.sector; + int err; + + D_ASSERT(device, drbd_interval_empty(&peer_req->i)); + + if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) { + drbd_set_in_sync(device, sector, peer_req->i.size); + err = drbd_send_ack(peer_device, P_RS_WRITE_ACK, peer_req); + } else { + /* Record failure to sync */ + drbd_rs_failed_io(device, sector, peer_req->i.size); + + err = drbd_send_ack(peer_device, P_NEG_ACK, peer_req); + } + dec_unacked(device); + + return err; +} + +static int recv_resync_read(struct drbd_peer_device *peer_device, sector_t sector, + struct packet_info *pi) __releases(local) +{ + struct drbd_device *device = peer_device->device; + struct drbd_peer_request *peer_req; + + peer_req = read_in_block(peer_device, ID_SYNCER, sector, pi); + if (!peer_req) + goto fail; + + dec_rs_pending(device); + + inc_unacked(device); + /* corresponding dec_unacked() in e_end_resync_block() + * respective _drbd_clear_done_ee */ + + peer_req->w.cb = e_end_resync_block; + peer_req->submit_jif = jiffies; + + spin_lock_irq(&device->resource->req_lock); + list_add_tail(&peer_req->w.list, &device->sync_ee); + spin_unlock_irq(&device->resource->req_lock); + + atomic_add(pi->size >> 9, &device->rs_sect_ev); + if (drbd_submit_peer_request(device, peer_req, WRITE, DRBD_FAULT_RS_WR) == 0) + return 0; + + /* don't care for the reason here */ + drbd_err(device, "submit failed, triggering re-connect\n"); + spin_lock_irq(&device->resource->req_lock); + list_del(&peer_req->w.list); + spin_unlock_irq(&device->resource->req_lock); + + drbd_free_peer_req(device, peer_req); +fail: + put_ldev(device); + return -EIO; +} + +static struct drbd_request * +find_request(struct drbd_device *device, struct rb_root *root, u64 id, + sector_t sector, bool missing_ok, const char *func) +{ + struct drbd_request *req; + + /* Request object according to our peer */ + req = (struct drbd_request *)(unsigned long)id; + if (drbd_contains_interval(root, sector, &req->i) && req->i.local) + return req; + if (!missing_ok) { + drbd_err(device, "%s: failed to find request 0x%lx, sector %llus\n", func, + (unsigned long)id, (unsigned long long)sector); + } + return NULL; +} + +static int receive_DataReply(struct drbd_connection *connection, struct packet_info *pi) +{ + struct drbd_peer_device *peer_device; + struct drbd_device *device; + struct drbd_request *req; + sector_t sector; + int err; + struct p_data *p = pi->data; + + peer_device = conn_peer_device(connection, pi->vnr); + if (!peer_device) + return -EIO; + device = peer_device->device; + + sector = be64_to_cpu(p->sector); + + spin_lock_irq(&device->resource->req_lock); + req = find_request(device, &device->read_requests, p->block_id, sector, false, __func__); + spin_unlock_irq(&device->resource->req_lock); + if (unlikely(!req)) + return -EIO; + + /* hlist_del(&req->collision) is done in _req_may_be_done, to avoid + * special casing it there for the various failure cases. + * still no race with drbd_fail_pending_reads */ + err = recv_dless_read(peer_device, req, sector, pi->size); + if (!err) + req_mod(req, DATA_RECEIVED); + /* else: nothing. handled from drbd_disconnect... 
+ * I don't think we may complete this just yet + * in case we are "on-disconnect: freeze" */ + + return err; +} + +static int receive_RSDataReply(struct drbd_connection *connection, struct packet_info *pi) +{ + struct drbd_peer_device *peer_device; + struct drbd_device *device; + sector_t sector; + int err; + struct p_data *p = pi->data; + + peer_device = conn_peer_device(connection, pi->vnr); + if (!peer_device) + return -EIO; + device = peer_device->device; + + sector = be64_to_cpu(p->sector); + D_ASSERT(device, p->block_id == ID_SYNCER); + + if (get_ldev(device)) { + /* data is submitted to disk within recv_resync_read. + * corresponding put_ldev done below on error, + * or in drbd_peer_request_endio. */ + err = recv_resync_read(peer_device, sector, pi); + } else { + if (__ratelimit(&drbd_ratelimit_state)) + drbd_err(device, "Can not write resync data to local disk.\n"); + + err = drbd_drain_block(peer_device, pi->size); + + drbd_send_ack_dp(peer_device, P_NEG_ACK, p, pi->size); + } + + atomic_add(pi->size >> 9, &device->rs_sect_in); + + return err; +} + +static void restart_conflicting_writes(struct drbd_device *device, + sector_t sector, int size) +{ + struct drbd_interval *i; + struct drbd_request *req; + + drbd_for_each_overlap(i, &device->write_requests, sector, size) { + if (!i->local) + continue; + req = container_of(i, struct drbd_request, i); + if (req->rq_state & RQ_LOCAL_PENDING || + !(req->rq_state & RQ_POSTPONED)) + continue; + /* as it is RQ_POSTPONED, this will cause it to + * be queued on the retry workqueue. */ + __req_mod(req, CONFLICT_RESOLVED, NULL); + } +} + +/* + * e_end_block() is called in asender context via drbd_finish_peer_reqs(). + */ +static int e_end_block(struct drbd_work *w, int cancel) +{ + struct drbd_peer_request *peer_req = + container_of(w, struct drbd_peer_request, w); + struct drbd_peer_device *peer_device = peer_req->peer_device; + struct drbd_device *device = peer_device->device; + sector_t sector = peer_req->i.sector; + int err = 0, pcmd; + + if (peer_req->flags & EE_SEND_WRITE_ACK) { + if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) { + pcmd = (device->state.conn >= C_SYNC_SOURCE && + device->state.conn <= C_PAUSED_SYNC_T && + peer_req->flags & EE_MAY_SET_IN_SYNC) ? + P_RS_WRITE_ACK : P_WRITE_ACK; + err = drbd_send_ack(peer_device, pcmd, peer_req); + if (pcmd == P_RS_WRITE_ACK) + drbd_set_in_sync(device, sector, peer_req->i.size); + } else { + err = drbd_send_ack(peer_device, P_NEG_ACK, peer_req); + /* we expect it to be marked out of sync anyways... + * maybe assert this? */ + } + dec_unacked(device); + } + + /* we delete from the conflict detection hash _after_ we sent out the + * P_WRITE_ACK / P_NEG_ACK, to get the sequence number right. */ + if (peer_req->flags & EE_IN_INTERVAL_TREE) { + spin_lock_irq(&device->resource->req_lock); + D_ASSERT(device, !drbd_interval_empty(&peer_req->i)); + drbd_remove_epoch_entry_interval(device, peer_req); + if (peer_req->flags & EE_RESTART_REQUESTS) + restart_conflicting_writes(device, sector, peer_req->i.size); + spin_unlock_irq(&device->resource->req_lock); + } else + D_ASSERT(device, drbd_interval_empty(&peer_req->i)); + + drbd_may_finish_epoch(first_peer_device(device)->connection, peer_req->epoch, EV_PUT + (cancel ? 
EV_CLEANUP : 0)); + + return err; +} + +static int e_send_ack(struct drbd_work *w, enum drbd_packet ack) +{ + struct drbd_peer_request *peer_req = + container_of(w, struct drbd_peer_request, w); + struct drbd_peer_device *peer_device = peer_req->peer_device; + int err; + + err = drbd_send_ack(peer_device, ack, peer_req); + dec_unacked(peer_device->device); + + return err; +} + +static int e_send_superseded(struct drbd_work *w, int unused) +{ + return e_send_ack(w, P_SUPERSEDED); +} + +static int e_send_retry_write(struct drbd_work *w, int unused) +{ + struct drbd_peer_request *peer_req = + container_of(w, struct drbd_peer_request, w); + struct drbd_connection *connection = peer_req->peer_device->connection; + + return e_send_ack(w, connection->agreed_pro_version >= 100 ? + P_RETRY_WRITE : P_SUPERSEDED); +} + +static bool seq_greater(u32 a, u32 b) +{ + /* + * We assume 32-bit wrap-around here. + * For 24-bit wrap-around, we would have to shift: + * a <<= 8; b <<= 8; + */ + return (s32)a - (s32)b > 0; +} + +static u32 seq_max(u32 a, u32 b) +{ + return seq_greater(a, b) ? a : b; +} + +static void update_peer_seq(struct drbd_peer_device *peer_device, unsigned int peer_seq) +{ + struct drbd_device *device = peer_device->device; + unsigned int newest_peer_seq; + + if (test_bit(RESOLVE_CONFLICTS, &peer_device->connection->flags)) { + spin_lock(&device->peer_seq_lock); + newest_peer_seq = seq_max(device->peer_seq, peer_seq); + device->peer_seq = newest_peer_seq; + spin_unlock(&device->peer_seq_lock); + /* wake up only if we actually changed device->peer_seq */ + if (peer_seq == newest_peer_seq) + wake_up(&device->seq_wait); + } +} + +static inline int overlaps(sector_t s1, int l1, sector_t s2, int l2) +{ + return !((s1 + (l1>>9) <= s2) || (s1 >= s2 + (l2>>9))); +} + +/* maybe change sync_ee into interval trees as well? */ +static bool overlapping_resync_write(struct drbd_device *device, struct drbd_peer_request *peer_req) +{ + struct drbd_peer_request *rs_req; + bool rv = 0; + + spin_lock_irq(&device->resource->req_lock); + list_for_each_entry(rs_req, &device->sync_ee, w.list) { + if (overlaps(peer_req->i.sector, peer_req->i.size, + rs_req->i.sector, rs_req->i.size)) { + rv = 1; + break; + } + } + spin_unlock_irq(&device->resource->req_lock); + + return rv; +} + +/* Called from receive_Data. + * Synchronize packets on sock with packets on msock. + * + * This is here so even when a P_DATA packet traveling via sock overtook an Ack + * packet traveling on msock, they are still processed in the order they have + * been sent. + * + * Note: we don't care for Ack packets overtaking P_DATA packets. + * + * In case packet_seq is larger than device->peer_seq number, there are + * outstanding packets on the msock. We wait for them to arrive. + * In case we are the logically next packet, we update device->peer_seq + * ourselves. Correctly handles 32bit wrap around. + * + * Assume we have a 10 GBit connection, that is about 1<<30 byte per second, + * about 1<<21 sectors per second. So "worst" case, we have 1<<3 == 8 seconds + * for the 24bit wrap (historical atomic_t guarantee on some archs), and we have + * 1<<9 == 512 seconds aka ages for the 32bit wrap around... + * + * returns 0 if we may process the packet, + * -ERESTARTSYS if we were interrupted (by disconnect signal). 
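+ * -ETIMEDOUT if the missing packets did not show up within the
+ *  configured ping timeout; the caller then tears down the connection.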
*/ +static int wait_for_and_update_peer_seq(struct drbd_peer_device *peer_device, const u32 peer_seq) +{ + struct drbd_device *device = peer_device->device; + DEFINE_WAIT(wait); + long timeout; + int ret = 0, tp; + + if (!test_bit(RESOLVE_CONFLICTS, &peer_device->connection->flags)) + return 0; + + spin_lock(&device->peer_seq_lock); + for (;;) { + if (!seq_greater(peer_seq - 1, device->peer_seq)) { + device->peer_seq = seq_max(device->peer_seq, peer_seq); + break; + } + + if (signal_pending(current)) { + ret = -ERESTARTSYS; + break; + } + + rcu_read_lock(); + tp = rcu_dereference(first_peer_device(device)->connection->net_conf)->two_primaries; + rcu_read_unlock(); + + if (!tp) + break; + + /* Only need to wait if two_primaries is enabled */ + prepare_to_wait(&device->seq_wait, &wait, TASK_INTERRUPTIBLE); + spin_unlock(&device->peer_seq_lock); + rcu_read_lock(); + timeout = rcu_dereference(peer_device->connection->net_conf)->ping_timeo*HZ/10; + rcu_read_unlock(); + timeout = schedule_timeout(timeout); + spin_lock(&device->peer_seq_lock); + if (!timeout) { + ret = -ETIMEDOUT; + drbd_err(device, "Timed out waiting for missing ack packets; disconnecting\n"); + break; + } + } + spin_unlock(&device->peer_seq_lock); + finish_wait(&device->seq_wait, &wait); + return ret; +} + +/* see also bio_flags_to_wire() + * DRBD_REQ_*, because we need to semantically map the flags to data packet + * flags and back. We may replicate to other kernel versions. */ +static unsigned long wire_flags_to_bio(u32 dpf) +{ + return (dpf & DP_RW_SYNC ? REQ_SYNC : 0) | + (dpf & DP_FUA ? REQ_FUA : 0) | + (dpf & DP_FLUSH ? REQ_FLUSH : 0) | + (dpf & DP_DISCARD ? REQ_DISCARD : 0); +} + +static void fail_postponed_requests(struct drbd_device *device, sector_t sector, + unsigned int size) +{ + struct drbd_interval *i; + + repeat: + drbd_for_each_overlap(i, &device->write_requests, sector, size) { + struct drbd_request *req; + struct bio_and_error m; + + if (!i->local) + continue; + req = container_of(i, struct drbd_request, i); + if (!(req->rq_state & RQ_POSTPONED)) + continue; + req->rq_state &= ~RQ_POSTPONED; + __req_mod(req, NEG_ACKED, &m); + spin_unlock_irq(&device->resource->req_lock); + if (m.bio) + complete_master_bio(device, &m); + spin_lock_irq(&device->resource->req_lock); + goto repeat; + } +} + +static int handle_write_conflicts(struct drbd_device *device, + struct drbd_peer_request *peer_req) +{ + struct drbd_connection *connection = peer_req->peer_device->connection; + bool resolve_conflicts = test_bit(RESOLVE_CONFLICTS, &connection->flags); + sector_t sector = peer_req->i.sector; + const unsigned int size = peer_req->i.size; + struct drbd_interval *i; + bool equal; + int err; + + /* + * Inserting the peer request into the write_requests tree will prevent + * new conflicting local requests from being added. + */ + drbd_insert_interval(&device->write_requests, &peer_req->i); + + repeat: + drbd_for_each_overlap(i, &device->write_requests, sector, size) { + if (i == &peer_req->i) + continue; + if (i->completed) + continue; + + if (!i->local) { + /* + * Our peer has sent a conflicting remote request; this + * should not happen in a two-node setup. Wait for the + * earlier peer request to complete. 
+ */ + err = drbd_wait_misc(device, i); + if (err) + goto out; + goto repeat; + } + + equal = i->sector == sector && i->size == size; + if (resolve_conflicts) { + /* + * If the peer request is fully contained within the + * overlapping request, it can be considered overwritten + * and thus superseded; otherwise, it will be retried + * once all overlapping requests have completed. + */ + bool superseded = i->sector <= sector && i->sector + + (i->size >> 9) >= sector + (size >> 9); + + if (!equal) + drbd_alert(device, "Concurrent writes detected: " + "local=%llus +%u, remote=%llus +%u, " + "assuming %s came first\n", + (unsigned long long)i->sector, i->size, + (unsigned long long)sector, size, + superseded ? "local" : "remote"); + + peer_req->w.cb = superseded ? e_send_superseded : + e_send_retry_write; + list_add_tail(&peer_req->w.list, &device->done_ee); + wake_asender(connection); + + err = -ENOENT; + goto out; + } else { + struct drbd_request *req = + container_of(i, struct drbd_request, i); + + if (!equal) + drbd_alert(device, "Concurrent writes detected: " + "local=%llus +%u, remote=%llus +%u\n", + (unsigned long long)i->sector, i->size, + (unsigned long long)sector, size); + + if (req->rq_state & RQ_LOCAL_PENDING || + !(req->rq_state & RQ_POSTPONED)) { + /* + * Wait for the node with the discard flag to + * decide if this request has been superseded + * or needs to be retried. + * Requests that have been superseded will + * disappear from the write_requests tree. + * + * In addition, wait for the conflicting + * request to finish locally before submitting + * the conflicting peer request. + */ + err = drbd_wait_misc(device, &req->i); + if (err) { + _conn_request_state(connection, NS(conn, C_TIMEOUT), CS_HARD); + fail_postponed_requests(device, sector, size); + goto out; + } + goto repeat; + } + /* + * Remember to restart the conflicting requests after + * the new peer request has completed. + */ + peer_req->flags |= EE_RESTART_REQUESTS; + } + } + err = 0; + + out: + if (err) + drbd_remove_epoch_entry_interval(device, peer_req); + return err; +} + +/* mirrored write */ +static int receive_Data(struct drbd_connection *connection, struct packet_info *pi) +{ + struct drbd_peer_device *peer_device; + struct drbd_device *device; + struct net_conf *nc; + sector_t sector; + struct drbd_peer_request *peer_req; + struct p_data *p = pi->data; + u32 peer_seq = be32_to_cpu(p->seq_num); + int rw = WRITE; + u32 dp_flags; + int err, tp; + + peer_device = conn_peer_device(connection, pi->vnr); + if (!peer_device) + return -EIO; + device = peer_device->device; + + if (!get_ldev(device)) { + int err2; + + err = wait_for_and_update_peer_seq(peer_device, peer_seq); + drbd_send_ack_dp(peer_device, P_NEG_ACK, p, pi->size); + atomic_inc(&connection->current_epoch->epoch_size); + err2 = drbd_drain_block(peer_device, pi->size); + if (!err) + err = err2; + return err; + } + + /* + * Corresponding put_ldev done either below (on various errors), or in + * drbd_peer_request_endio, if we successfully submit the data at the + * end of this function. 
+ */ + + sector = be64_to_cpu(p->sector); + peer_req = read_in_block(peer_device, p->block_id, sector, pi); + if (!peer_req) { + put_ldev(device); + return -EIO; + } + + peer_req->w.cb = e_end_block; + peer_req->submit_jif = jiffies; + peer_req->flags |= EE_APPLICATION; + + dp_flags = be32_to_cpu(p->dp_flags); + rw |= wire_flags_to_bio(dp_flags); + if (pi->cmd == P_TRIM) { + struct request_queue *q = bdev_get_queue(device->ldev->backing_bdev); + peer_req->flags |= EE_IS_TRIM; + if (!blk_queue_discard(q)) + peer_req->flags |= EE_IS_TRIM_USE_ZEROOUT; + D_ASSERT(peer_device, peer_req->i.size > 0); + D_ASSERT(peer_device, rw & REQ_DISCARD); + D_ASSERT(peer_device, peer_req->pages == NULL); + } else if (peer_req->pages == NULL) { + D_ASSERT(device, peer_req->i.size == 0); + D_ASSERT(device, dp_flags & DP_FLUSH); + } + + if (dp_flags & DP_MAY_SET_IN_SYNC) + peer_req->flags |= EE_MAY_SET_IN_SYNC; + + spin_lock(&connection->epoch_lock); + peer_req->epoch = connection->current_epoch; + atomic_inc(&peer_req->epoch->epoch_size); + atomic_inc(&peer_req->epoch->active); + spin_unlock(&connection->epoch_lock); + + rcu_read_lock(); + nc = rcu_dereference(peer_device->connection->net_conf); + tp = nc->two_primaries; + if (peer_device->connection->agreed_pro_version < 100) { + switch (nc->wire_protocol) { + case DRBD_PROT_C: + dp_flags |= DP_SEND_WRITE_ACK; + break; + case DRBD_PROT_B: + dp_flags |= DP_SEND_RECEIVE_ACK; + break; + } + } + rcu_read_unlock(); + + if (dp_flags & DP_SEND_WRITE_ACK) { + peer_req->flags |= EE_SEND_WRITE_ACK; + inc_unacked(device); + /* corresponding dec_unacked() in e_end_block() + * respective _drbd_clear_done_ee */ + } + + if (dp_flags & DP_SEND_RECEIVE_ACK) { + /* I really don't like it that the receiver thread + * sends on the msock, but anyways */ + drbd_send_ack(first_peer_device(device), P_RECV_ACK, peer_req); + } + + if (tp) { + /* two primaries implies protocol C */ + D_ASSERT(device, dp_flags & DP_SEND_WRITE_ACK); + peer_req->flags |= EE_IN_INTERVAL_TREE; + err = wait_for_and_update_peer_seq(peer_device, peer_seq); + if (err) + goto out_interrupted; + spin_lock_irq(&device->resource->req_lock); + err = handle_write_conflicts(device, peer_req); + if (err) { + spin_unlock_irq(&device->resource->req_lock); + if (err == -ENOENT) { + put_ldev(device); + return 0; + } + goto out_interrupted; + } + } else { + update_peer_seq(peer_device, peer_seq); + spin_lock_irq(&device->resource->req_lock); + } + /* if we use the zeroout fallback code, we process synchronously + * and we wait for all pending requests, respectively wait for + * active_ee to become empty in drbd_submit_peer_request(); + * better not add ourselves here. 
*/ + if ((peer_req->flags & EE_IS_TRIM_USE_ZEROOUT) == 0) + list_add_tail(&peer_req->w.list, &device->active_ee); + spin_unlock_irq(&device->resource->req_lock); + + if (device->state.conn == C_SYNC_TARGET) + wait_event(device->ee_wait, !overlapping_resync_write(device, peer_req)); + + if (device->state.pdsk < D_INCONSISTENT) { + /* In case we have the only disk of the cluster, */ + drbd_set_out_of_sync(device, peer_req->i.sector, peer_req->i.size); + peer_req->flags &= ~EE_MAY_SET_IN_SYNC; + drbd_al_begin_io(device, &peer_req->i); + peer_req->flags |= EE_CALL_AL_COMPLETE_IO; + } + + err = drbd_submit_peer_request(device, peer_req, rw, DRBD_FAULT_DT_WR); + if (!err) + return 0; + + /* don't care for the reason here */ + drbd_err(device, "submit failed, triggering re-connect\n"); + spin_lock_irq(&device->resource->req_lock); + list_del(&peer_req->w.list); + drbd_remove_epoch_entry_interval(device, peer_req); + spin_unlock_irq(&device->resource->req_lock); + if (peer_req->flags & EE_CALL_AL_COMPLETE_IO) { + peer_req->flags &= ~EE_CALL_AL_COMPLETE_IO; + drbd_al_complete_io(device, &peer_req->i); + } + +out_interrupted: + drbd_may_finish_epoch(connection, peer_req->epoch, EV_PUT + EV_CLEANUP); + put_ldev(device); + drbd_free_peer_req(device, peer_req); + return err; +} + +/* We may throttle resync, if the lower device seems to be busy, + * and current sync rate is above c_min_rate. + * + * To decide whether or not the lower device is busy, we use a scheme similar + * to MD RAID is_mddev_idle(): if the partition stats reveal "significant" + * (more than 64 sectors) of activity we cannot account for with our own resync + * activity, it obviously is "busy". + * + * The current sync rate used here uses only the most recent two step marks, + * to have a short time average so we can react faster. + */ +bool drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector, + bool throttle_if_app_is_waiting) +{ + struct lc_element *tmp; + bool throttle = drbd_rs_c_min_rate_throttle(device); + + if (!throttle || throttle_if_app_is_waiting) + return throttle; + + spin_lock_irq(&device->al_lock); + tmp = lc_find(device->resync, BM_SECT_TO_EXT(sector)); + if (tmp) { + struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce); + if (test_bit(BME_PRIORITY, &bm_ext->flags)) + throttle = false; + /* Do not slow down if app IO is already waiting for this extent, + * and our progress is necessary for application IO to complete. */ + } + spin_unlock_irq(&device->al_lock); + + return throttle; +} + +bool drbd_rs_c_min_rate_throttle(struct drbd_device *device) +{ + struct gendisk *disk = device->ldev->backing_bdev->bd_contains->bd_disk; + unsigned long db, dt, dbdt; + unsigned int c_min_rate; + int curr_events; + + rcu_read_lock(); + c_min_rate = rcu_dereference(device->ldev->disk_conf)->c_min_rate; + rcu_read_unlock(); + + /* feature disabled? */ + if (c_min_rate == 0) + return false; + + curr_events = (int)part_stat_read(&disk->part0, sectors[0]) + + (int)part_stat_read(&disk->part0, sectors[1]) - + atomic_read(&device->rs_sect_ev); + + if (atomic_read(&device->ap_actlog_cnt) + || curr_events - device->rs_last_events > 64) { + unsigned long rs_left; + int i; + + device->rs_last_events = curr_events; + + /* sync speed average over the last 2*DRBD_SYNC_MARK_STEP, + * approx. 
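+		 * rs_mark_left[] is in bitmap bits, so db/dt is the recent
+		 * resync rate; Bit2KB() converts it to KiB/s for the comparison
+		 * against the configured c_min_rate below.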
*/ + i = (device->rs_last_mark + DRBD_SYNC_MARKS-1) % DRBD_SYNC_MARKS; + + if (device->state.conn == C_VERIFY_S || device->state.conn == C_VERIFY_T) + rs_left = device->ov_left; + else + rs_left = drbd_bm_total_weight(device) - device->rs_failed; + + dt = ((long)jiffies - (long)device->rs_mark_time[i]) / HZ; + if (!dt) + dt++; + db = device->rs_mark_left[i] - rs_left; + dbdt = Bit2KB(db/dt); + + if (dbdt > c_min_rate) + return true; + } + return false; +} + +static int receive_DataRequest(struct drbd_connection *connection, struct packet_info *pi) +{ + struct drbd_peer_device *peer_device; + struct drbd_device *device; + sector_t sector; + sector_t capacity; + struct drbd_peer_request *peer_req; + struct digest_info *di = NULL; + int size, verb; + unsigned int fault_type; + struct p_block_req *p = pi->data; + + peer_device = conn_peer_device(connection, pi->vnr); + if (!peer_device) + return -EIO; + device = peer_device->device; + capacity = drbd_get_capacity(device->this_bdev); + + sector = be64_to_cpu(p->sector); + size = be32_to_cpu(p->blksize); + + if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_BIO_SIZE) { + drbd_err(device, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__, + (unsigned long long)sector, size); + return -EINVAL; + } + if (sector + (size>>9) > capacity) { + drbd_err(device, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__, + (unsigned long long)sector, size); + return -EINVAL; + } + + if (!get_ldev_if_state(device, D_UP_TO_DATE)) { + verb = 1; + switch (pi->cmd) { + case P_DATA_REQUEST: + drbd_send_ack_rp(peer_device, P_NEG_DREPLY, p); + break; + case P_RS_DATA_REQUEST: + case P_CSUM_RS_REQUEST: + case P_OV_REQUEST: + drbd_send_ack_rp(peer_device, P_NEG_RS_DREPLY , p); + break; + case P_OV_REPLY: + verb = 0; + dec_rs_pending(device); + drbd_send_ack_ex(peer_device, P_OV_RESULT, sector, size, ID_IN_SYNC); + break; + default: + BUG(); + } + if (verb && __ratelimit(&drbd_ratelimit_state)) + drbd_err(device, "Can not satisfy peer's read request, " + "no local data.\n"); + + /* drain possibly payload */ + return drbd_drain_block(peer_device, pi->size); + } + + /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD + * "criss-cross" setup, that might cause write-out on some other DRBD, + * which in turn might block on the other node at this very place. 
*/ + peer_req = drbd_alloc_peer_req(peer_device, p->block_id, sector, size, + true /* has real payload */, GFP_NOIO); + if (!peer_req) { + put_ldev(device); + return -ENOMEM; + } + + switch (pi->cmd) { + case P_DATA_REQUEST: + peer_req->w.cb = w_e_end_data_req; + fault_type = DRBD_FAULT_DT_RD; + /* application IO, don't drbd_rs_begin_io */ + peer_req->flags |= EE_APPLICATION; + goto submit; + + case P_RS_DATA_REQUEST: + peer_req->w.cb = w_e_end_rsdata_req; + fault_type = DRBD_FAULT_RS_RD; + /* used in the sector offset progress display */ + device->bm_resync_fo = BM_SECT_TO_BIT(sector); + break; + + case P_OV_REPLY: + case P_CSUM_RS_REQUEST: + fault_type = DRBD_FAULT_RS_RD; + di = kmalloc(sizeof(*di) + pi->size, GFP_NOIO); + if (!di) + goto out_free_e; + + di->digest_size = pi->size; + di->digest = (((char *)di)+sizeof(struct digest_info)); + + peer_req->digest = di; + peer_req->flags |= EE_HAS_DIGEST; + + if (drbd_recv_all(peer_device->connection, di->digest, pi->size)) + goto out_free_e; + + if (pi->cmd == P_CSUM_RS_REQUEST) { + D_ASSERT(device, peer_device->connection->agreed_pro_version >= 89); + peer_req->w.cb = w_e_end_csum_rs_req; + /* used in the sector offset progress display */ + device->bm_resync_fo = BM_SECT_TO_BIT(sector); + /* remember to report stats in drbd_resync_finished */ + device->use_csums = true; + } else if (pi->cmd == P_OV_REPLY) { + /* track progress, we may need to throttle */ + atomic_add(size >> 9, &device->rs_sect_in); + peer_req->w.cb = w_e_end_ov_reply; + dec_rs_pending(device); + /* drbd_rs_begin_io done when we sent this request, + * but accounting still needs to be done. */ + goto submit_for_resync; + } + break; + + case P_OV_REQUEST: + if (device->ov_start_sector == ~(sector_t)0 && + peer_device->connection->agreed_pro_version >= 90) { + unsigned long now = jiffies; + int i; + device->ov_start_sector = sector; + device->ov_position = sector; + device->ov_left = drbd_bm_bits(device) - BM_SECT_TO_BIT(sector); + device->rs_total = device->ov_left; + for (i = 0; i < DRBD_SYNC_MARKS; i++) { + device->rs_mark_left[i] = device->ov_left; + device->rs_mark_time[i] = now; + } + drbd_info(device, "Online Verify start sector: %llu\n", + (unsigned long long)sector); + } + peer_req->w.cb = w_e_end_ov_req; + fault_type = DRBD_FAULT_RS_RD; + break; + + default: + BUG(); + } + + /* Throttle, drbd_rs_begin_io and submit should become asynchronous + * wrt the receiver, but it is not as straightforward as it may seem. + * Various places in the resync start and stop logic assume resync + * requests are processed in order, requeuing this on the worker thread + * introduces a bunch of new code for synchronization between threads. + * + * Unlimited throttling before drbd_rs_begin_io may stall the resync + * "forever", throttling after drbd_rs_begin_io will lock that extent + * for application writes for the same time. For now, just throttle + * here, where the rest of the code expects the receiver to sleep for + * a while, anyways. + */ + + /* Throttle before drbd_rs_begin_io, as that locks out application IO; + * this defers syncer requests for some time, before letting at least + * on request through. The resync controller on the receiving side + * will adapt to the incoming rate accordingly. + * + * We cannot throttle here if remote is Primary/SyncTarget: + * we would also throttle its application reads. + * In that case, throttling is done on the SyncTarget only. 
+ */ + + /* Even though this may be a resync request, we do add to "read_ee"; + * "sync_ee" is only used for resync WRITEs. + * Add to list early, so debugfs can find this request + * even if we have to sleep below. */ + spin_lock_irq(&device->resource->req_lock); + list_add_tail(&peer_req->w.list, &device->read_ee); + spin_unlock_irq(&device->resource->req_lock); + + update_receiver_timing_details(connection, drbd_rs_should_slow_down); + if (device->state.peer != R_PRIMARY + && drbd_rs_should_slow_down(device, sector, false)) + schedule_timeout_uninterruptible(HZ/10); + update_receiver_timing_details(connection, drbd_rs_begin_io); + if (drbd_rs_begin_io(device, sector)) + goto out_free_e; + +submit_for_resync: + atomic_add(size >> 9, &device->rs_sect_ev); + +submit: + update_receiver_timing_details(connection, drbd_submit_peer_request); + inc_unacked(device); + if (drbd_submit_peer_request(device, peer_req, READ, fault_type) == 0) + return 0; + + /* don't care for the reason here */ + drbd_err(device, "submit failed, triggering re-connect\n"); + +out_free_e: + spin_lock_irq(&device->resource->req_lock); + list_del(&peer_req->w.list); + spin_unlock_irq(&device->resource->req_lock); + /* no drbd_rs_complete_io(), we are dropping the connection anyways */ + + put_ldev(device); + drbd_free_peer_req(device, peer_req); + return -EIO; +} + +/** + * drbd_asb_recover_0p - Recover after split-brain with no remaining primaries + */ +static int drbd_asb_recover_0p(struct drbd_peer_device *peer_device) __must_hold(local) +{ + struct drbd_device *device = peer_device->device; + int self, peer, rv = -100; + unsigned long ch_self, ch_peer; + enum drbd_after_sb_p after_sb_0p; + + self = device->ldev->md.uuid[UI_BITMAP] & 1; + peer = device->p_uuid[UI_BITMAP] & 1; + + ch_peer = device->p_uuid[UI_SIZE]; + ch_self = device->comm_bm_set; + + rcu_read_lock(); + after_sb_0p = rcu_dereference(peer_device->connection->net_conf)->after_sb_0p; + rcu_read_unlock(); + switch (after_sb_0p) { + case ASB_CONSENSUS: + case ASB_DISCARD_SECONDARY: + case ASB_CALL_HELPER: + case ASB_VIOLENTLY: + drbd_err(device, "Configuration error.\n"); + break; + case ASB_DISCONNECT: + break; + case ASB_DISCARD_YOUNGER_PRI: + if (self == 0 && peer == 1) { + rv = -1; + break; + } + if (self == 1 && peer == 0) { + rv = 1; + break; + } + /* Else fall through to one of the other strategies... */ + case ASB_DISCARD_OLDER_PRI: + if (self == 0 && peer == 1) { + rv = 1; + break; + } + if (self == 1 && peer == 0) { + rv = -1; + break; + } + /* Else fall through to one of the other strategies... */ + drbd_warn(device, "Discard younger/older primary did not find a decision\n" + "Using discard-least-changes instead\n"); + case ASB_DISCARD_ZERO_CHG: + if (ch_peer == 0 && ch_self == 0) { + rv = test_bit(RESOLVE_CONFLICTS, &peer_device->connection->flags) + ? -1 : 1; + break; + } else { + if (ch_peer == 0) { rv = 1; break; } + if (ch_self == 0) { rv = -1; break; } + } + if (after_sb_0p == ASB_DISCARD_ZERO_CHG) + break; + case ASB_DISCARD_LEAST_CHG: + if (ch_self < ch_peer) + rv = -1; + else if (ch_self > ch_peer) + rv = 1; + else /* ( ch_self == ch_peer ) */ + /* Well, then use something else. */ + rv = test_bit(RESOLVE_CONFLICTS, &peer_device->connection->flags) + ? 
-1 : 1; + break; + case ASB_DISCARD_LOCAL: + rv = -1; + break; + case ASB_DISCARD_REMOTE: + rv = 1; + } + + return rv; +} + +/** + * drbd_asb_recover_1p - Recover after split-brain with one remaining primary + */ +static int drbd_asb_recover_1p(struct drbd_peer_device *peer_device) __must_hold(local) +{ + struct drbd_device *device = peer_device->device; + int hg, rv = -100; + enum drbd_after_sb_p after_sb_1p; + + rcu_read_lock(); + after_sb_1p = rcu_dereference(peer_device->connection->net_conf)->after_sb_1p; + rcu_read_unlock(); + switch (after_sb_1p) { + case ASB_DISCARD_YOUNGER_PRI: + case ASB_DISCARD_OLDER_PRI: + case ASB_DISCARD_LEAST_CHG: + case ASB_DISCARD_LOCAL: + case ASB_DISCARD_REMOTE: + case ASB_DISCARD_ZERO_CHG: + drbd_err(device, "Configuration error.\n"); + break; + case ASB_DISCONNECT: + break; + case ASB_CONSENSUS: + hg = drbd_asb_recover_0p(peer_device); + if (hg == -1 && device->state.role == R_SECONDARY) + rv = hg; + if (hg == 1 && device->state.role == R_PRIMARY) + rv = hg; + break; + case ASB_VIOLENTLY: + rv = drbd_asb_recover_0p(peer_device); + break; + case ASB_DISCARD_SECONDARY: + return device->state.role == R_PRIMARY ? 1 : -1; + case ASB_CALL_HELPER: + hg = drbd_asb_recover_0p(peer_device); + if (hg == -1 && device->state.role == R_PRIMARY) { + enum drbd_state_rv rv2; + + /* drbd_change_state() does not sleep while in SS_IN_TRANSIENT_STATE, + * we might be here in C_WF_REPORT_PARAMS which is transient. + * we do not need to wait for the after state change work either. */ + rv2 = drbd_change_state(device, CS_VERBOSE, NS(role, R_SECONDARY)); + if (rv2 != SS_SUCCESS) { + drbd_khelper(device, "pri-lost-after-sb"); + } else { + drbd_warn(device, "Successfully gave up primary role.\n"); + rv = hg; + } + } else + rv = hg; + } + + return rv; +} + +/** + * drbd_asb_recover_2p - Recover after split-brain with two remaining primaries + */ +static int drbd_asb_recover_2p(struct drbd_peer_device *peer_device) __must_hold(local) +{ + struct drbd_device *device = peer_device->device; + int hg, rv = -100; + enum drbd_after_sb_p after_sb_2p; + + rcu_read_lock(); + after_sb_2p = rcu_dereference(peer_device->connection->net_conf)->after_sb_2p; + rcu_read_unlock(); + switch (after_sb_2p) { + case ASB_DISCARD_YOUNGER_PRI: + case ASB_DISCARD_OLDER_PRI: + case ASB_DISCARD_LEAST_CHG: + case ASB_DISCARD_LOCAL: + case ASB_DISCARD_REMOTE: + case ASB_CONSENSUS: + case ASB_DISCARD_SECONDARY: + case ASB_DISCARD_ZERO_CHG: + drbd_err(device, "Configuration error.\n"); + break; + case ASB_VIOLENTLY: + rv = drbd_asb_recover_0p(peer_device); + break; + case ASB_DISCONNECT: + break; + case ASB_CALL_HELPER: + hg = drbd_asb_recover_0p(peer_device); + if (hg == -1) { + enum drbd_state_rv rv2; + + /* drbd_change_state() does not sleep while in SS_IN_TRANSIENT_STATE, + * we might be here in C_WF_REPORT_PARAMS which is transient. + * we do not need to wait for the after state change work either. 
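A simplified re-expression of the zero-primaries recovery policies implemented by drbd_asb_recover_0p() above; the return convention mirrors the code (1: keep the local data, -1: discard it and sync from the peer, -100: no automatic decision), while the enum, the parameter names and the helper are inventions of this sketch:

#include <stdbool.h>
#include <stdint.h>

enum sketch_asb_policy {
        SKETCH_DISCARD_YOUNGER_PRI,
        SKETCH_DISCARD_OLDER_PRI,
        SKETCH_DISCARD_ZERO_CHG,
        SKETCH_DISCARD_LEAST_CHG,
};

static int sketch_recover_0p(enum sketch_asb_policy policy,
                             bool self_was_pri, bool peer_was_pri,
                             uint64_t ch_self, uint64_t ch_peer,
                             bool resolve_conflicts_bit)
{
        switch (policy) {
        case SKETCH_DISCARD_YOUNGER_PRI:
                /* decided purely from the two primary-role bits, as above */
                if (!self_was_pri && peer_was_pri) return -1;
                if (self_was_pri && !peer_was_pri) return 1;
                break;
        case SKETCH_DISCARD_OLDER_PRI:
                if (!self_was_pri && peer_was_pri) return 1;
                if (self_was_pri && !peer_was_pri) return -1;
                break;
        default:
                break;
        }

        /* zero-changes policy: decide only if exactly one side changed data */
        if (policy == SKETCH_DISCARD_ZERO_CHG && (ch_self || ch_peer)) {
                if (ch_peer == 0) return 1;
                if (ch_self == 0) return -1;
                return -100;                    /* both changed: give up */
        }

        /* least-changes fallback */
        if (ch_self < ch_peer) return -1;
        if (ch_self > ch_peer) return 1;

        /* perfectly symmetric: break the tie with a flag both nodes agree on */
        return resolve_conflicts_bit ? -1 : 1;
}

As in the driver, the younger/older-primary policies fall back to comparing the amount of changed data when the role bits give no decision, and a perfectly symmetric tie is broken by the shared RESOLVE_CONFLICTS-style flag.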
*/ + rv2 = drbd_change_state(device, CS_VERBOSE, NS(role, R_SECONDARY)); + if (rv2 != SS_SUCCESS) { + drbd_khelper(device, "pri-lost-after-sb"); + } else { + drbd_warn(device, "Successfully gave up primary role.\n"); + rv = hg; + } + } else + rv = hg; + } + + return rv; +} + +static void drbd_uuid_dump(struct drbd_device *device, char *text, u64 *uuid, + u64 bits, u64 flags) +{ + if (!uuid) { + drbd_info(device, "%s uuid info vanished while I was looking!\n", text); + return; + } + drbd_info(device, "%s %016llX:%016llX:%016llX:%016llX bits:%llu flags:%llX\n", + text, + (unsigned long long)uuid[UI_CURRENT], + (unsigned long long)uuid[UI_BITMAP], + (unsigned long long)uuid[UI_HISTORY_START], + (unsigned long long)uuid[UI_HISTORY_END], + (unsigned long long)bits, + (unsigned long long)flags); +} + +/* + 100 after split brain try auto recover + 2 C_SYNC_SOURCE set BitMap + 1 C_SYNC_SOURCE use BitMap + 0 no Sync + -1 C_SYNC_TARGET use BitMap + -2 C_SYNC_TARGET set BitMap + -100 after split brain, disconnect +-1000 unrelated data +-1091 requires proto 91 +-1096 requires proto 96 + */ +static int drbd_uuid_compare(struct drbd_device *const device, int *rule_nr) __must_hold(local) +{ + struct drbd_peer_device *const peer_device = first_peer_device(device); + struct drbd_connection *const connection = peer_device ? peer_device->connection : NULL; + u64 self, peer; + int i, j; + + self = device->ldev->md.uuid[UI_CURRENT] & ~((u64)1); + peer = device->p_uuid[UI_CURRENT] & ~((u64)1); + + *rule_nr = 10; + if (self == UUID_JUST_CREATED && peer == UUID_JUST_CREATED) + return 0; + + *rule_nr = 20; + if ((self == UUID_JUST_CREATED || self == (u64)0) && + peer != UUID_JUST_CREATED) + return -2; + + *rule_nr = 30; + if (self != UUID_JUST_CREATED && + (peer == UUID_JUST_CREATED || peer == (u64)0)) + return 2; + + if (self == peer) { + int rct, dc; /* roles at crash time */ + + if (device->p_uuid[UI_BITMAP] == (u64)0 && device->ldev->md.uuid[UI_BITMAP] != (u64)0) { + + if (connection->agreed_pro_version < 91) + return -1091; + + if ((device->ldev->md.uuid[UI_BITMAP] & ~((u64)1)) == (device->p_uuid[UI_HISTORY_START] & ~((u64)1)) && + (device->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (device->p_uuid[UI_HISTORY_START + 1] & ~((u64)1))) { + drbd_info(device, "was SyncSource, missed the resync finished event, corrected myself:\n"); + drbd_uuid_move_history(device); + device->ldev->md.uuid[UI_HISTORY_START] = device->ldev->md.uuid[UI_BITMAP]; + device->ldev->md.uuid[UI_BITMAP] = 0; + + drbd_uuid_dump(device, "self", device->ldev->md.uuid, + device->state.disk >= D_NEGOTIATING ? 
drbd_bm_total_weight(device) : 0, 0); + *rule_nr = 34; + } else { + drbd_info(device, "was SyncSource (peer failed to write sync_uuid)\n"); + *rule_nr = 36; + } + + return 1; + } + + if (device->ldev->md.uuid[UI_BITMAP] == (u64)0 && device->p_uuid[UI_BITMAP] != (u64)0) { + + if (connection->agreed_pro_version < 91) + return -1091; + + if ((device->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (device->p_uuid[UI_BITMAP] & ~((u64)1)) && + (device->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1)) == (device->p_uuid[UI_HISTORY_START] & ~((u64)1))) { + drbd_info(device, "was SyncTarget, peer missed the resync finished event, corrected peer:\n"); + + device->p_uuid[UI_HISTORY_START + 1] = device->p_uuid[UI_HISTORY_START]; + device->p_uuid[UI_HISTORY_START] = device->p_uuid[UI_BITMAP]; + device->p_uuid[UI_BITMAP] = 0UL; + + drbd_uuid_dump(device, "peer", device->p_uuid, device->p_uuid[UI_SIZE], device->p_uuid[UI_FLAGS]); + *rule_nr = 35; + } else { + drbd_info(device, "was SyncTarget (failed to write sync_uuid)\n"); + *rule_nr = 37; + } + + return -1; + } + + /* Common power [off|failure] */ + rct = (test_bit(CRASHED_PRIMARY, &device->flags) ? 1 : 0) + + (device->p_uuid[UI_FLAGS] & 2); + /* lowest bit is set when we were primary, + * next bit (weight 2) is set when peer was primary */ + *rule_nr = 40; + + switch (rct) { + case 0: /* !self_pri && !peer_pri */ return 0; + case 1: /* self_pri && !peer_pri */ return 1; + case 2: /* !self_pri && peer_pri */ return -1; + case 3: /* self_pri && peer_pri */ + dc = test_bit(RESOLVE_CONFLICTS, &connection->flags); + return dc ? -1 : 1; + } + } + + *rule_nr = 50; + peer = device->p_uuid[UI_BITMAP] & ~((u64)1); + if (self == peer) + return -1; + + *rule_nr = 51; + peer = device->p_uuid[UI_HISTORY_START] & ~((u64)1); + if (self == peer) { + if (connection->agreed_pro_version < 96 ? + (device->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == + (device->p_uuid[UI_HISTORY_START + 1] & ~((u64)1)) : + peer + UUID_NEW_BM_OFFSET == (device->p_uuid[UI_BITMAP] & ~((u64)1))) { + /* The last P_SYNC_UUID did not get though. Undo the last start of + resync as sync source modifications of the peer's UUIDs. */ + + if (connection->agreed_pro_version < 91) + return -1091; + + device->p_uuid[UI_BITMAP] = device->p_uuid[UI_HISTORY_START]; + device->p_uuid[UI_HISTORY_START] = device->p_uuid[UI_HISTORY_START + 1]; + + drbd_info(device, "Lost last syncUUID packet, corrected:\n"); + drbd_uuid_dump(device, "peer", device->p_uuid, device->p_uuid[UI_SIZE], device->p_uuid[UI_FLAGS]); + + return -1; + } + } + + *rule_nr = 60; + self = device->ldev->md.uuid[UI_CURRENT] & ~((u64)1); + for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) { + peer = device->p_uuid[i] & ~((u64)1); + if (self == peer) + return -2; + } + + *rule_nr = 70; + self = device->ldev->md.uuid[UI_BITMAP] & ~((u64)1); + peer = device->p_uuid[UI_CURRENT] & ~((u64)1); + if (self == peer) + return 1; + + *rule_nr = 71; + self = device->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1); + if (self == peer) { + if (connection->agreed_pro_version < 96 ? + (device->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1)) == + (device->p_uuid[UI_HISTORY_START] & ~((u64)1)) : + self + UUID_NEW_BM_OFFSET == (device->ldev->md.uuid[UI_BITMAP] & ~((u64)1))) { + /* The last P_SYNC_UUID did not get though. Undo the last start of + resync as sync source modifications of our UUIDs. 
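A heavily reduced sketch of the generation-identifier comparison performed by drbd_uuid_compare() above. UUIDs are compared with their lowest bit masked off, exactly as in every rule above (the recovery code earlier reads that bit as a primary-role marker), and the return convention follows the comment block above this function (positive: sync source, negative: sync target, |2|: full sync, -1000: unrelated data). The struct, the placeholder constant and the one-line readings of individual rules are assumptions of this sketch; the driver distinguishes many more cases:

#include <stddef.h>
#include <stdint.h>

#define SKETCH_UUID_JUST_CREATED ((uint64_t)4)  /* assumed placeholder value */

struct sketch_uuid_set {
        uint64_t current;
        uint64_t bitmap;
        uint64_t history[2];
};

static uint64_t sketch_strip_role_bit(uint64_t uuid)
{
        return uuid & ~(uint64_t)1;
}

static int sketch_uuid_compare(const struct sketch_uuid_set *self,
                               const struct sketch_uuid_set *peer)
{
        uint64_t s = sketch_strip_role_bit(self->current);
        uint64_t p = sketch_strip_role_bit(peer->current);
        size_t i;

        if (s == SKETCH_UUID_JUST_CREATED && p == SKETCH_UUID_JUST_CREATED)
                return 0;               /* both brand new, nothing to do */
        if (s == SKETCH_UUID_JUST_CREATED || s == 0)
                return -2;              /* we never had data: full sync target */
        if (p == SKETCH_UUID_JUST_CREATED || p == 0)
                return 2;               /* peer never had data: full sync source */
        if (s == p)
                return 0;               /* same generation; the driver also inspects crash flags here */

        /* our current generation is what the peer's bitmap is tracked against:
         * bitmap-based resync with us as target (cf. rule 50 above) */
        if (s == sketch_strip_role_bit(peer->bitmap))
                return -1;
        if (sketch_strip_role_bit(self->bitmap) == p)
                return 1;               /* mirror case (cf. rule 70 above) */

        /* one side's current generation only survives in the other side's
         * history: a bitmap is no longer enough, request a full sync */
        for (i = 0; i < 2; i++) {
                if (s == sketch_strip_role_bit(peer->history[i]))
                        return -2;
                if (sketch_strip_role_bit(self->history[i]) == p)
                        return 2;
        }

        return -1000;                   /* unrelated data sets */
}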
*/ + + if (connection->agreed_pro_version < 91) + return -1091; + + __drbd_uuid_set(device, UI_BITMAP, device->ldev->md.uuid[UI_HISTORY_START]); + __drbd_uuid_set(device, UI_HISTORY_START, device->ldev->md.uuid[UI_HISTORY_START + 1]); + + drbd_info(device, "Last syncUUID did not get through, corrected:\n"); + drbd_uuid_dump(device, "self", device->ldev->md.uuid, + device->state.disk >= D_NEGOTIATING ? drbd_bm_total_weight(device) : 0, 0); + + return 1; + } + } + + + *rule_nr = 80; + peer = device->p_uuid[UI_CURRENT] & ~((u64)1); + for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) { + self = device->ldev->md.uuid[i] & ~((u64)1); + if (self == peer) + return 2; + } + + *rule_nr = 90; + self = device->ldev->md.uuid[UI_BITMAP] & ~((u64)1); + peer = device->p_uuid[UI_BITMAP] & ~((u64)1); + if (self == peer && self != ((u64)0)) + return 100; + + *rule_nr = 100; + for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) { + self = device->ldev->md.uuid[i] & ~((u64)1); + for (j = UI_HISTORY_START; j <= UI_HISTORY_END; j++) { + peer = device->p_uuid[j] & ~((u64)1); + if (self == peer) + return -100; + } + } + + return -1000; +} + +/* drbd_sync_handshake() returns the new conn state on success, or + CONN_MASK (-1) on failure. + */ +static enum drbd_conns drbd_sync_handshake(struct drbd_peer_device *peer_device, + enum drbd_role peer_role, + enum drbd_disk_state peer_disk) __must_hold(local) +{ + struct drbd_device *device = peer_device->device; + enum drbd_conns rv = C_MASK; + enum drbd_disk_state mydisk; + struct net_conf *nc; + int hg, rule_nr, rr_conflict, tentative; + + mydisk = device->state.disk; + if (mydisk == D_NEGOTIATING) + mydisk = device->new_state_tmp.disk; + + drbd_info(device, "drbd_sync_handshake:\n"); + + spin_lock_irq(&device->ldev->md.uuid_lock); + drbd_uuid_dump(device, "self", device->ldev->md.uuid, device->comm_bm_set, 0); + drbd_uuid_dump(device, "peer", device->p_uuid, + device->p_uuid[UI_SIZE], device->p_uuid[UI_FLAGS]); + + hg = drbd_uuid_compare(device, &rule_nr); + spin_unlock_irq(&device->ldev->md.uuid_lock); + + drbd_info(device, "uuid_compare()=%d by rule %d\n", hg, rule_nr); + + if (hg == -1000) { + drbd_alert(device, "Unrelated data, aborting!\n"); + return C_MASK; + } + if (hg < -1000) { + drbd_alert(device, "To resolve this both sides have to support at least protocol %d\n", -hg - 1000); + return C_MASK; + } + + if ((mydisk == D_INCONSISTENT && peer_disk > D_INCONSISTENT) || + (peer_disk == D_INCONSISTENT && mydisk > D_INCONSISTENT)) { + int f = (hg == -100) || abs(hg) == 2; + hg = mydisk > D_INCONSISTENT ? 1 : -1; + if (f) + hg = hg*2; + drbd_info(device, "Becoming sync %s due to disk states.\n", + hg > 0 ? "source" : "target"); + } + + if (abs(hg) == 100) + drbd_khelper(device, "initial-split-brain"); + + rcu_read_lock(); + nc = rcu_dereference(peer_device->connection->net_conf); + + if (hg == 100 || (hg == -100 && nc->always_asbp)) { + int pcount = (device->state.role == R_PRIMARY) + + (peer_role == R_PRIMARY); + int forced = (hg == -100); + + switch (pcount) { + case 0: + hg = drbd_asb_recover_0p(peer_device); + break; + case 1: + hg = drbd_asb_recover_1p(peer_device); + break; + case 2: + hg = drbd_asb_recover_2p(peer_device); + break; + } + if (abs(hg) < 100) { + drbd_warn(device, "Split-Brain detected, %d primaries, " + "automatically solved. Sync from %s node\n", + pcount, (hg < 0) ? 
"peer" : "this"); + if (forced) { + drbd_warn(device, "Doing a full sync, since" + " UUIDs where ambiguous.\n"); + hg = hg*2; + } + } + } + + if (hg == -100) { + if (test_bit(DISCARD_MY_DATA, &device->flags) && !(device->p_uuid[UI_FLAGS]&1)) + hg = -1; + if (!test_bit(DISCARD_MY_DATA, &device->flags) && (device->p_uuid[UI_FLAGS]&1)) + hg = 1; + + if (abs(hg) < 100) + drbd_warn(device, "Split-Brain detected, manually solved. " + "Sync from %s node\n", + (hg < 0) ? "peer" : "this"); + } + rr_conflict = nc->rr_conflict; + tentative = nc->tentative; + rcu_read_unlock(); + + if (hg == -100) { + /* FIXME this log message is not correct if we end up here + * after an attempted attach on a diskless node. + * We just refuse to attach -- well, we drop the "connection" + * to that disk, in a way... */ + drbd_alert(device, "Split-Brain detected but unresolved, dropping connection!\n"); + drbd_khelper(device, "split-brain"); + return C_MASK; + } + + if (hg > 0 && mydisk <= D_INCONSISTENT) { + drbd_err(device, "I shall become SyncSource, but I am inconsistent!\n"); + return C_MASK; + } + + if (hg < 0 && /* by intention we do not use mydisk here. */ + device->state.role == R_PRIMARY && device->state.disk >= D_CONSISTENT) { + switch (rr_conflict) { + case ASB_CALL_HELPER: + drbd_khelper(device, "pri-lost"); + /* fall through */ + case ASB_DISCONNECT: + drbd_err(device, "I shall become SyncTarget, but I am primary!\n"); + return C_MASK; + case ASB_VIOLENTLY: + drbd_warn(device, "Becoming SyncTarget, violating the stable-data" + "assumption\n"); + } + } + + if (tentative || test_bit(CONN_DRY_RUN, &peer_device->connection->flags)) { + if (hg == 0) + drbd_info(device, "dry-run connect: No resync, would become Connected immediately.\n"); + else + drbd_info(device, "dry-run connect: Would become %s, doing a %s resync.", + drbd_conn_str(hg > 0 ? C_SYNC_SOURCE : C_SYNC_TARGET), + abs(hg) >= 2 ? "full" : "bit-map based"); + return C_MASK; + } + + if (abs(hg) >= 2) { + drbd_info(device, "Writing the whole bitmap, full sync required after drbd_sync_handshake.\n"); + if (drbd_bitmap_io(device, &drbd_bmio_set_n_write, "set_n_write from sync_handshake", + BM_LOCKED_SET_ALLOWED)) + return C_MASK; + } + + if (hg > 0) { /* become sync source. */ + rv = C_WF_BITMAP_S; + } else if (hg < 0) { /* become sync target */ + rv = C_WF_BITMAP_T; + } else { + rv = C_CONNECTED; + if (drbd_bm_total_weight(device)) { + drbd_info(device, "No resync, but %lu bits in bitmap!\n", + drbd_bm_total_weight(device)); + } + } + + return rv; +} + +static enum drbd_after_sb_p convert_after_sb(enum drbd_after_sb_p peer) +{ + /* ASB_DISCARD_REMOTE - ASB_DISCARD_LOCAL is valid */ + if (peer == ASB_DISCARD_REMOTE) + return ASB_DISCARD_LOCAL; + + /* any other things with ASB_DISCARD_REMOTE or ASB_DISCARD_LOCAL are invalid */ + if (peer == ASB_DISCARD_LOCAL) + return ASB_DISCARD_REMOTE; + + /* everything else is valid if they are equal on both sides. 
*/ + return peer; +} + +static int receive_protocol(struct drbd_connection *connection, struct packet_info *pi) +{ + struct p_protocol *p = pi->data; + enum drbd_after_sb_p p_after_sb_0p, p_after_sb_1p, p_after_sb_2p; + int p_proto, p_discard_my_data, p_two_primaries, cf; + struct net_conf *nc, *old_net_conf, *new_net_conf = NULL; + char integrity_alg[SHARED_SECRET_MAX] = ""; + struct crypto_hash *peer_integrity_tfm = NULL; + void *int_dig_in = NULL, *int_dig_vv = NULL; + + p_proto = be32_to_cpu(p->protocol); + p_after_sb_0p = be32_to_cpu(p->after_sb_0p); + p_after_sb_1p = be32_to_cpu(p->after_sb_1p); + p_after_sb_2p = be32_to_cpu(p->after_sb_2p); + p_two_primaries = be32_to_cpu(p->two_primaries); + cf = be32_to_cpu(p->conn_flags); + p_discard_my_data = cf & CF_DISCARD_MY_DATA; + + if (connection->agreed_pro_version >= 87) { + int err; + + if (pi->size > sizeof(integrity_alg)) + return -EIO; + err = drbd_recv_all(connection, integrity_alg, pi->size); + if (err) + return err; + integrity_alg[SHARED_SECRET_MAX - 1] = 0; + } + + if (pi->cmd != P_PROTOCOL_UPDATE) { + clear_bit(CONN_DRY_RUN, &connection->flags); + + if (cf & CF_DRY_RUN) + set_bit(CONN_DRY_RUN, &connection->flags); + + rcu_read_lock(); + nc = rcu_dereference(connection->net_conf); + + if (p_proto != nc->wire_protocol) { + drbd_err(connection, "incompatible %s settings\n", "protocol"); + goto disconnect_rcu_unlock; + } + + if (convert_after_sb(p_after_sb_0p) != nc->after_sb_0p) { + drbd_err(connection, "incompatible %s settings\n", "after-sb-0pri"); + goto disconnect_rcu_unlock; + } + + if (convert_after_sb(p_after_sb_1p) != nc->after_sb_1p) { + drbd_err(connection, "incompatible %s settings\n", "after-sb-1pri"); + goto disconnect_rcu_unlock; + } + + if (convert_after_sb(p_after_sb_2p) != nc->after_sb_2p) { + drbd_err(connection, "incompatible %s settings\n", "after-sb-2pri"); + goto disconnect_rcu_unlock; + } + + if (p_discard_my_data && nc->discard_my_data) { + drbd_err(connection, "incompatible %s settings\n", "discard-my-data"); + goto disconnect_rcu_unlock; + } + + if (p_two_primaries != nc->two_primaries) { + drbd_err(connection, "incompatible %s settings\n", "allow-two-primaries"); + goto disconnect_rcu_unlock; + } + + if (strcmp(integrity_alg, nc->integrity_alg)) { + drbd_err(connection, "incompatible %s settings\n", "data-integrity-alg"); + goto disconnect_rcu_unlock; + } + + rcu_read_unlock(); + } + + if (integrity_alg[0]) { + int hash_size; + + /* + * We can only change the peer data integrity algorithm + * here. Changing our own data integrity algorithm + * requires that we send a P_PROTOCOL_UPDATE packet at + * the same time; otherwise, the peer has no way to + * tell between which packets the algorithm should + * change. 
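The compatibility checks in receive_protocol() above rely on convert_after_sb(): the peer's "discard my side" is our "discard the other side" and vice versa, and every other policy is only compatible when both sides configured the same value. A small sketch of that mirroring, with invented enum and helper names:

enum sketch_after_sb {
        SKETCH_ASB_DISCONNECT,
        SKETCH_ASB_DISCARD_LOCAL,
        SKETCH_ASB_DISCARD_REMOTE,
        SKETCH_ASB_DISCARD_LEAST_CHG,
};

static enum sketch_after_sb sketch_convert_after_sb(enum sketch_after_sb peer)
{
        /* the peer expresses the policy from its own point of view */
        if (peer == SKETCH_ASB_DISCARD_REMOTE)
                return SKETCH_ASB_DISCARD_LOCAL;
        if (peer == SKETCH_ASB_DISCARD_LOCAL)
                return SKETCH_ASB_DISCARD_REMOTE;
        return peer;
}

/* 1 = the two configurations agree, 0 = incompatible, drop the connection */
static int sketch_after_sb_compatible(enum sketch_after_sb mine,
                                      enum sketch_after_sb peers)
{
        return sketch_convert_after_sb(peers) == mine;
}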
+ */ + + peer_integrity_tfm = crypto_alloc_hash(integrity_alg, 0, CRYPTO_ALG_ASYNC); + if (!peer_integrity_tfm) { + drbd_err(connection, "peer data-integrity-alg %s not supported\n", + integrity_alg); + goto disconnect; + } + + hash_size = crypto_hash_digestsize(peer_integrity_tfm); + int_dig_in = kmalloc(hash_size, GFP_KERNEL); + int_dig_vv = kmalloc(hash_size, GFP_KERNEL); + if (!(int_dig_in && int_dig_vv)) { + drbd_err(connection, "Allocation of buffers for data integrity checking failed\n"); + goto disconnect; + } + } + + new_net_conf = kmalloc(sizeof(struct net_conf), GFP_KERNEL); + if (!new_net_conf) { + drbd_err(connection, "Allocation of new net_conf failed\n"); + goto disconnect; + } + + mutex_lock(&connection->data.mutex); + mutex_lock(&connection->resource->conf_update); + old_net_conf = connection->net_conf; + *new_net_conf = *old_net_conf; + + new_net_conf->wire_protocol = p_proto; + new_net_conf->after_sb_0p = convert_after_sb(p_after_sb_0p); + new_net_conf->after_sb_1p = convert_after_sb(p_after_sb_1p); + new_net_conf->after_sb_2p = convert_after_sb(p_after_sb_2p); + new_net_conf->two_primaries = p_two_primaries; + + rcu_assign_pointer(connection->net_conf, new_net_conf); + mutex_unlock(&connection->resource->conf_update); + mutex_unlock(&connection->data.mutex); + + crypto_free_hash(connection->peer_integrity_tfm); + kfree(connection->int_dig_in); + kfree(connection->int_dig_vv); + connection->peer_integrity_tfm = peer_integrity_tfm; + connection->int_dig_in = int_dig_in; + connection->int_dig_vv = int_dig_vv; + + if (strcmp(old_net_conf->integrity_alg, integrity_alg)) + drbd_info(connection, "peer data-integrity-alg: %s\n", + integrity_alg[0] ? integrity_alg : "(none)"); + + synchronize_rcu(); + kfree(old_net_conf); + return 0; + +disconnect_rcu_unlock: + rcu_read_unlock(); +disconnect: + crypto_free_hash(peer_integrity_tfm); + kfree(int_dig_in); + kfree(int_dig_vv); + conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD); + return -EIO; +} + +/* helper function + * input: alg name, feature name + * return: NULL (alg name was "") + * ERR_PTR(error) if something goes wrong + * or the crypto hash ptr, if it worked out ok. */ +static struct crypto_hash *drbd_crypto_alloc_digest_safe(const struct drbd_device *device, + const char *alg, const char *name) +{ + struct crypto_hash *tfm; + + if (!alg[0]) + return NULL; + + tfm = crypto_alloc_hash(alg, 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(tfm)) { + drbd_err(device, "Can not allocate \"%s\" as %s (reason: %ld)\n", + alg, name, PTR_ERR(tfm)); + return tfm; + } + return tfm; +} + +static int ignore_remaining_packet(struct drbd_connection *connection, struct packet_info *pi) +{ + void *buffer = connection->data.rbuf; + int size = pi->size; + + while (size) { + int s = min_t(int, size, DRBD_SOCKET_BUFFER_SIZE); + s = drbd_recv(connection, buffer, s); + if (s <= 0) { + if (s < 0) + return s; + break; + } + size -= s; + } + if (size) + return -EIO; + return 0; +} + +/* + * config_unknown_volume - device configuration command for unknown volume + * + * When a device is added to an existing connection, the node on which the + * device is added first will send configuration commands to its peer but the + * peer will not know about the device yet. It will warn and ignore these + * commands. Once the device is added on the second node, the second node will + * send the same device configuration commands, but in the other direction. + * + * (We can also end up here if drbd is misconfigured.) 
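ignore_remaining_packet() above shows the usual pattern for a payload that has to be consumed but not acted upon: read it in buffer-sized chunks and discard it, so the stream stays aligned for the next packet header. A userspace sketch of the same idea, assuming a plain file descriptor, read(2) and an arbitrary 4 KiB buffer:

#include <errno.h>
#include <unistd.h>

#define SKETCH_RX_BUFFER_SIZE 4096

/* returns 0 when the payload was fully consumed, -1 on a stream error */
static int sketch_drain_payload(int fd, size_t size)
{
        char buffer[SKETCH_RX_BUFFER_SIZE];

        while (size) {
                size_t want = size < sizeof(buffer) ? size : sizeof(buffer);
                ssize_t got = read(fd, buffer, want);

                if (got < 0) {
                        if (errno == EINTR)
                                continue;       /* retry interrupted reads */
                        return -1;
                }
                if (got == 0)
                        return -1;              /* peer closed mid-packet */
                size -= (size_t)got;
        }
        return 0;
}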
+ */ +static int config_unknown_volume(struct drbd_connection *connection, struct packet_info *pi) +{ + drbd_warn(connection, "%s packet received for volume %u, which is not configured locally\n", + cmdname(pi->cmd), pi->vnr); + return ignore_remaining_packet(connection, pi); +} + +static int receive_SyncParam(struct drbd_connection *connection, struct packet_info *pi) +{ + struct drbd_peer_device *peer_device; + struct drbd_device *device; + struct p_rs_param_95 *p; + unsigned int header_size, data_size, exp_max_sz; + struct crypto_hash *verify_tfm = NULL; + struct crypto_hash *csums_tfm = NULL; + struct net_conf *old_net_conf, *new_net_conf = NULL; + struct disk_conf *old_disk_conf = NULL, *new_disk_conf = NULL; + const int apv = connection->agreed_pro_version; + struct fifo_buffer *old_plan = NULL, *new_plan = NULL; + int fifo_size = 0; + int err; + + peer_device = conn_peer_device(connection, pi->vnr); + if (!peer_device) + return config_unknown_volume(connection, pi); + device = peer_device->device; + + exp_max_sz = apv <= 87 ? sizeof(struct p_rs_param) + : apv == 88 ? sizeof(struct p_rs_param) + + SHARED_SECRET_MAX + : apv <= 94 ? sizeof(struct p_rs_param_89) + : /* apv >= 95 */ sizeof(struct p_rs_param_95); + + if (pi->size > exp_max_sz) { + drbd_err(device, "SyncParam packet too long: received %u, expected <= %u bytes\n", + pi->size, exp_max_sz); + return -EIO; + } + + if (apv <= 88) { + header_size = sizeof(struct p_rs_param); + data_size = pi->size - header_size; + } else if (apv <= 94) { + header_size = sizeof(struct p_rs_param_89); + data_size = pi->size - header_size; + D_ASSERT(device, data_size == 0); + } else { + header_size = sizeof(struct p_rs_param_95); + data_size = pi->size - header_size; + D_ASSERT(device, data_size == 0); + } + + /* initialize verify_alg and csums_alg */ + p = pi->data; + memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX); + + err = drbd_recv_all(peer_device->connection, p, header_size); + if (err) + return err; + + mutex_lock(&connection->resource->conf_update); + old_net_conf = peer_device->connection->net_conf; + if (get_ldev(device)) { + new_disk_conf = kzalloc(sizeof(struct disk_conf), GFP_KERNEL); + if (!new_disk_conf) { + put_ldev(device); + mutex_unlock(&connection->resource->conf_update); + drbd_err(device, "Allocation of new disk_conf failed\n"); + return -ENOMEM; + } + + old_disk_conf = device->ldev->disk_conf; + *new_disk_conf = *old_disk_conf; + + new_disk_conf->resync_rate = be32_to_cpu(p->resync_rate); + } + + if (apv >= 88) { + if (apv == 88) { + if (data_size > SHARED_SECRET_MAX || data_size == 0) { + drbd_err(device, "verify-alg of wrong size, " + "peer wants %u, accepting only up to %u byte\n", + data_size, SHARED_SECRET_MAX); + err = -EIO; + goto reconnect; + } + + err = drbd_recv_all(peer_device->connection, p->verify_alg, data_size); + if (err) + goto reconnect; + /* we expect NUL terminated string */ + /* but just in case someone tries to be evil */ + D_ASSERT(device, p->verify_alg[data_size-1] == 0); + p->verify_alg[data_size-1] = 0; + + } else /* apv >= 89 */ { + /* we still expect NUL terminated strings */ + /* but just in case someone tries to be evil */ + D_ASSERT(device, p->verify_alg[SHARED_SECRET_MAX-1] == 0); + D_ASSERT(device, p->csums_alg[SHARED_SECRET_MAX-1] == 0); + p->verify_alg[SHARED_SECRET_MAX-1] = 0; + p->csums_alg[SHARED_SECRET_MAX-1] = 0; + } + + if (strcmp(old_net_conf->verify_alg, p->verify_alg)) { + if (device->state.conn == C_WF_REPORT_PARAMS) { + drbd_err(device, "Different verify-alg settings. 
me=\"%s\" peer=\"%s\"\n", + old_net_conf->verify_alg, p->verify_alg); + goto disconnect; + } + verify_tfm = drbd_crypto_alloc_digest_safe(device, + p->verify_alg, "verify-alg"); + if (IS_ERR(verify_tfm)) { + verify_tfm = NULL; + goto disconnect; + } + } + + if (apv >= 89 && strcmp(old_net_conf->csums_alg, p->csums_alg)) { + if (device->state.conn == C_WF_REPORT_PARAMS) { + drbd_err(device, "Different csums-alg settings. me=\"%s\" peer=\"%s\"\n", + old_net_conf->csums_alg, p->csums_alg); + goto disconnect; + } + csums_tfm = drbd_crypto_alloc_digest_safe(device, + p->csums_alg, "csums-alg"); + if (IS_ERR(csums_tfm)) { + csums_tfm = NULL; + goto disconnect; + } + } + + if (apv > 94 && new_disk_conf) { + new_disk_conf->c_plan_ahead = be32_to_cpu(p->c_plan_ahead); + new_disk_conf->c_delay_target = be32_to_cpu(p->c_delay_target); + new_disk_conf->c_fill_target = be32_to_cpu(p->c_fill_target); + new_disk_conf->c_max_rate = be32_to_cpu(p->c_max_rate); + + fifo_size = (new_disk_conf->c_plan_ahead * 10 * SLEEP_TIME) / HZ; + if (fifo_size != device->rs_plan_s->size) { + new_plan = fifo_alloc(fifo_size); + if (!new_plan) { + drbd_err(device, "kmalloc of fifo_buffer failed"); + put_ldev(device); + goto disconnect; + } + } + } + + if (verify_tfm || csums_tfm) { + new_net_conf = kzalloc(sizeof(struct net_conf), GFP_KERNEL); + if (!new_net_conf) { + drbd_err(device, "Allocation of new net_conf failed\n"); + goto disconnect; + } + + *new_net_conf = *old_net_conf; + + if (verify_tfm) { + strcpy(new_net_conf->verify_alg, p->verify_alg); + new_net_conf->verify_alg_len = strlen(p->verify_alg) + 1; + crypto_free_hash(peer_device->connection->verify_tfm); + peer_device->connection->verify_tfm = verify_tfm; + drbd_info(device, "using verify-alg: \"%s\"\n", p->verify_alg); + } + if (csums_tfm) { + strcpy(new_net_conf->csums_alg, p->csums_alg); + new_net_conf->csums_alg_len = strlen(p->csums_alg) + 1; + crypto_free_hash(peer_device->connection->csums_tfm); + peer_device->connection->csums_tfm = csums_tfm; + drbd_info(device, "using csums-alg: \"%s\"\n", p->csums_alg); + } + rcu_assign_pointer(connection->net_conf, new_net_conf); + } + } + + if (new_disk_conf) { + rcu_assign_pointer(device->ldev->disk_conf, new_disk_conf); + put_ldev(device); + } + + if (new_plan) { + old_plan = device->rs_plan_s; + rcu_assign_pointer(device->rs_plan_s, new_plan); + } + + mutex_unlock(&connection->resource->conf_update); + synchronize_rcu(); + if (new_net_conf) + kfree(old_net_conf); + kfree(old_disk_conf); + kfree(old_plan); + + return 0; + +reconnect: + if (new_disk_conf) { + put_ldev(device); + kfree(new_disk_conf); + } + mutex_unlock(&connection->resource->conf_update); + return -EIO; + +disconnect: + kfree(new_plan); + if (new_disk_conf) { + put_ldev(device); + kfree(new_disk_conf); + } + mutex_unlock(&connection->resource->conf_update); + /* just for completeness: actually not needed, + * as this is not reached if csums_tfm was ok. */ + crypto_free_hash(csums_tfm); + /* but free the verify_tfm again, if csums_tfm did not work out */ + crypto_free_hash(verify_tfm); + conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD); + return -EIO; +} + +/* warn if the arguments differ by more than 12.5% */ +static void warn_if_differ_considerably(struct drbd_device *device, + const char *s, sector_t a, sector_t b) +{ + sector_t d; + if (a == 0 || b == 0) + return; + d = (a > b) ? (a - b) : (b - a); + if (d > (a>>3) || d > (b>>3)) + drbd_warn(device, "Considerable difference in %s: %llus vs. 
%llus\n", s, + (unsigned long long)a, (unsigned long long)b); +} + +static int receive_sizes(struct drbd_connection *connection, struct packet_info *pi) +{ + struct drbd_peer_device *peer_device; + struct drbd_device *device; + struct p_sizes *p = pi->data; + enum determine_dev_size dd = DS_UNCHANGED; + sector_t p_size, p_usize, p_csize, my_usize; + int ldsc = 0; /* local disk size changed */ + enum dds_flags ddsf; + + peer_device = conn_peer_device(connection, pi->vnr); + if (!peer_device) + return config_unknown_volume(connection, pi); + device = peer_device->device; + + p_size = be64_to_cpu(p->d_size); + p_usize = be64_to_cpu(p->u_size); + p_csize = be64_to_cpu(p->c_size); + + /* just store the peer's disk size for now. + * we still need to figure out whether we accept that. */ + device->p_size = p_size; + + if (get_ldev(device)) { + rcu_read_lock(); + my_usize = rcu_dereference(device->ldev->disk_conf)->disk_size; + rcu_read_unlock(); + + warn_if_differ_considerably(device, "lower level device sizes", + p_size, drbd_get_max_capacity(device->ldev)); + warn_if_differ_considerably(device, "user requested size", + p_usize, my_usize); + + /* if this is the first connect, or an otherwise expected + * param exchange, choose the minimum */ + if (device->state.conn == C_WF_REPORT_PARAMS) + p_usize = min_not_zero(my_usize, p_usize); + + /* Never shrink a device with usable data during connect. + But allow online shrinking if we are connected. */ + if (drbd_new_dev_size(device, device->ldev, p_usize, 0) < + drbd_get_capacity(device->this_bdev) && + device->state.disk >= D_OUTDATED && + device->state.conn < C_CONNECTED) { + drbd_err(device, "The peer's disk size is too small!\n"); + conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD); + put_ldev(device); + return -EIO; + } + + if (my_usize != p_usize) { + struct disk_conf *old_disk_conf, *new_disk_conf = NULL; + + new_disk_conf = kzalloc(sizeof(struct disk_conf), GFP_KERNEL); + if (!new_disk_conf) { + drbd_err(device, "Allocation of new disk_conf failed\n"); + put_ldev(device); + return -ENOMEM; + } + + mutex_lock(&connection->resource->conf_update); + old_disk_conf = device->ldev->disk_conf; + *new_disk_conf = *old_disk_conf; + new_disk_conf->disk_size = p_usize; + + rcu_assign_pointer(device->ldev->disk_conf, new_disk_conf); + mutex_unlock(&connection->resource->conf_update); + synchronize_rcu(); + kfree(old_disk_conf); + + drbd_info(device, "Peer sets u_size to %lu sectors\n", + (unsigned long)my_usize); + } + + put_ldev(device); + } + + device->peer_max_bio_size = be32_to_cpu(p->max_bio_size); + /* Leave drbd_reconsider_max_bio_size() before drbd_determine_dev_size(). + In case we cleared the QUEUE_FLAG_DISCARD from our queue in + drbd_reconsider_max_bio_size(), we can be sure that after + drbd_determine_dev_size() no REQ_DISCARDs are in the queue. */ + + ddsf = be16_to_cpu(p->dds_flags); + if (get_ldev(device)) { + drbd_reconsider_max_bio_size(device, device->ldev); + dd = drbd_determine_dev_size(device, ddsf, NULL); + put_ldev(device); + if (dd == DS_ERROR) + return -EIO; + drbd_md_sync(device); + } else { + /* + * I am diskless, need to accept the peer's *current* size. + * I must NOT accept the peers backing disk size, + * it may have been larger than mine all along... + * + * At this point, the peer knows more about my disk, or at + * least about what we last agreed upon, than myself. + * So if his c_size is less than his d_size, the most likely + * reason is that *my* d_size was smaller last time we checked. 
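Three helpers sketching the size handling in this area: warn_if_differ_considerably() above flags sizes that differ by more than one eighth (12.5 %), the initial parameter exchange in receive_sizes() picks the smaller non-zero size request, and a diskless node falls back from the peer's current size to its requested size to its backing size (the fallback chain appears just below). The helper names are this sketch's own:

#include <stdbool.h>
#include <stdint.h>

static bool sketch_differ_considerably(uint64_t a, uint64_t b)
{
        uint64_t d;

        if (a == 0 || b == 0)
                return false;           /* zero means "unknown", never warn */
        d = (a > b) ? a - b : b - a;
        return d > (a >> 3) || d > (b >> 3);
}

static uint64_t sketch_min_not_zero(uint64_t a, uint64_t b)
{
        if (a == 0)
                return b;
        if (b == 0)
                return a;
        return a < b ? a : b;
}

/* mirrors the "peer current size, else requested size, else backing size"
 * fallback used for a diskless node just below */
static uint64_t sketch_diskless_capacity(uint64_t peer_current,
                                         uint64_t peer_requested,
                                         uint64_t peer_backing)
{
        if (peer_current)
                return peer_current;
        if (peer_requested)
                return peer_requested;
        return peer_backing;
}

For instance, 1000 and 1200 sectors differ by 200, which exceeds 1000/8 = 125, so the "considerable difference" warning would fire.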
+ * + * However, if he sends a zero current size, + * take his (user-capped or) backing disk size anyways. + */ + drbd_reconsider_max_bio_size(device, NULL); + drbd_set_my_capacity(device, p_csize ?: p_usize ?: p_size); + } + + if (get_ldev(device)) { + if (device->ldev->known_size != drbd_get_capacity(device->ldev->backing_bdev)) { + device->ldev->known_size = drbd_get_capacity(device->ldev->backing_bdev); + ldsc = 1; + } + + put_ldev(device); + } + + if (device->state.conn > C_WF_REPORT_PARAMS) { + if (be64_to_cpu(p->c_size) != + drbd_get_capacity(device->this_bdev) || ldsc) { + /* we have different sizes, probably peer + * needs to know my new size... */ + drbd_send_sizes(peer_device, 0, ddsf); + } + if (test_and_clear_bit(RESIZE_PENDING, &device->flags) || + (dd == DS_GREW && device->state.conn == C_CONNECTED)) { + if (device->state.pdsk >= D_INCONSISTENT && + device->state.disk >= D_INCONSISTENT) { + if (ddsf & DDSF_NO_RESYNC) + drbd_info(device, "Resync of new storage suppressed with --assume-clean\n"); + else + resync_after_online_grow(device); + } else + set_bit(RESYNC_AFTER_NEG, &device->flags); + } + } + + return 0; +} + +static int receive_uuids(struct drbd_connection *connection, struct packet_info *pi) +{ + struct drbd_peer_device *peer_device; + struct drbd_device *device; + struct p_uuids *p = pi->data; + u64 *p_uuid; + int i, updated_uuids = 0; + + peer_device = conn_peer_device(connection, pi->vnr); + if (!peer_device) + return config_unknown_volume(connection, pi); + device = peer_device->device; + + p_uuid = kmalloc(sizeof(u64)*UI_EXTENDED_SIZE, GFP_NOIO); + if (!p_uuid) { + drbd_err(device, "kmalloc of p_uuid failed\n"); + return false; + } + + for (i = UI_CURRENT; i < UI_EXTENDED_SIZE; i++) + p_uuid[i] = be64_to_cpu(p->uuid[i]); + + kfree(device->p_uuid); + device->p_uuid = p_uuid; + + if (device->state.conn < C_CONNECTED && + device->state.disk < D_INCONSISTENT && + device->state.role == R_PRIMARY && + (device->ed_uuid & ~((u64)1)) != (p_uuid[UI_CURRENT] & ~((u64)1))) { + drbd_err(device, "Can only connect to data with current UUID=%016llX\n", + (unsigned long long)device->ed_uuid); + conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD); + return -EIO; + } + + if (get_ldev(device)) { + int skip_initial_sync = + device->state.conn == C_CONNECTED && + peer_device->connection->agreed_pro_version >= 90 && + device->ldev->md.uuid[UI_CURRENT] == UUID_JUST_CREATED && + (p_uuid[UI_FLAGS] & 8); + if (skip_initial_sync) { + drbd_info(device, "Accepted new current UUID, preparing to skip initial sync\n"); + drbd_bitmap_io(device, &drbd_bmio_clear_n_write, + "clear_n_write from receive_uuids", + BM_LOCKED_TEST_ALLOWED); + _drbd_uuid_set(device, UI_CURRENT, p_uuid[UI_CURRENT]); + _drbd_uuid_set(device, UI_BITMAP, 0); + _drbd_set_state(_NS2(device, disk, D_UP_TO_DATE, pdsk, D_UP_TO_DATE), + CS_VERBOSE, NULL); + drbd_md_sync(device); + updated_uuids = 1; + } + put_ldev(device); + } else if (device->state.disk < D_INCONSISTENT && + device->state.role == R_PRIMARY) { + /* I am a diskless primary, the peer just created a new current UUID + for me. */ + updated_uuids = drbd_set_ed_uuid(device, p_uuid[UI_CURRENT]); + } + + /* Before we test for the disk state, we should wait until an eventually + ongoing cluster wide state change is finished. That is important if + we are primary and are detaching from our disk. We need to see the + new disk state... 
*/ + mutex_lock(device->state_mutex); + mutex_unlock(device->state_mutex); + if (device->state.conn >= C_CONNECTED && device->state.disk < D_INCONSISTENT) + updated_uuids |= drbd_set_ed_uuid(device, p_uuid[UI_CURRENT]); + + if (updated_uuids) + drbd_print_uuids(device, "receiver updated UUIDs to"); + + return 0; +} + +/** + * convert_state() - Converts the peer's view of the cluster state to our point of view + * @ps: The state as seen by the peer. + */ +static union drbd_state convert_state(union drbd_state ps) +{ + union drbd_state ms; + + static enum drbd_conns c_tab[] = { + [C_WF_REPORT_PARAMS] = C_WF_REPORT_PARAMS, + [C_CONNECTED] = C_CONNECTED, + + [C_STARTING_SYNC_S] = C_STARTING_SYNC_T, + [C_STARTING_SYNC_T] = C_STARTING_SYNC_S, + [C_DISCONNECTING] = C_TEAR_DOWN, /* C_NETWORK_FAILURE, */ + [C_VERIFY_S] = C_VERIFY_T, + [C_MASK] = C_MASK, + }; + + ms.i = ps.i; + + ms.conn = c_tab[ps.conn]; + ms.peer = ps.role; + ms.role = ps.peer; + ms.pdsk = ps.disk; + ms.disk = ps.pdsk; + ms.peer_isp = (ps.aftr_isp | ps.user_isp); + + return ms; +} + +static int receive_req_state(struct drbd_connection *connection, struct packet_info *pi) +{ + struct drbd_peer_device *peer_device; + struct drbd_device *device; + struct p_req_state *p = pi->data; + union drbd_state mask, val; + enum drbd_state_rv rv; + + peer_device = conn_peer_device(connection, pi->vnr); + if (!peer_device) + return -EIO; + device = peer_device->device; + + mask.i = be32_to_cpu(p->mask); + val.i = be32_to_cpu(p->val); + + if (test_bit(RESOLVE_CONFLICTS, &peer_device->connection->flags) && + mutex_is_locked(device->state_mutex)) { + drbd_send_sr_reply(peer_device, SS_CONCURRENT_ST_CHG); + return 0; + } + + mask = convert_state(mask); + val = convert_state(val); + + rv = drbd_change_state(device, CS_VERBOSE, mask, val); + drbd_send_sr_reply(peer_device, rv); + + drbd_md_sync(device); + + return 0; +} + +static int receive_req_conn_state(struct drbd_connection *connection, struct packet_info *pi) +{ + struct p_req_state *p = pi->data; + union drbd_state mask, val; + enum drbd_state_rv rv; + + mask.i = be32_to_cpu(p->mask); + val.i = be32_to_cpu(p->val); + + if (test_bit(RESOLVE_CONFLICTS, &connection->flags) && + mutex_is_locked(&connection->cstate_mutex)) { + conn_send_sr_reply(connection, SS_CONCURRENT_ST_CHG); + return 0; + } + + mask = convert_state(mask); + val = convert_state(val); + + rv = conn_request_state(connection, mask, val, CS_VERBOSE | CS_LOCAL_ONLY | CS_IGN_OUTD_FAIL); + conn_send_sr_reply(connection, rv); + + return 0; +} + +static int receive_state(struct drbd_connection *connection, struct packet_info *pi) +{ + struct drbd_peer_device *peer_device; + struct drbd_device *device; + struct p_state *p = pi->data; + union drbd_state os, ns, peer_state; + enum drbd_disk_state real_peer_disk; + enum chg_state_flags cs_flags; + int rv; + + peer_device = conn_peer_device(connection, pi->vnr); + if (!peer_device) + return config_unknown_volume(connection, pi); + device = peer_device->device; + + peer_state.i = be32_to_cpu(p->state); + + real_peer_disk = peer_state.disk; + if (peer_state.disk == D_NEGOTIATING) { + real_peer_disk = device->p_uuid[UI_FLAGS] & 4 ? 
D_INCONSISTENT : D_CONSISTENT; + drbd_info(device, "real peer disk state = %s\n", drbd_disk_str(real_peer_disk)); + } + + spin_lock_irq(&device->resource->req_lock); + retry: + os = ns = drbd_read_state(device); + spin_unlock_irq(&device->resource->req_lock); + + /* If some other part of the code (asender thread, timeout) + * already decided to close the connection again, + * we must not "re-establish" it here. */ + if (os.conn <= C_TEAR_DOWN) + return -ECONNRESET; + + /* If this is the "end of sync" confirmation, usually the peer disk + * transitions from D_INCONSISTENT to D_UP_TO_DATE. For empty (0 bits + * set) resync started in PausedSyncT, or if the timing of pause-/ + * unpause-sync events has been "just right", the peer disk may + * transition from D_CONSISTENT to D_UP_TO_DATE as well. + */ + if ((os.pdsk == D_INCONSISTENT || os.pdsk == D_CONSISTENT) && + real_peer_disk == D_UP_TO_DATE && + os.conn > C_CONNECTED && os.disk == D_UP_TO_DATE) { + /* If we are (becoming) SyncSource, but peer is still in sync + * preparation, ignore its uptodate-ness to avoid flapping, it + * will change to inconsistent once the peer reaches active + * syncing states. + * It may have changed syncer-paused flags, however, so we + * cannot ignore this completely. */ + if (peer_state.conn > C_CONNECTED && + peer_state.conn < C_SYNC_SOURCE) + real_peer_disk = D_INCONSISTENT; + + /* if peer_state changes to connected at the same time, + * it explicitly notifies us that it finished resync. + * Maybe we should finish it up, too? */ + else if (os.conn >= C_SYNC_SOURCE && + peer_state.conn == C_CONNECTED) { + if (drbd_bm_total_weight(device) <= device->rs_failed) + drbd_resync_finished(device); + return 0; + } + } + + /* explicit verify finished notification, stop sector reached. */ + if (os.conn == C_VERIFY_T && os.disk == D_UP_TO_DATE && + peer_state.conn == C_CONNECTED && real_peer_disk == D_UP_TO_DATE) { + ov_out_of_sync_print(device); + drbd_resync_finished(device); + return 0; + } + + /* peer says his disk is inconsistent, while we think it is uptodate, + * and this happens while the peer still thinks we have a sync going on, + * but we think we are already done with the sync. + * We ignore this to avoid flapping pdsk. + * This should not happen, if the peer is a recent version of drbd. 
*/ + if (os.pdsk == D_UP_TO_DATE && real_peer_disk == D_INCONSISTENT && + os.conn == C_CONNECTED && peer_state.conn > C_SYNC_SOURCE) + real_peer_disk = D_UP_TO_DATE; + + if (ns.conn == C_WF_REPORT_PARAMS) + ns.conn = C_CONNECTED; + + if (peer_state.conn == C_AHEAD) + ns.conn = C_BEHIND; + + if (device->p_uuid && peer_state.disk >= D_NEGOTIATING && + get_ldev_if_state(device, D_NEGOTIATING)) { + int cr; /* consider resync */ + + /* if we established a new connection */ + cr = (os.conn < C_CONNECTED); + /* if we had an established connection + * and one of the nodes newly attaches a disk */ + cr |= (os.conn == C_CONNECTED && + (peer_state.disk == D_NEGOTIATING || + os.disk == D_NEGOTIATING)); + /* if we have both been inconsistent, and the peer has been + * forced to be UpToDate with --overwrite-data */ + cr |= test_bit(CONSIDER_RESYNC, &device->flags); + /* if we had been plain connected, and the admin requested to + * start a sync by "invalidate" or "invalidate-remote" */ + cr |= (os.conn == C_CONNECTED && + (peer_state.conn >= C_STARTING_SYNC_S && + peer_state.conn <= C_WF_BITMAP_T)); + + if (cr) + ns.conn = drbd_sync_handshake(peer_device, peer_state.role, real_peer_disk); + + put_ldev(device); + if (ns.conn == C_MASK) { + ns.conn = C_CONNECTED; + if (device->state.disk == D_NEGOTIATING) { + drbd_force_state(device, NS(disk, D_FAILED)); + } else if (peer_state.disk == D_NEGOTIATING) { + drbd_err(device, "Disk attach process on the peer node was aborted.\n"); + peer_state.disk = D_DISKLESS; + real_peer_disk = D_DISKLESS; + } else { + if (test_and_clear_bit(CONN_DRY_RUN, &peer_device->connection->flags)) + return -EIO; + D_ASSERT(device, os.conn == C_WF_REPORT_PARAMS); + conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD); + return -EIO; + } + } + } + + spin_lock_irq(&device->resource->req_lock); + if (os.i != drbd_read_state(device).i) + goto retry; + clear_bit(CONSIDER_RESYNC, &device->flags); + ns.peer = peer_state.role; + ns.pdsk = real_peer_disk; + ns.peer_isp = (peer_state.aftr_isp | peer_state.user_isp); + if ((ns.conn == C_CONNECTED || ns.conn == C_WF_BITMAP_S) && ns.disk == D_NEGOTIATING) + ns.disk = device->new_state_tmp.disk; + cs_flags = CS_VERBOSE + (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED ? 0 : CS_HARD); + if (ns.pdsk == D_CONSISTENT && drbd_suspended(device) && ns.conn == C_CONNECTED && os.conn < C_CONNECTED && + test_bit(NEW_CUR_UUID, &device->flags)) { + /* Do not allow tl_restart(RESEND) for a rebooted peer. We can only allow this + for temporal network outages! */ + spin_unlock_irq(&device->resource->req_lock); + drbd_err(device, "Aborting Connect, can not thaw IO with an only Consistent peer\n"); + tl_clear(peer_device->connection); + drbd_uuid_new_current(device); + clear_bit(NEW_CUR_UUID, &device->flags); + conn_request_state(peer_device->connection, NS2(conn, C_PROTOCOL_ERROR, susp, 0), CS_HARD); + return -EIO; + } + rv = _drbd_set_state(device, ns, cs_flags, NULL); + ns = drbd_read_state(device); + spin_unlock_irq(&device->resource->req_lock); + + if (rv < SS_SUCCESS) { + conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD); + return -EIO; + } + + if (os.conn > C_WF_REPORT_PARAMS) { + if (ns.conn > C_CONNECTED && peer_state.conn <= C_CONNECTED && + peer_state.disk != D_NEGOTIATING ) { + /* we want resync, peer has not yet decided to sync... 
*/ + /* Nowadays only used when forcing a node into primary role and + setting its disk to UpToDate with that */ + drbd_send_uuids(peer_device); + drbd_send_current_state(peer_device); + } + } + + clear_bit(DISCARD_MY_DATA, &device->flags); + + drbd_md_sync(device); /* update connected indicator, la_size_sect, ... */ + + return 0; +} + +static int receive_sync_uuid(struct drbd_connection *connection, struct packet_info *pi) +{ + struct drbd_peer_device *peer_device; + struct drbd_device *device; + struct p_rs_uuid *p = pi->data; + + peer_device = conn_peer_device(connection, pi->vnr); + if (!peer_device) + return -EIO; + device = peer_device->device; + + wait_event(device->misc_wait, + device->state.conn == C_WF_SYNC_UUID || + device->state.conn == C_BEHIND || + device->state.conn < C_CONNECTED || + device->state.disk < D_NEGOTIATING); + + /* D_ASSERT(device, device->state.conn == C_WF_SYNC_UUID ); */ + + /* Here the _drbd_uuid_ functions are right, current should + _not_ be rotated into the history */ + if (get_ldev_if_state(device, D_NEGOTIATING)) { + _drbd_uuid_set(device, UI_CURRENT, be64_to_cpu(p->uuid)); + _drbd_uuid_set(device, UI_BITMAP, 0UL); + + drbd_print_uuids(device, "updated sync uuid"); + drbd_start_resync(device, C_SYNC_TARGET); + + put_ldev(device); + } else + drbd_err(device, "Ignoring SyncUUID packet!\n"); + + return 0; +} + +/** + * receive_bitmap_plain + * + * Return 0 when done, 1 when another iteration is needed, and a negative error + * code upon failure. + */ +static int +receive_bitmap_plain(struct drbd_peer_device *peer_device, unsigned int size, + unsigned long *p, struct bm_xfer_ctx *c) +{ + unsigned int data_size = DRBD_SOCKET_BUFFER_SIZE - + drbd_header_size(peer_device->connection); + unsigned int num_words = min_t(size_t, data_size / sizeof(*p), + c->bm_words - c->word_offset); + unsigned int want = num_words * sizeof(*p); + int err; + + if (want != size) { + drbd_err(peer_device, "%s:want (%u) != size (%u)\n", __func__, want, size); + return -EIO; + } + if (want == 0) + return 0; + err = drbd_recv_all(peer_device->connection, p, want); + if (err) + return err; + + drbd_bm_merge_lel(peer_device->device, c->word_offset, num_words, p); + + c->word_offset += num_words; + c->bit_offset = c->word_offset * BITS_PER_LONG; + if (c->bit_offset > c->bm_bits) + c->bit_offset = c->bm_bits; + + return 1; +} + +static enum drbd_bitmap_code dcbp_get_code(struct p_compressed_bm *p) +{ + return (enum drbd_bitmap_code)(p->encoding & 0x0f); +} + +static int dcbp_get_start(struct p_compressed_bm *p) +{ + return (p->encoding & 0x80) != 0; +} + +static int dcbp_get_pad_bits(struct p_compressed_bm *p) +{ + return (p->encoding >> 4) & 0x7; +} + +/** + * recv_bm_rle_bits + * + * Return 0 when done, 1 when another iteration is needed, and a negative error + * code upon failure. 
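recv_bm_rle_bits() below walks a VLI-encoded bitstream of run lengths that alternately describe runs of clear and of set bits; only the set runs touch the bitmap. A minimal sketch of that toggle/run-length idea, with the VLI decoding left out and the run lengths passed in as a plain array (an assumption of this sketch):

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

static void sketch_bm_set_bit(uint8_t *bm, uint64_t bit)
{
        bm[bit / 8] |= (uint8_t)(1u << (bit % 8));
}

/* returns 0 on success, -1 if the runs overflow the bitmap */
static int sketch_apply_rle_runs(uint8_t *bm, uint64_t bm_bits,
                                 const uint64_t *runs, size_t nr_runs,
                                 bool first_run_is_set)
{
        uint64_t bit = 0;
        bool set = first_run_is_set;
        size_t i;

        for (i = 0; i < nr_runs; i++, set = !set) {
                uint64_t rl = runs[i];

                if (rl == 0 || bit + rl > bm_bits)
                        return -1;      /* malformed or overflowing run */
                if (set) {
                        uint64_t e = bit + rl;

                        for (; bit < e; bit++)
                                sketch_bm_set_bit(bm, bit);
                } else {
                        bit += rl;      /* clear run: just advance */
                }
        }
        return 0;
}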
+ */ +static int +recv_bm_rle_bits(struct drbd_peer_device *peer_device, + struct p_compressed_bm *p, + struct bm_xfer_ctx *c, + unsigned int len) +{ + struct bitstream bs; + u64 look_ahead; + u64 rl; + u64 tmp; + unsigned long s = c->bit_offset; + unsigned long e; + int toggle = dcbp_get_start(p); + int have; + int bits; + + bitstream_init(&bs, p->code, len, dcbp_get_pad_bits(p)); + + bits = bitstream_get_bits(&bs, &look_ahead, 64); + if (bits < 0) + return -EIO; + + for (have = bits; have > 0; s += rl, toggle = !toggle) { + bits = vli_decode_bits(&rl, look_ahead); + if (bits <= 0) + return -EIO; + + if (toggle) { + e = s + rl -1; + if (e >= c->bm_bits) { + drbd_err(peer_device, "bitmap overflow (e:%lu) while decoding bm RLE packet\n", e); + return -EIO; + } + _drbd_bm_set_bits(peer_device->device, s, e); + } + + if (have < bits) { + drbd_err(peer_device, "bitmap decoding error: h:%d b:%d la:0x%08llx l:%u/%u\n", + have, bits, look_ahead, + (unsigned int)(bs.cur.b - p->code), + (unsigned int)bs.buf_len); + return -EIO; + } + /* if we consumed all 64 bits, assign 0; >> 64 is "undefined"; */ + if (likely(bits < 64)) + look_ahead >>= bits; + else + look_ahead = 0; + have -= bits; + + bits = bitstream_get_bits(&bs, &tmp, 64 - have); + if (bits < 0) + return -EIO; + look_ahead |= tmp << have; + have += bits; + } + + c->bit_offset = s; + bm_xfer_ctx_bit_to_word_offset(c); + + return (s != c->bm_bits); +} + +/** + * decode_bitmap_c + * + * Return 0 when done, 1 when another iteration is needed, and a negative error + * code upon failure. + */ +static int +decode_bitmap_c(struct drbd_peer_device *peer_device, + struct p_compressed_bm *p, + struct bm_xfer_ctx *c, + unsigned int len) +{ + if (dcbp_get_code(p) == RLE_VLI_Bits) + return recv_bm_rle_bits(peer_device, p, c, len - sizeof(*p)); + + /* other variants had been implemented for evaluation, + * but have been dropped as this one turned out to be "best" + * during all our tests. */ + + drbd_err(peer_device, "receive_bitmap_c: unknown encoding %u\n", p->encoding); + conn_request_state(peer_device->connection, NS(conn, C_PROTOCOL_ERROR), CS_HARD); + return -EIO; +} + +void INFO_bm_xfer_stats(struct drbd_device *device, + const char *direction, struct bm_xfer_ctx *c) +{ + /* what would it take to transfer it "plaintext" */ + unsigned int header_size = drbd_header_size(first_peer_device(device)->connection); + unsigned int data_size = DRBD_SOCKET_BUFFER_SIZE - header_size; + unsigned int plain = + header_size * (DIV_ROUND_UP(c->bm_words, data_size) + 1) + + c->bm_words * sizeof(unsigned long); + unsigned int total = c->bytes[0] + c->bytes[1]; + unsigned int r; + + /* total can not be zero. but just in case: */ + if (total == 0) + return; + + /* don't report if not compressed */ + if (total >= plain) + return; + + /* total < plain. check for overflow, still */ + r = (total > UINT_MAX/1000) ? (total / (plain/1000)) + : (1000 * total / plain); + + if (r > 1000) + r = 1000; + + r = 1000 - r; + drbd_info(device, "%s bitmap stats [Bytes(packets)]: plain %u(%u), RLE %u(%u), " + "total %u; compression: %u.%u%%\n", + direction, + c->bytes[1], c->packets[1], + c->bytes[0], c->packets[0], + total, r/10, r % 10); +} + +/* Since we are processing the bitfield from lower addresses to higher, + it does not matter if the process it in 32 bit chunks or 64 bit + chunks as long as it is little endian. (Understand it as byte stream, + beginning with the lowest byte...) 
If we would use big endian + we would need to process it from the highest address to the lowest, + in order to be agnostic to the 32 vs 64 bits issue. + + returns 0 on failure, 1 if we successfully received it. */ +static int receive_bitmap(struct drbd_connection *connection, struct packet_info *pi) +{ + struct drbd_peer_device *peer_device; + struct drbd_device *device; + struct bm_xfer_ctx c; + int err; + + peer_device = conn_peer_device(connection, pi->vnr); + if (!peer_device) + return -EIO; + device = peer_device->device; + + drbd_bm_lock(device, "receive bitmap", BM_LOCKED_SET_ALLOWED); + /* you are supposed to send additional out-of-sync information + * if you actually set bits during this phase */ + + c = (struct bm_xfer_ctx) { + .bm_bits = drbd_bm_bits(device), + .bm_words = drbd_bm_words(device), + }; + + for(;;) { + if (pi->cmd == P_BITMAP) + err = receive_bitmap_plain(peer_device, pi->size, pi->data, &c); + else if (pi->cmd == P_COMPRESSED_BITMAP) { + /* MAYBE: sanity check that we speak proto >= 90, + * and the feature is enabled! */ + struct p_compressed_bm *p = pi->data; + + if (pi->size > DRBD_SOCKET_BUFFER_SIZE - drbd_header_size(connection)) { + drbd_err(device, "ReportCBitmap packet too large\n"); + err = -EIO; + goto out; + } + if (pi->size <= sizeof(*p)) { + drbd_err(device, "ReportCBitmap packet too small (l:%u)\n", pi->size); + err = -EIO; + goto out; + } + err = drbd_recv_all(peer_device->connection, p, pi->size); + if (err) + goto out; + err = decode_bitmap_c(peer_device, p, &c, pi->size); + } else { + drbd_warn(device, "receive_bitmap: cmd neither ReportBitMap nor ReportCBitMap (is 0x%x)", pi->cmd); + err = -EIO; + goto out; + } + + c.packets[pi->cmd == P_BITMAP]++; + c.bytes[pi->cmd == P_BITMAP] += drbd_header_size(connection) + pi->size; + + if (err <= 0) { + if (err < 0) + goto out; + break; + } + err = drbd_recv_header(peer_device->connection, pi); + if (err) + goto out; + } + + INFO_bm_xfer_stats(device, "receive", &c); + + if (device->state.conn == C_WF_BITMAP_T) { + enum drbd_state_rv rv; + + err = drbd_send_bitmap(device); + if (err) + goto out; + /* Omit CS_ORDERED with this state transition to avoid deadlocks. 
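INFO_bm_xfer_stats() above reports the saving of the compressed bitmap transfer in tenths of a percent and reorders the division when "total * 1000" would overflow a 32-bit value. A sketch of just that arithmetic:

#include <limits.h>

/* returns the saving in permille (0..1000); 0 if nothing was saved */
static unsigned int sketch_compression_permille(unsigned int total,
                                                unsigned int plain)
{
        unsigned int r;

        if (plain == 0 || total >= plain)
                return 0;               /* not compressed, or nothing sent */

        r = (total > UINT_MAX / 1000) ? total / (plain / 1000)
                                      : 1000 * total / plain;
        if (r > 1000)
                r = 1000;
        return 1000 - r;
}

For example, 250 compressed bytes against an estimated 1000 plain bytes yields 750 permille, which the driver would print as 75.0 %.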
*/ + rv = _drbd_request_state(device, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE); + D_ASSERT(device, rv == SS_SUCCESS); + } else if (device->state.conn != C_WF_BITMAP_S) { + /* admin may have requested C_DISCONNECTING, + * other threads may have noticed network errors */ + drbd_info(device, "unexpected cstate (%s) in receive_bitmap\n", + drbd_conn_str(device->state.conn)); + } + err = 0; + + out: + drbd_bm_unlock(device); + if (!err && device->state.conn == C_WF_BITMAP_S) + drbd_start_resync(device, C_SYNC_SOURCE); + return err; +} + +static int receive_skip(struct drbd_connection *connection, struct packet_info *pi) +{ + drbd_warn(connection, "skipping unknown optional packet type %d, l: %d!\n", + pi->cmd, pi->size); + + return ignore_remaining_packet(connection, pi); +} + +static int receive_UnplugRemote(struct drbd_connection *connection, struct packet_info *pi) +{ + /* Make sure we've acked all the TCP data associated + * with the data requests being unplugged */ + drbd_tcp_quickack(connection->data.socket); + + return 0; +} + +static int receive_out_of_sync(struct drbd_connection *connection, struct packet_info *pi) +{ + struct drbd_peer_device *peer_device; + struct drbd_device *device; + struct p_block_desc *p = pi->data; + + peer_device = conn_peer_device(connection, pi->vnr); + if (!peer_device) + return -EIO; + device = peer_device->device; + + switch (device->state.conn) { + case C_WF_SYNC_UUID: + case C_WF_BITMAP_T: + case C_BEHIND: + break; + default: + drbd_err(device, "ASSERT FAILED cstate = %s, expected: WFSyncUUID|WFBitMapT|Behind\n", + drbd_conn_str(device->state.conn)); + } + + drbd_set_out_of_sync(device, be64_to_cpu(p->sector), be32_to_cpu(p->blksize)); + + return 0; +} + +struct data_cmd { + int expect_payload; + size_t pkt_size; + int (*fn)(struct drbd_connection *, struct packet_info *); +}; + +static struct data_cmd drbd_cmd_handler[] = { + [P_DATA] = { 1, sizeof(struct p_data), receive_Data }, + [P_DATA_REPLY] = { 1, sizeof(struct p_data), receive_DataReply }, + [P_RS_DATA_REPLY] = { 1, sizeof(struct p_data), receive_RSDataReply } , + [P_BARRIER] = { 0, sizeof(struct p_barrier), receive_Barrier } , + [P_BITMAP] = { 1, 0, receive_bitmap } , + [P_COMPRESSED_BITMAP] = { 1, 0, receive_bitmap } , + [P_UNPLUG_REMOTE] = { 0, 0, receive_UnplugRemote }, + [P_DATA_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest }, + [P_RS_DATA_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest }, + [P_SYNC_PARAM] = { 1, 0, receive_SyncParam }, + [P_SYNC_PARAM89] = { 1, 0, receive_SyncParam }, + [P_PROTOCOL] = { 1, sizeof(struct p_protocol), receive_protocol }, + [P_UUIDS] = { 0, sizeof(struct p_uuids), receive_uuids }, + [P_SIZES] = { 0, sizeof(struct p_sizes), receive_sizes }, + [P_STATE] = { 0, sizeof(struct p_state), receive_state }, + [P_STATE_CHG_REQ] = { 0, sizeof(struct p_req_state), receive_req_state }, + [P_SYNC_UUID] = { 0, sizeof(struct p_rs_uuid), receive_sync_uuid }, + [P_OV_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest }, + [P_OV_REPLY] = { 1, sizeof(struct p_block_req), receive_DataRequest }, + [P_CSUM_RS_REQUEST] = { 1, sizeof(struct p_block_req), receive_DataRequest }, + [P_DELAY_PROBE] = { 0, sizeof(struct p_delay_probe93), receive_skip }, + [P_OUT_OF_SYNC] = { 0, sizeof(struct p_block_desc), receive_out_of_sync }, + [P_CONN_ST_CHG_REQ] = { 0, sizeof(struct p_req_state), receive_req_conn_state }, + [P_PROTOCOL_UPDATE] = { 1, sizeof(struct p_protocol), receive_protocol }, + [P_TRIM] = { 0, sizeof(struct p_trim), receive_Data }, +}; 
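The handler table above and the drbdd() loop that follows implement a classic dispatch pattern: every packet type declares its fixed sub-header size and whether a trailing payload is legal, and anything that does not match is rejected before its handler runs. A self-contained sketch of that pattern; the types and the two example commands are inventions of this sketch:

#include <stddef.h>
#include <stdio.h>

struct sketch_packet {
        unsigned int cmd;
        unsigned int size;      /* payload size following the common header */
        void *data;
};

struct sketch_cmd {
        int expect_payload;
        size_t pkt_size;
        int (*fn)(struct sketch_packet *pi);
};

static int sketch_handle_ping(struct sketch_packet *pi) { (void)pi; return 0; }
static int sketch_handle_data(struct sketch_packet *pi) { (void)pi; return 0; }

enum { SKETCH_P_PING, SKETCH_P_DATA, SKETCH_P_MAX };

static const struct sketch_cmd sketch_cmds[SKETCH_P_MAX] = {
        [SKETCH_P_PING] = { 0, 0,                 sketch_handle_ping },
        [SKETCH_P_DATA] = { 1, sizeof(long long), sketch_handle_data },
};

/* returns the handler result, or -1 for unknown or malformed packets */
static int sketch_dispatch(struct sketch_packet *pi)
{
        const struct sketch_cmd *cmd;

        if (pi->cmd >= SKETCH_P_MAX || !sketch_cmds[pi->cmd].fn) {
                fprintf(stderr, "unexpected packet 0x%04x\n", pi->cmd);
                return -1;
        }
        cmd = &sketch_cmds[pi->cmd];

        /* a packet may only carry more than its fixed header when the
         * command explicitly expects a payload */
        if (pi->size > cmd->pkt_size && !cmd->expect_payload) {
                fprintf(stderr, "no payload expected for 0x%04x\n", pi->cmd);
                return -1;
        }
        return cmd->fn(pi);
}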
+ +static void drbdd(struct drbd_connection *connection) +{ + struct packet_info pi; + size_t shs; /* sub header size */ + int err; + + while (get_t_state(&connection->receiver) == RUNNING) { + struct data_cmd *cmd; + + drbd_thread_current_set_cpu(&connection->receiver); + update_receiver_timing_details(connection, drbd_recv_header); + if (drbd_recv_header(connection, &pi)) + goto err_out; + + cmd = &drbd_cmd_handler[pi.cmd]; + if (unlikely(pi.cmd >= ARRAY_SIZE(drbd_cmd_handler) || !cmd->fn)) { + drbd_err(connection, "Unexpected data packet %s (0x%04x)", + cmdname(pi.cmd), pi.cmd); + goto err_out; + } + + shs = cmd->pkt_size; + if (pi.size > shs && !cmd->expect_payload) { + drbd_err(connection, "No payload expected %s l:%d\n", + cmdname(pi.cmd), pi.size); + goto err_out; + } + + if (shs) { + update_receiver_timing_details(connection, drbd_recv_all_warn); + err = drbd_recv_all_warn(connection, pi.data, shs); + if (err) + goto err_out; + pi.size -= shs; + } + + update_receiver_timing_details(connection, cmd->fn); + err = cmd->fn(connection, &pi); + if (err) { + drbd_err(connection, "error receiving %s, e: %d l: %d!\n", + cmdname(pi.cmd), err, pi.size); + goto err_out; + } + } + return; + + err_out: + conn_request_state(connection, NS(conn, C_PROTOCOL_ERROR), CS_HARD); +} + +static void conn_disconnect(struct drbd_connection *connection) +{ + struct drbd_peer_device *peer_device; + enum drbd_conns oc; + int vnr; + + if (connection->cstate == C_STANDALONE) + return; + + /* We are about to start the cleanup after connection loss. + * Make sure drbd_make_request knows about that. + * Usually we should be in some network failure state already, + * but just in case we are not, we fix it up here. + */ + conn_request_state(connection, NS(conn, C_NETWORK_FAILURE), CS_HARD); + + /* asender does not clean up anything. it must not interfere, either */ + drbd_thread_stop(&connection->asender); + drbd_free_sock(connection); + + rcu_read_lock(); + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { + struct drbd_device *device = peer_device->device; + kref_get(&device->kref); + rcu_read_unlock(); + drbd_disconnected(peer_device); + kref_put(&device->kref, drbd_destroy_device); + rcu_read_lock(); + } + rcu_read_unlock(); + + if (!list_empty(&connection->current_epoch->list)) + drbd_err(connection, "ASSERTION FAILED: connection->current_epoch->list not empty\n"); + /* ok, no more ee's on the fly, it is safe to reset the epoch_size */ + atomic_set(&connection->current_epoch->epoch_size, 0); + connection->send.seen_any_write_yet = false; + + drbd_info(connection, "Connection closed\n"); + + if (conn_highest_role(connection) == R_PRIMARY && conn_highest_pdsk(connection) >= D_UNKNOWN) + conn_try_outdate_peer_async(connection); + + spin_lock_irq(&connection->resource->req_lock); + oc = connection->cstate; + if (oc >= C_UNCONNECTED) + _conn_request_state(connection, NS(conn, C_UNCONNECTED), CS_VERBOSE); + + spin_unlock_irq(&connection->resource->req_lock); + + if (oc == C_DISCONNECTING) + conn_request_state(connection, NS(conn, C_STANDALONE), CS_VERBOSE | CS_HARD); +} + +static int drbd_disconnected(struct drbd_peer_device *peer_device) +{ + struct drbd_device *device = peer_device->device; + unsigned int i; + + /* wait for current activity to cease. 
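+	   Draining active_ee, sync_ee and read_ee under req_lock makes sure
+	   no peer request is still in flight when the teardown below proceeds.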
*/ + spin_lock_irq(&device->resource->req_lock); + _drbd_wait_ee_list_empty(device, &device->active_ee); + _drbd_wait_ee_list_empty(device, &device->sync_ee); + _drbd_wait_ee_list_empty(device, &device->read_ee); + spin_unlock_irq(&device->resource->req_lock); + + /* We do not have data structures that would allow us to + * get the rs_pending_cnt down to 0 again. + * * On C_SYNC_TARGET we do not have any data structures describing + * the pending RSDataRequest's we have sent. + * * On C_SYNC_SOURCE there is no data structure that tracks + * the P_RS_DATA_REPLY blocks that we sent to the SyncTarget. + * And no, it is not the sum of the reference counts in the + * resync_LRU. The resync_LRU tracks the whole operation including + * the disk-IO, while the rs_pending_cnt only tracks the blocks + * on the fly. */ + drbd_rs_cancel_all(device); + device->rs_total = 0; + device->rs_failed = 0; + atomic_set(&device->rs_pending_cnt, 0); + wake_up(&device->misc_wait); + + del_timer_sync(&device->resync_timer); + resync_timer_fn((unsigned long)device); + + /* wait for all w_e_end_data_req, w_e_end_rsdata_req, w_send_barrier, + * w_make_resync_request etc. which may still be on the worker queue + * to be "canceled" */ + drbd_flush_workqueue(&peer_device->connection->sender_work); + + drbd_finish_peer_reqs(device); + + /* This second workqueue flush is necessary, since drbd_finish_peer_reqs() + might have issued a work again. The one before drbd_finish_peer_reqs() is + necessary to reclain net_ee in drbd_finish_peer_reqs(). */ + drbd_flush_workqueue(&peer_device->connection->sender_work); + + /* need to do it again, drbd_finish_peer_reqs() may have populated it + * again via drbd_try_clear_on_disk_bm(). */ + drbd_rs_cancel_all(device); + + kfree(device->p_uuid); + device->p_uuid = NULL; + + if (!drbd_suspended(device)) + tl_clear(peer_device->connection); + + drbd_md_sync(device); + + /* serialize with bitmap writeout triggered by the state change, + * if any. */ + wait_event(device->misc_wait, !test_bit(BITMAP_IO, &device->flags)); + + /* tcp_close and release of sendpage pages can be deferred. I don't + * want to use SO_LINGER, because apparently it can be deferred for + * more than 20 seconds (longest time I checked). + * + * Actually we don't care for exactly when the network stack does its + * put_page(), but release our reference on these pages right here. + */ + i = drbd_free_peer_reqs(device, &device->net_ee); + if (i) + drbd_info(device, "net_ee not empty, killed %u entries\n", i); + i = atomic_read(&device->pp_in_use_by_net); + if (i) + drbd_info(device, "pp_in_use_by_net = %d, expected 0\n", i); + i = atomic_read(&device->pp_in_use); + if (i) + drbd_info(device, "pp_in_use = %d, expected 0\n", i); + + D_ASSERT(device, list_empty(&device->read_ee)); + D_ASSERT(device, list_empty(&device->active_ee)); + D_ASSERT(device, list_empty(&device->sync_ee)); + D_ASSERT(device, list_empty(&device->done_ee)); + + return 0; +} + +/* + * We support PRO_VERSION_MIN to PRO_VERSION_MAX. The protocol version + * we can agree on is stored in agreed_pro_version. + * + * feature flags and the reserved array should be enough room for future + * enhancements of the handshake protocol, and possible plugins... + * + * for now, they are expected to be zero, but ignored. 
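+ *
+ * The version we finally agree on is min(PRO_VERSION_MAX, peer's max),
+ * provided the two advertised ranges overlap at all, and the agreed
+ * feature set is the intersection of PRO_FEATURES and the peer's
+ * feature_flags; see drbd_do_features() below.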
+ */ +static int drbd_send_features(struct drbd_connection *connection) +{ + struct drbd_socket *sock; + struct p_connection_features *p; + + sock = &connection->data; + p = conn_prepare_command(connection, sock); + if (!p) + return -EIO; + memset(p, 0, sizeof(*p)); + p->protocol_min = cpu_to_be32(PRO_VERSION_MIN); + p->protocol_max = cpu_to_be32(PRO_VERSION_MAX); + p->feature_flags = cpu_to_be32(PRO_FEATURES); + return conn_send_command(connection, sock, P_CONNECTION_FEATURES, sizeof(*p), NULL, 0); +} + +/* + * return values: + * 1 yes, we have a valid connection + * 0 oops, did not work out, please try again + * -1 peer talks different language, + * no point in trying again, please go standalone. + */ +static int drbd_do_features(struct drbd_connection *connection) +{ + /* ASSERT current == connection->receiver ... */ + struct p_connection_features *p; + const int expect = sizeof(struct p_connection_features); + struct packet_info pi; + int err; + + err = drbd_send_features(connection); + if (err) + return 0; + + err = drbd_recv_header(connection, &pi); + if (err) + return 0; + + if (pi.cmd != P_CONNECTION_FEATURES) { + drbd_err(connection, "expected ConnectionFeatures packet, received: %s (0x%04x)\n", + cmdname(pi.cmd), pi.cmd); + return -1; + } + + if (pi.size != expect) { + drbd_err(connection, "expected ConnectionFeatures length: %u, received: %u\n", + expect, pi.size); + return -1; + } + + p = pi.data; + err = drbd_recv_all_warn(connection, p, expect); + if (err) + return 0; + + p->protocol_min = be32_to_cpu(p->protocol_min); + p->protocol_max = be32_to_cpu(p->protocol_max); + if (p->protocol_max == 0) + p->protocol_max = p->protocol_min; + + if (PRO_VERSION_MAX < p->protocol_min || + PRO_VERSION_MIN > p->protocol_max) + goto incompat; + + connection->agreed_pro_version = min_t(int, PRO_VERSION_MAX, p->protocol_max); + connection->agreed_features = PRO_FEATURES & be32_to_cpu(p->feature_flags); + + drbd_info(connection, "Handshake successful: " + "Agreed network protocol version %d\n", connection->agreed_pro_version); + + drbd_info(connection, "Agreed to%ssupport TRIM on protocol level\n", + connection->agreed_features & FF_TRIM ? " " : " not "); + + return 1; + + incompat: + drbd_err(connection, "incompatible DRBD dialects: " + "I support %d-%d, peer supports %d-%d\n", + PRO_VERSION_MIN, PRO_VERSION_MAX, + p->protocol_min, p->protocol_max); + return -1; +} + +#if !defined(CONFIG_CRYPTO_HMAC) && !defined(CONFIG_CRYPTO_HMAC_MODULE) +static int drbd_do_auth(struct drbd_connection *connection) +{ + drbd_err(connection, "This kernel was build without CONFIG_CRYPTO_HMAC.\n"); + drbd_err(connection, "You need to disable 'cram-hmac-alg' in drbd.conf.\n"); + return -1; +} +#else +#define CHALLENGE_LEN 64 + +/* Return value: + 1 - auth succeeded, + 0 - failed, try again (network error), + -1 - auth failed, don't try again. +*/ + +static int drbd_do_auth(struct drbd_connection *connection) +{ + struct drbd_socket *sock; + char my_challenge[CHALLENGE_LEN]; /* 64 Bytes... */ + struct scatterlist sg; + char *response = NULL; + char *right_response = NULL; + char *peers_ch = NULL; + unsigned int key_len; + char secret[SHARED_SECRET_MAX]; /* 64 byte */ + unsigned int resp_size; + struct hash_desc desc; + struct packet_info pi; + struct net_conf *nc; + int err, rv; + + /* FIXME: Put the challenge/response into the preallocated socket buffer. 
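+
+   Exchange summary: send our random challenge, receive the peer's
+   (rejecting a peer that merely echoes our own challenge back), answer
+   with HMAC(shared secret, peer's challenge), and verify the peer's
+   response against HMAC(shared secret, our own challenge).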
*/ + + rcu_read_lock(); + nc = rcu_dereference(connection->net_conf); + key_len = strlen(nc->shared_secret); + memcpy(secret, nc->shared_secret, key_len); + rcu_read_unlock(); + + desc.tfm = connection->cram_hmac_tfm; + desc.flags = 0; + + rv = crypto_hash_setkey(connection->cram_hmac_tfm, (u8 *)secret, key_len); + if (rv) { + drbd_err(connection, "crypto_hash_setkey() failed with %d\n", rv); + rv = -1; + goto fail; + } + + get_random_bytes(my_challenge, CHALLENGE_LEN); + + sock = &connection->data; + if (!conn_prepare_command(connection, sock)) { + rv = 0; + goto fail; + } + rv = !conn_send_command(connection, sock, P_AUTH_CHALLENGE, 0, + my_challenge, CHALLENGE_LEN); + if (!rv) + goto fail; + + err = drbd_recv_header(connection, &pi); + if (err) { + rv = 0; + goto fail; + } + + if (pi.cmd != P_AUTH_CHALLENGE) { + drbd_err(connection, "expected AuthChallenge packet, received: %s (0x%04x)\n", + cmdname(pi.cmd), pi.cmd); + rv = 0; + goto fail; + } + + if (pi.size > CHALLENGE_LEN * 2) { + drbd_err(connection, "expected AuthChallenge payload too big.\n"); + rv = -1; + goto fail; + } + + if (pi.size < CHALLENGE_LEN) { + drbd_err(connection, "AuthChallenge payload too small.\n"); + rv = -1; + goto fail; + } + + peers_ch = kmalloc(pi.size, GFP_NOIO); + if (peers_ch == NULL) { + drbd_err(connection, "kmalloc of peers_ch failed\n"); + rv = -1; + goto fail; + } + + err = drbd_recv_all_warn(connection, peers_ch, pi.size); + if (err) { + rv = 0; + goto fail; + } + + if (!memcmp(my_challenge, peers_ch, CHALLENGE_LEN)) { + drbd_err(connection, "Peer presented the same challenge!\n"); + rv = -1; + goto fail; + } + + resp_size = crypto_hash_digestsize(connection->cram_hmac_tfm); + response = kmalloc(resp_size, GFP_NOIO); + if (response == NULL) { + drbd_err(connection, "kmalloc of response failed\n"); + rv = -1; + goto fail; + } + + sg_init_table(&sg, 1); + sg_set_buf(&sg, peers_ch, pi.size); + + rv = crypto_hash_digest(&desc, &sg, sg.length, response); + if (rv) { + drbd_err(connection, "crypto_hash_digest() failed with %d\n", rv); + rv = -1; + goto fail; + } + + if (!conn_prepare_command(connection, sock)) { + rv = 0; + goto fail; + } + rv = !conn_send_command(connection, sock, P_AUTH_RESPONSE, 0, + response, resp_size); + if (!rv) + goto fail; + + err = drbd_recv_header(connection, &pi); + if (err) { + rv = 0; + goto fail; + } + + if (pi.cmd != P_AUTH_RESPONSE) { + drbd_err(connection, "expected AuthResponse packet, received: %s (0x%04x)\n", + cmdname(pi.cmd), pi.cmd); + rv = 0; + goto fail; + } + + if (pi.size != resp_size) { + drbd_err(connection, "expected AuthResponse payload of wrong size\n"); + rv = 0; + goto fail; + } + + err = drbd_recv_all_warn(connection, response , resp_size); + if (err) { + rv = 0; + goto fail; + } + + right_response = kmalloc(resp_size, GFP_NOIO); + if (right_response == NULL) { + drbd_err(connection, "kmalloc of right_response failed\n"); + rv = -1; + goto fail; + } + + sg_set_buf(&sg, my_challenge, CHALLENGE_LEN); + + rv = crypto_hash_digest(&desc, &sg, sg.length, right_response); + if (rv) { + drbd_err(connection, "crypto_hash_digest() failed with %d\n", rv); + rv = -1; + goto fail; + } + + rv = !memcmp(response, right_response, resp_size); + + if (rv) + drbd_info(connection, "Peer authenticated using %d bytes HMAC\n", + resp_size); + else + rv = -1; + + fail: + kfree(peers_ch); + kfree(response); + kfree(right_response); + + return rv; +} +#endif + +int drbd_receiver(struct drbd_thread *thi) +{ + struct drbd_connection *connection = thi->connection; + int h; + + 
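+	/* Keep trying to establish a connection: h == 0 means try again
+	 * after a short sleep, h == -1 means the handshake failed fatally
+	 * and the network configuration is discarded.  Only on success
+	 * (h > 0) do we enter the drbdd() receive loop. */
+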
drbd_info(connection, "receiver (re)started\n"); + + do { + h = conn_connect(connection); + if (h == 0) { + conn_disconnect(connection); + schedule_timeout_interruptible(HZ); + } + if (h == -1) { + drbd_warn(connection, "Discarding network configuration.\n"); + conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD); + } + } while (h == 0); + + if (h > 0) + drbdd(connection); + + conn_disconnect(connection); + + drbd_info(connection, "receiver terminated\n"); + return 0; +} + +/* ********* acknowledge sender ******** */ + +static int got_conn_RqSReply(struct drbd_connection *connection, struct packet_info *pi) +{ + struct p_req_state_reply *p = pi->data; + int retcode = be32_to_cpu(p->retcode); + + if (retcode >= SS_SUCCESS) { + set_bit(CONN_WD_ST_CHG_OKAY, &connection->flags); + } else { + set_bit(CONN_WD_ST_CHG_FAIL, &connection->flags); + drbd_err(connection, "Requested state change failed by peer: %s (%d)\n", + drbd_set_st_err_str(retcode), retcode); + } + wake_up(&connection->ping_wait); + + return 0; +} + +static int got_RqSReply(struct drbd_connection *connection, struct packet_info *pi) +{ + struct drbd_peer_device *peer_device; + struct drbd_device *device; + struct p_req_state_reply *p = pi->data; + int retcode = be32_to_cpu(p->retcode); + + peer_device = conn_peer_device(connection, pi->vnr); + if (!peer_device) + return -EIO; + device = peer_device->device; + + if (test_bit(CONN_WD_ST_CHG_REQ, &connection->flags)) { + D_ASSERT(device, connection->agreed_pro_version < 100); + return got_conn_RqSReply(connection, pi); + } + + if (retcode >= SS_SUCCESS) { + set_bit(CL_ST_CHG_SUCCESS, &device->flags); + } else { + set_bit(CL_ST_CHG_FAIL, &device->flags); + drbd_err(device, "Requested state change failed by peer: %s (%d)\n", + drbd_set_st_err_str(retcode), retcode); + } + wake_up(&device->state_wait); + + return 0; +} + +static int got_Ping(struct drbd_connection *connection, struct packet_info *pi) +{ + return drbd_send_ping_ack(connection); + +} + +static int got_PingAck(struct drbd_connection *connection, struct packet_info *pi) +{ + /* restore idle timeout */ + connection->meta.socket->sk->sk_rcvtimeo = connection->net_conf->ping_int*HZ; + if (!test_and_set_bit(GOT_PING_ACK, &connection->flags)) + wake_up(&connection->ping_wait); + + return 0; +} + +static int got_IsInSync(struct drbd_connection *connection, struct packet_info *pi) +{ + struct drbd_peer_device *peer_device; + struct drbd_device *device; + struct p_block_ack *p = pi->data; + sector_t sector = be64_to_cpu(p->sector); + int blksize = be32_to_cpu(p->blksize); + + peer_device = conn_peer_device(connection, pi->vnr); + if (!peer_device) + return -EIO; + device = peer_device->device; + + D_ASSERT(device, peer_device->connection->agreed_pro_version >= 89); + + update_peer_seq(peer_device, be32_to_cpu(p->seq_num)); + + if (get_ldev(device)) { + drbd_rs_complete_io(device, sector); + drbd_set_in_sync(device, sector, blksize); + /* rs_same_csums is supposed to count in units of BM_BLOCK_SIZE */ + device->rs_same_csum += (blksize >> BM_BLOCK_SHIFT); + put_ldev(device); + } + dec_rs_pending(device); + atomic_add(blksize >> 9, &device->rs_sect_in); + + return 0; +} + +static int +validate_req_change_req_state(struct drbd_device *device, u64 id, sector_t sector, + struct rb_root *root, const char *func, + enum drbd_req_event what, bool missing_ok) +{ + struct drbd_request *req; + struct bio_and_error m; + + spin_lock_irq(&device->resource->req_lock); + req = find_request(device, root, id, sector, missing_ok, func); 
+ if (unlikely(!req)) { + spin_unlock_irq(&device->resource->req_lock); + return -EIO; + } + __req_mod(req, what, &m); + spin_unlock_irq(&device->resource->req_lock); + + if (m.bio) + complete_master_bio(device, &m); + return 0; +} + +static int got_BlockAck(struct drbd_connection *connection, struct packet_info *pi) +{ + struct drbd_peer_device *peer_device; + struct drbd_device *device; + struct p_block_ack *p = pi->data; + sector_t sector = be64_to_cpu(p->sector); + int blksize = be32_to_cpu(p->blksize); + enum drbd_req_event what; + + peer_device = conn_peer_device(connection, pi->vnr); + if (!peer_device) + return -EIO; + device = peer_device->device; + + update_peer_seq(peer_device, be32_to_cpu(p->seq_num)); + + if (p->block_id == ID_SYNCER) { + drbd_set_in_sync(device, sector, blksize); + dec_rs_pending(device); + return 0; + } + switch (pi->cmd) { + case P_RS_WRITE_ACK: + what = WRITE_ACKED_BY_PEER_AND_SIS; + break; + case P_WRITE_ACK: + what = WRITE_ACKED_BY_PEER; + break; + case P_RECV_ACK: + what = RECV_ACKED_BY_PEER; + break; + case P_SUPERSEDED: + what = CONFLICT_RESOLVED; + break; + case P_RETRY_WRITE: + what = POSTPONE_WRITE; + break; + default: + BUG(); + } + + return validate_req_change_req_state(device, p->block_id, sector, + &device->write_requests, __func__, + what, false); +} + +static int got_NegAck(struct drbd_connection *connection, struct packet_info *pi) +{ + struct drbd_peer_device *peer_device; + struct drbd_device *device; + struct p_block_ack *p = pi->data; + sector_t sector = be64_to_cpu(p->sector); + int size = be32_to_cpu(p->blksize); + int err; + + peer_device = conn_peer_device(connection, pi->vnr); + if (!peer_device) + return -EIO; + device = peer_device->device; + + update_peer_seq(peer_device, be32_to_cpu(p->seq_num)); + + if (p->block_id == ID_SYNCER) { + dec_rs_pending(device); + drbd_rs_failed_io(device, sector, size); + return 0; + } + + err = validate_req_change_req_state(device, p->block_id, sector, + &device->write_requests, __func__, + NEG_ACKED, true); + if (err) { + /* Protocol A has no P_WRITE_ACKs, but has P_NEG_ACKs. + The master bio might already be completed, therefore the + request is no longer in the collision hash. */ + /* In Protocol B we might already have got a P_RECV_ACK + but then get a P_NEG_ACK afterwards. 
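+	   Either way the request is no longer in the tree; just mark the
+	   area out of sync here and let a later resync repair it.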
*/ + drbd_set_out_of_sync(device, sector, size); + } + return 0; +} + +static int got_NegDReply(struct drbd_connection *connection, struct packet_info *pi) +{ + struct drbd_peer_device *peer_device; + struct drbd_device *device; + struct p_block_ack *p = pi->data; + sector_t sector = be64_to_cpu(p->sector); + + peer_device = conn_peer_device(connection, pi->vnr); + if (!peer_device) + return -EIO; + device = peer_device->device; + + update_peer_seq(peer_device, be32_to_cpu(p->seq_num)); + + drbd_err(device, "Got NegDReply; Sector %llus, len %u.\n", + (unsigned long long)sector, be32_to_cpu(p->blksize)); + + return validate_req_change_req_state(device, p->block_id, sector, + &device->read_requests, __func__, + NEG_ACKED, false); +} + +static int got_NegRSDReply(struct drbd_connection *connection, struct packet_info *pi) +{ + struct drbd_peer_device *peer_device; + struct drbd_device *device; + sector_t sector; + int size; + struct p_block_ack *p = pi->data; + + peer_device = conn_peer_device(connection, pi->vnr); + if (!peer_device) + return -EIO; + device = peer_device->device; + + sector = be64_to_cpu(p->sector); + size = be32_to_cpu(p->blksize); + + update_peer_seq(peer_device, be32_to_cpu(p->seq_num)); + + dec_rs_pending(device); + + if (get_ldev_if_state(device, D_FAILED)) { + drbd_rs_complete_io(device, sector); + switch (pi->cmd) { + case P_NEG_RS_DREPLY: + drbd_rs_failed_io(device, sector, size); + case P_RS_CANCEL: + break; + default: + BUG(); + } + put_ldev(device); + } + + return 0; +} + +static int got_BarrierAck(struct drbd_connection *connection, struct packet_info *pi) +{ + struct p_barrier_ack *p = pi->data; + struct drbd_peer_device *peer_device; + int vnr; + + tl_release(connection, p->barrier, be32_to_cpu(p->set_size)); + + rcu_read_lock(); + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { + struct drbd_device *device = peer_device->device; + + if (device->state.conn == C_AHEAD && + atomic_read(&device->ap_in_flight) == 0 && + !test_and_set_bit(AHEAD_TO_SYNC_SOURCE, &device->flags)) { + device->start_resync_timer.expires = jiffies + HZ; + add_timer(&device->start_resync_timer); + } + } + rcu_read_unlock(); + + return 0; +} + +static int got_OVResult(struct drbd_connection *connection, struct packet_info *pi) +{ + struct drbd_peer_device *peer_device; + struct drbd_device *device; + struct p_block_ack *p = pi->data; + struct drbd_device_work *dw; + sector_t sector; + int size; + + peer_device = conn_peer_device(connection, pi->vnr); + if (!peer_device) + return -EIO; + device = peer_device->device; + + sector = be64_to_cpu(p->sector); + size = be32_to_cpu(p->blksize); + + update_peer_seq(peer_device, be32_to_cpu(p->seq_num)); + + if (be64_to_cpu(p->block_id) == ID_OUT_OF_SYNC) + drbd_ov_out_of_sync_found(device, sector, size); + else + ov_out_of_sync_print(device); + + if (!get_ldev(device)) + return 0; + + drbd_rs_complete_io(device, sector); + dec_rs_pending(device); + + --device->ov_left; + + /* let's advance progress step marks only for every other megabyte */ + if ((device->ov_left & 0x200) == 0x200) + drbd_advance_rs_marks(device, device->ov_left); + + if (device->ov_left == 0) { + dw = kmalloc(sizeof(*dw), GFP_NOIO); + if (dw) { + dw->w.cb = w_ov_finished; + dw->device = device; + drbd_queue_work(&peer_device->connection->sender_work, &dw->w); + } else { + drbd_err(device, "kmalloc(dw) failed."); + ov_out_of_sync_print(device); + drbd_resync_finished(device); + } + } + put_ldev(device); + return 0; +} + +static int got_skip(struct drbd_connection 
*connection, struct packet_info *pi) +{ + return 0; +} + +static int connection_finish_peer_reqs(struct drbd_connection *connection) +{ + struct drbd_peer_device *peer_device; + int vnr, not_empty = 0; + + do { + clear_bit(SIGNAL_ASENDER, &connection->flags); + flush_signals(current); + + rcu_read_lock(); + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { + struct drbd_device *device = peer_device->device; + kref_get(&device->kref); + rcu_read_unlock(); + if (drbd_finish_peer_reqs(device)) { + kref_put(&device->kref, drbd_destroy_device); + return 1; + } + kref_put(&device->kref, drbd_destroy_device); + rcu_read_lock(); + } + set_bit(SIGNAL_ASENDER, &connection->flags); + + spin_lock_irq(&connection->resource->req_lock); + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { + struct drbd_device *device = peer_device->device; + not_empty = !list_empty(&device->done_ee); + if (not_empty) + break; + } + spin_unlock_irq(&connection->resource->req_lock); + rcu_read_unlock(); + } while (not_empty); + + return 0; +} + +struct asender_cmd { + size_t pkt_size; + int (*fn)(struct drbd_connection *connection, struct packet_info *); +}; + +static struct asender_cmd asender_tbl[] = { + [P_PING] = { 0, got_Ping }, + [P_PING_ACK] = { 0, got_PingAck }, + [P_RECV_ACK] = { sizeof(struct p_block_ack), got_BlockAck }, + [P_WRITE_ACK] = { sizeof(struct p_block_ack), got_BlockAck }, + [P_RS_WRITE_ACK] = { sizeof(struct p_block_ack), got_BlockAck }, + [P_SUPERSEDED] = { sizeof(struct p_block_ack), got_BlockAck }, + [P_NEG_ACK] = { sizeof(struct p_block_ack), got_NegAck }, + [P_NEG_DREPLY] = { sizeof(struct p_block_ack), got_NegDReply }, + [P_NEG_RS_DREPLY] = { sizeof(struct p_block_ack), got_NegRSDReply }, + [P_OV_RESULT] = { sizeof(struct p_block_ack), got_OVResult }, + [P_BARRIER_ACK] = { sizeof(struct p_barrier_ack), got_BarrierAck }, + [P_STATE_CHG_REPLY] = { sizeof(struct p_req_state_reply), got_RqSReply }, + [P_RS_IS_IN_SYNC] = { sizeof(struct p_block_ack), got_IsInSync }, + [P_DELAY_PROBE] = { sizeof(struct p_delay_probe93), got_skip }, + [P_RS_CANCEL] = { sizeof(struct p_block_ack), got_NegRSDReply }, + [P_CONN_ST_CHG_REPLY]={ sizeof(struct p_req_state_reply), got_conn_RqSReply }, + [P_RETRY_WRITE] = { sizeof(struct p_block_ack), got_BlockAck }, +}; + +int drbd_asender(struct drbd_thread *thi) +{ + struct drbd_connection *connection = thi->connection; + struct asender_cmd *cmd = NULL; + struct packet_info pi; + int rv; + void *buf = connection->meta.rbuf; + int received = 0; + unsigned int header_size = drbd_header_size(connection); + int expect = header_size; + bool ping_timeout_active = false; + struct net_conf *nc; + int ping_timeo, tcp_cork, ping_int; + struct sched_param param = { .sched_priority = 2 }; + + rv = sched_setscheduler(current, SCHED_RR, ¶m); + if (rv < 0) + drbd_err(connection, "drbd_asender: ERROR set priority, ret=%d\n", rv); + + while (get_t_state(thi) == RUNNING) { + drbd_thread_current_set_cpu(thi); + + rcu_read_lock(); + nc = rcu_dereference(connection->net_conf); + ping_timeo = nc->ping_timeo; + tcp_cork = nc->tcp_cork; + ping_int = nc->ping_int; + rcu_read_unlock(); + + if (test_and_clear_bit(SEND_PING, &connection->flags)) { + if (drbd_send_ping(connection)) { + drbd_err(connection, "drbd_send_ping has failed\n"); + goto reconnect; + } + connection->meta.socket->sk->sk_rcvtimeo = ping_timeo * HZ / 10; + ping_timeout_active = true; + } + + /* TODO: conditionally cork; it may hurt latency if we cork without + much to send */ + if (tcp_cork) + 
drbd_tcp_cork(connection->meta.socket); + if (connection_finish_peer_reqs(connection)) { + drbd_err(connection, "connection_finish_peer_reqs() failed\n"); + goto reconnect; + } + /* but unconditionally uncork unless disabled */ + if (tcp_cork) + drbd_tcp_uncork(connection->meta.socket); + + /* short circuit, recv_msg would return EINTR anyways. */ + if (signal_pending(current)) + continue; + + rv = drbd_recv_short(connection->meta.socket, buf, expect-received, 0); + clear_bit(SIGNAL_ASENDER, &connection->flags); + + flush_signals(current); + + /* Note: + * -EINTR (on meta) we got a signal + * -EAGAIN (on meta) rcvtimeo expired + * -ECONNRESET other side closed the connection + * -ERESTARTSYS (on data) we got a signal + * rv < 0 other than above: unexpected error! + * rv == expected: full header or command + * rv < expected: "woken" by signal during receive + * rv == 0 : "connection shut down by peer" + */ +received_more: + if (likely(rv > 0)) { + received += rv; + buf += rv; + } else if (rv == 0) { + if (test_bit(DISCONNECT_SENT, &connection->flags)) { + long t; + rcu_read_lock(); + t = rcu_dereference(connection->net_conf)->ping_timeo * HZ/10; + rcu_read_unlock(); + + t = wait_event_timeout(connection->ping_wait, + connection->cstate < C_WF_REPORT_PARAMS, + t); + if (t) + break; + } + drbd_err(connection, "meta connection shut down by peer.\n"); + goto reconnect; + } else if (rv == -EAGAIN) { + /* If the data socket received something meanwhile, + * that is good enough: peer is still alive. */ + if (time_after(connection->last_received, + jiffies - connection->meta.socket->sk->sk_rcvtimeo)) + continue; + if (ping_timeout_active) { + drbd_err(connection, "PingAck did not arrive in time.\n"); + goto reconnect; + } + set_bit(SEND_PING, &connection->flags); + continue; + } else if (rv == -EINTR) { + continue; + } else { + drbd_err(connection, "sock_recvmsg returned %d\n", rv); + goto reconnect; + } + + if (received == expect && cmd == NULL) { + if (decode_header(connection, connection->meta.rbuf, &pi)) + goto reconnect; + cmd = &asender_tbl[pi.cmd]; + if (pi.cmd >= ARRAY_SIZE(asender_tbl) || !cmd->fn) { + drbd_err(connection, "Unexpected meta packet %s (0x%04x)\n", + cmdname(pi.cmd), pi.cmd); + goto disconnect; + } + expect = header_size + cmd->pkt_size; + if (pi.size != expect - header_size) { + drbd_err(connection, "Wrong packet size on meta (c: %d, l: %d)\n", + pi.cmd, pi.size); + goto reconnect; + } + } + if (received == expect) { + bool err; + + err = cmd->fn(connection, &pi); + if (err) { + drbd_err(connection, "%pf failed\n", cmd->fn); + goto reconnect; + } + + connection->last_received = jiffies; + + if (cmd == &asender_tbl[P_PING_ACK]) { + /* restore idle timeout */ + connection->meta.socket->sk->sk_rcvtimeo = ping_int * HZ; + ping_timeout_active = false; + } + + buf = connection->meta.rbuf; + received = 0; + expect = header_size; + cmd = NULL; + } + if (test_bit(SEND_PING, &connection->flags)) + continue; + rv = drbd_recv_short(connection->meta.socket, buf, expect-received, MSG_DONTWAIT); + if (rv > 0) + goto received_more; + } + + if (0) { +reconnect: + conn_request_state(connection, NS(conn, C_NETWORK_FAILURE), CS_HARD); + conn_md_sync(connection); + } + if (0) { +disconnect: + conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD); + } + clear_bit(SIGNAL_ASENDER, &connection->flags); + + drbd_info(connection, "asender terminated\n"); + + return 0; +} diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c new file mode 100644 index 
000000000..3907202fb --- /dev/null +++ b/drivers/block/drbd/drbd_req.c @@ -0,0 +1,1638 @@ +/* + drbd_req.c + + This file is part of DRBD by Philipp Reisner and Lars Ellenberg. + + Copyright (C) 2001-2008, LINBIT Information Technologies GmbH. + Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>. + Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. + + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + + */ + +#include <linux/module.h> + +#include <linux/slab.h> +#include <linux/drbd.h> +#include "drbd_int.h" +#include "drbd_req.h" + + +static bool drbd_may_do_local_read(struct drbd_device *device, sector_t sector, int size); + +/* Update disk stats at start of I/O request */ +static void _drbd_start_io_acct(struct drbd_device *device, struct drbd_request *req) +{ + generic_start_io_acct(bio_data_dir(req->master_bio), req->i.size >> 9, + &device->vdisk->part0); +} + +/* Update disk stats when completing request upwards */ +static void _drbd_end_io_acct(struct drbd_device *device, struct drbd_request *req) +{ + generic_end_io_acct(bio_data_dir(req->master_bio), + &device->vdisk->part0, req->start_jif); +} + +static struct drbd_request *drbd_req_new(struct drbd_device *device, + struct bio *bio_src) +{ + struct drbd_request *req; + + req = mempool_alloc(drbd_request_mempool, GFP_NOIO); + if (!req) + return NULL; + memset(req, 0, sizeof(*req)); + + drbd_req_make_private_bio(req, bio_src); + req->rq_state = bio_data_dir(bio_src) == WRITE ? RQ_WRITE : 0; + req->device = device; + req->master_bio = bio_src; + req->epoch = 0; + + drbd_clear_interval(&req->i); + req->i.sector = bio_src->bi_iter.bi_sector; + req->i.size = bio_src->bi_iter.bi_size; + req->i.local = true; + req->i.waiting = false; + + INIT_LIST_HEAD(&req->tl_requests); + INIT_LIST_HEAD(&req->w.list); + INIT_LIST_HEAD(&req->req_pending_master_completion); + INIT_LIST_HEAD(&req->req_pending_local); + + /* one reference to be put by __drbd_make_request */ + atomic_set(&req->completion_ref, 1); + /* one kref as long as completion_ref > 0 */ + kref_init(&req->kref); + return req; +} + +static void drbd_remove_request_interval(struct rb_root *root, + struct drbd_request *req) +{ + struct drbd_device *device = req->device; + struct drbd_interval *i = &req->i; + + drbd_remove_interval(root, i); + + /* Wake up any processes waiting for this request to complete. 
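+	   (e.g. conflicting writers sleeping in complete_conflicting_writes()
+	   on device->misc_wait with i->waiting set, waiting to re-check for
+	   overlaps)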
*/ + if (i->waiting) + wake_up(&device->misc_wait); +} + +void drbd_req_destroy(struct kref *kref) +{ + struct drbd_request *req = container_of(kref, struct drbd_request, kref); + struct drbd_device *device = req->device; + const unsigned s = req->rq_state; + + if ((req->master_bio && !(s & RQ_POSTPONED)) || + atomic_read(&req->completion_ref) || + (s & RQ_LOCAL_PENDING) || + ((s & RQ_NET_MASK) && !(s & RQ_NET_DONE))) { + drbd_err(device, "drbd_req_destroy: Logic BUG rq_state = 0x%x, completion_ref = %d\n", + s, atomic_read(&req->completion_ref)); + return; + } + + /* If called from mod_rq_state (expected normal case) or + * drbd_send_and_submit (the less likely normal path), this holds the + * req_lock, and req->tl_requests will typicaly be on ->transfer_log, + * though it may be still empty (never added to the transfer log). + * + * If called from do_retry(), we do NOT hold the req_lock, but we are + * still allowed to unconditionally list_del(&req->tl_requests), + * because it will be on a local on-stack list only. */ + list_del_init(&req->tl_requests); + + /* finally remove the request from the conflict detection + * respective block_id verification interval tree. */ + if (!drbd_interval_empty(&req->i)) { + struct rb_root *root; + + if (s & RQ_WRITE) + root = &device->write_requests; + else + root = &device->read_requests; + drbd_remove_request_interval(root, req); + } else if (s & (RQ_NET_MASK & ~RQ_NET_DONE) && req->i.size != 0) + drbd_err(device, "drbd_req_destroy: Logic BUG: interval empty, but: rq_state=0x%x, sect=%llu, size=%u\n", + s, (unsigned long long)req->i.sector, req->i.size); + + /* if it was a write, we may have to set the corresponding + * bit(s) out-of-sync first. If it had a local part, we need to + * release the reference to the activity log. */ + if (s & RQ_WRITE) { + /* Set out-of-sync unless both OK flags are set + * (local only or remote failed). + * Other places where we set out-of-sync: + * READ with local io-error */ + + /* There is a special case: + * we may notice late that IO was suspended, + * and postpone, or schedule for retry, a write, + * before it even was submitted or sent. + * In that case we do not want to touch the bitmap at all. + */ + if ((s & (RQ_POSTPONED|RQ_LOCAL_MASK|RQ_NET_MASK)) != RQ_POSTPONED) { + if (!(s & RQ_NET_OK) || !(s & RQ_LOCAL_OK)) + drbd_set_out_of_sync(device, req->i.sector, req->i.size); + + if ((s & RQ_NET_OK) && (s & RQ_LOCAL_OK) && (s & RQ_NET_SIS)) + drbd_set_in_sync(device, req->i.sector, req->i.size); + } + + /* one might be tempted to move the drbd_al_complete_io + * to the local io completion callback drbd_request_endio. + * but, if this was a mirror write, we may only + * drbd_al_complete_io after this is RQ_NET_DONE, + * otherwise the extent could be dropped from the al + * before it has actually been written on the peer. + * if we crash before our peer knows about the request, + * but after the extent has been dropped from the al, + * we would forget to resync the corresponding extent. 
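+	 * By the time we get here, RQ_NET_DONE is guaranteed for any request
+	 * that had a network part (see the sanity check at the top of this
+	 * function), so dropping the AL reference now is safe.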
+ */ + if (s & RQ_IN_ACT_LOG) { + if (get_ldev_if_state(device, D_FAILED)) { + drbd_al_complete_io(device, &req->i); + put_ldev(device); + } else if (__ratelimit(&drbd_ratelimit_state)) { + drbd_warn(device, "Should have called drbd_al_complete_io(, %llu, %u), " + "but my Disk seems to have failed :(\n", + (unsigned long long) req->i.sector, req->i.size); + } + } + } + + mempool_free(req, drbd_request_mempool); +} + +static void wake_all_senders(struct drbd_connection *connection) +{ + wake_up(&connection->sender_work.q_wait); +} + +/* must hold resource->req_lock */ +void start_new_tl_epoch(struct drbd_connection *connection) +{ + /* no point closing an epoch, if it is empty, anyways. */ + if (connection->current_tle_writes == 0) + return; + + connection->current_tle_writes = 0; + atomic_inc(&connection->current_tle_nr); + wake_all_senders(connection); +} + +void complete_master_bio(struct drbd_device *device, + struct bio_and_error *m) +{ + bio_endio(m->bio, m->error); + dec_ap_bio(device); +} + + +/* Helper for __req_mod(). + * Set m->bio to the master bio, if it is fit to be completed, + * or leave it alone (it is initialized to NULL in __req_mod), + * if it has already been completed, or cannot be completed yet. + * If m->bio is set, the error status to be returned is placed in m->error. + */ +static +void drbd_req_complete(struct drbd_request *req, struct bio_and_error *m) +{ + const unsigned s = req->rq_state; + struct drbd_device *device = req->device; + int rw; + int error, ok; + + /* we must not complete the master bio, while it is + * still being processed by _drbd_send_zc_bio (drbd_send_dblock) + * not yet acknowledged by the peer + * not yet completed by the local io subsystem + * these flags may get cleared in any order by + * the worker, + * the receiver, + * the bio_endio completion callbacks. + */ + if ((s & RQ_LOCAL_PENDING && !(s & RQ_LOCAL_ABORTED)) || + (s & RQ_NET_QUEUED) || (s & RQ_NET_PENDING) || + (s & RQ_COMPLETION_SUSP)) { + drbd_err(device, "drbd_req_complete: Logic BUG rq_state = 0x%x\n", s); + return; + } + + if (!req->master_bio) { + drbd_err(device, "drbd_req_complete: Logic BUG, master_bio == NULL!\n"); + return; + } + + rw = bio_rw(req->master_bio); + + /* + * figure out whether to report success or failure. + * + * report success when at least one of the operations succeeded. + * or, to put the other way, + * only report failure, when both operations failed. + * + * what to do about the failures is handled elsewhere. + * what we need to do here is just: complete the master_bio. + * + * local completion error, if any, has been stored as ERR_PTR + * in private_bio within drbd_request_endio. + */ + ok = (s & RQ_LOCAL_OK) || (s & RQ_NET_OK); + error = PTR_ERR(req->private_bio); + + /* Before we can signal completion to the upper layers, + * we may need to close the current transfer log epoch. + * We are within the request lock, so we can simply compare + * the request epoch number with the current transfer log + * epoch number. If they match, increase the current_tle_nr, + * and reset the transfer log epoch write_cnt. 
+ */ + if (rw == WRITE && + req->epoch == atomic_read(&first_peer_device(device)->connection->current_tle_nr)) + start_new_tl_epoch(first_peer_device(device)->connection); + + /* Update disk stats */ + _drbd_end_io_acct(device, req); + + /* If READ failed, + * have it be pushed back to the retry work queue, + * so it will re-enter __drbd_make_request(), + * and be re-assigned to a suitable local or remote path, + * or failed if we do not have access to good data anymore. + * + * Unless it was failed early by __drbd_make_request(), + * because no path was available, in which case + * it was not even added to the transfer_log. + * + * READA may fail, and will not be retried. + * + * WRITE should have used all available paths already. + */ + if (!ok && rw == READ && !list_empty(&req->tl_requests)) + req->rq_state |= RQ_POSTPONED; + + if (!(req->rq_state & RQ_POSTPONED)) { + m->error = ok ? 0 : (error ?: -EIO); + m->bio = req->master_bio; + req->master_bio = NULL; + /* We leave it in the tree, to be able to verify later + * write-acks in protocol != C during resync. + * But we mark it as "complete", so it won't be counted as + * conflict in a multi-primary setup. */ + req->i.completed = true; + } + + if (req->i.waiting) + wake_up(&device->misc_wait); + + /* Either we are about to complete to upper layers, + * or we will restart this request. + * In either case, the request object will be destroyed soon, + * so better remove it from all lists. */ + list_del_init(&req->req_pending_master_completion); +} + +/* still holds resource->req_lock */ +static int drbd_req_put_completion_ref(struct drbd_request *req, struct bio_and_error *m, int put) +{ + struct drbd_device *device = req->device; + D_ASSERT(device, m || (req->rq_state & RQ_POSTPONED)); + + if (!atomic_sub_and_test(put, &req->completion_ref)) + return 0; + + drbd_req_complete(req, m); + + if (req->rq_state & RQ_POSTPONED) { + /* don't destroy the req object just yet, + * but queue it for retry */ + drbd_restart_request(req); + return 0; + } + + return 1; +} + +static void set_if_null_req_next(struct drbd_peer_device *peer_device, struct drbd_request *req) +{ + struct drbd_connection *connection = peer_device ? peer_device->connection : NULL; + if (!connection) + return; + if (connection->req_next == NULL) + connection->req_next = req; +} + +static void advance_conn_req_next(struct drbd_peer_device *peer_device, struct drbd_request *req) +{ + struct drbd_connection *connection = peer_device ? peer_device->connection : NULL; + if (!connection) + return; + if (connection->req_next != req) + return; + list_for_each_entry_continue(req, &connection->transfer_log, tl_requests) { + const unsigned s = req->rq_state; + if (s & RQ_NET_QUEUED) + break; + } + if (&req->tl_requests == &connection->transfer_log) + req = NULL; + connection->req_next = req; +} + +static void set_if_null_req_ack_pending(struct drbd_peer_device *peer_device, struct drbd_request *req) +{ + struct drbd_connection *connection = peer_device ? peer_device->connection : NULL; + if (!connection) + return; + if (connection->req_ack_pending == NULL) + connection->req_ack_pending = req; +} + +static void advance_conn_req_ack_pending(struct drbd_peer_device *peer_device, struct drbd_request *req) +{ + struct drbd_connection *connection = peer_device ? 
peer_device->connection : NULL; + if (!connection) + return; + if (connection->req_ack_pending != req) + return; + list_for_each_entry_continue(req, &connection->transfer_log, tl_requests) { + const unsigned s = req->rq_state; + if ((s & RQ_NET_SENT) && (s & RQ_NET_PENDING)) + break; + } + if (&req->tl_requests == &connection->transfer_log) + req = NULL; + connection->req_ack_pending = req; +} + +static void set_if_null_req_not_net_done(struct drbd_peer_device *peer_device, struct drbd_request *req) +{ + struct drbd_connection *connection = peer_device ? peer_device->connection : NULL; + if (!connection) + return; + if (connection->req_not_net_done == NULL) + connection->req_not_net_done = req; +} + +static void advance_conn_req_not_net_done(struct drbd_peer_device *peer_device, struct drbd_request *req) +{ + struct drbd_connection *connection = peer_device ? peer_device->connection : NULL; + if (!connection) + return; + if (connection->req_not_net_done != req) + return; + list_for_each_entry_continue(req, &connection->transfer_log, tl_requests) { + const unsigned s = req->rq_state; + if ((s & RQ_NET_SENT) && !(s & RQ_NET_DONE)) + break; + } + if (&req->tl_requests == &connection->transfer_log) + req = NULL; + connection->req_not_net_done = req; +} + +/* I'd like this to be the only place that manipulates + * req->completion_ref and req->kref. */ +static void mod_rq_state(struct drbd_request *req, struct bio_and_error *m, + int clear, int set) +{ + struct drbd_device *device = req->device; + struct drbd_peer_device *peer_device = first_peer_device(device); + unsigned s = req->rq_state; + int c_put = 0; + int k_put = 0; + + if (drbd_suspended(device) && !((s | clear) & RQ_COMPLETION_SUSP)) + set |= RQ_COMPLETION_SUSP; + + /* apply */ + + req->rq_state &= ~clear; + req->rq_state |= set; + + /* no change? */ + if (req->rq_state == s) + return; + + /* intent: get references */ + + if (!(s & RQ_LOCAL_PENDING) && (set & RQ_LOCAL_PENDING)) + atomic_inc(&req->completion_ref); + + if (!(s & RQ_NET_PENDING) && (set & RQ_NET_PENDING)) { + inc_ap_pending(device); + atomic_inc(&req->completion_ref); + } + + if (!(s & RQ_NET_QUEUED) && (set & RQ_NET_QUEUED)) { + atomic_inc(&req->completion_ref); + set_if_null_req_next(peer_device, req); + } + + if (!(s & RQ_EXP_BARR_ACK) && (set & RQ_EXP_BARR_ACK)) + kref_get(&req->kref); /* wait for the DONE */ + + if (!(s & RQ_NET_SENT) && (set & RQ_NET_SENT)) { + /* potentially already completed in the asender thread */ + if (!(s & RQ_NET_DONE)) { + atomic_add(req->i.size >> 9, &device->ap_in_flight); + set_if_null_req_not_net_done(peer_device, req); + } + if (s & RQ_NET_PENDING) + set_if_null_req_ack_pending(peer_device, req); + } + + if (!(s & RQ_COMPLETION_SUSP) && (set & RQ_COMPLETION_SUSP)) + atomic_inc(&req->completion_ref); + + /* progress: put references */ + + if ((s & RQ_COMPLETION_SUSP) && (clear & RQ_COMPLETION_SUSP)) + ++c_put; + + if (!(s & RQ_LOCAL_ABORTED) && (set & RQ_LOCAL_ABORTED)) { + D_ASSERT(device, req->rq_state & RQ_LOCAL_PENDING); + /* local completion may still come in later, + * we need to keep the req object around. 
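+	 * The extra kref taken here is dropped again via k_put once the
+	 * aborted local completion eventually clears RQ_LOCAL_PENDING.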
*/ + kref_get(&req->kref); + ++c_put; + } + + if ((s & RQ_LOCAL_PENDING) && (clear & RQ_LOCAL_PENDING)) { + if (req->rq_state & RQ_LOCAL_ABORTED) + ++k_put; + else + ++c_put; + list_del_init(&req->req_pending_local); + } + + if ((s & RQ_NET_PENDING) && (clear & RQ_NET_PENDING)) { + dec_ap_pending(device); + ++c_put; + req->acked_jif = jiffies; + advance_conn_req_ack_pending(peer_device, req); + } + + if ((s & RQ_NET_QUEUED) && (clear & RQ_NET_QUEUED)) { + ++c_put; + advance_conn_req_next(peer_device, req); + } + + if (!(s & RQ_NET_DONE) && (set & RQ_NET_DONE)) { + if (s & RQ_NET_SENT) + atomic_sub(req->i.size >> 9, &device->ap_in_flight); + if (s & RQ_EXP_BARR_ACK) + ++k_put; + req->net_done_jif = jiffies; + + /* in ahead/behind mode, or just in case, + * before we finally destroy this request, + * the caching pointers must not reference it anymore */ + advance_conn_req_next(peer_device, req); + advance_conn_req_ack_pending(peer_device, req); + advance_conn_req_not_net_done(peer_device, req); + } + + /* potentially complete and destroy */ + + if (k_put || c_put) { + /* Completion does it's own kref_put. If we are going to + * kref_sub below, we need req to be still around then. */ + int at_least = k_put + !!c_put; + int refcount = atomic_read(&req->kref.refcount); + if (refcount < at_least) + drbd_err(device, + "mod_rq_state: Logic BUG: %x -> %x: refcount = %d, should be >= %d\n", + s, req->rq_state, refcount, at_least); + } + + /* If we made progress, retry conflicting peer requests, if any. */ + if (req->i.waiting) + wake_up(&device->misc_wait); + + if (c_put) + k_put += drbd_req_put_completion_ref(req, m, c_put); + if (k_put) + kref_sub(&req->kref, k_put, drbd_req_destroy); +} + +static void drbd_report_io_error(struct drbd_device *device, struct drbd_request *req) +{ + char b[BDEVNAME_SIZE]; + + if (!__ratelimit(&drbd_ratelimit_state)) + return; + + drbd_warn(device, "local %s IO error sector %llu+%u on %s\n", + (req->rq_state & RQ_WRITE) ? "WRITE" : "READ", + (unsigned long long)req->i.sector, + req->i.size >> 9, + bdevname(device->ldev->backing_bdev, b)); +} + +/* Helper for HANDED_OVER_TO_NETWORK. + * Is this a protocol A write (neither WRITE_ACK nor RECEIVE_ACK expected)? + * Is it also still "PENDING"? + * --> If so, clear PENDING and set NET_OK below. + * If it is a protocol A write, but not RQ_PENDING anymore, neg-ack was faster + * (and we must not set RQ_NET_OK) */ +static inline bool is_pending_write_protocol_A(struct drbd_request *req) +{ + return (req->rq_state & + (RQ_WRITE|RQ_NET_PENDING|RQ_EXP_WRITE_ACK|RQ_EXP_RECEIVE_ACK)) + == (RQ_WRITE|RQ_NET_PENDING); +} + +/* obviously this could be coded as many single functions + * instead of one huge switch, + * or by putting the code directly in the respective locations + * (as it has been before). + * + * but having it this way + * enforces that it is all in this one place, where it is easier to audit, + * it makes it obvious that whatever "event" "happens" to a request should + * happen "atomically" within the req_lock, + * and it enforces that we have to think in a very structured manner + * about the "events" that may happen to a request during its life time ... + */ +int __req_mod(struct drbd_request *req, enum drbd_req_event what, + struct bio_and_error *m) +{ + struct drbd_device *const device = req->device; + struct drbd_peer_device *const peer_device = first_peer_device(device); + struct drbd_connection *const connection = peer_device ? 
peer_device->connection : NULL; + struct net_conf *nc; + int p, rv = 0; + + if (m) + m->bio = NULL; + + switch (what) { + default: + drbd_err(device, "LOGIC BUG in %s:%u\n", __FILE__ , __LINE__); + break; + + /* does not happen... + * initialization done in drbd_req_new + case CREATED: + break; + */ + + case TO_BE_SENT: /* via network */ + /* reached via __drbd_make_request + * and from w_read_retry_remote */ + D_ASSERT(device, !(req->rq_state & RQ_NET_MASK)); + rcu_read_lock(); + nc = rcu_dereference(connection->net_conf); + p = nc->wire_protocol; + rcu_read_unlock(); + req->rq_state |= + p == DRBD_PROT_C ? RQ_EXP_WRITE_ACK : + p == DRBD_PROT_B ? RQ_EXP_RECEIVE_ACK : 0; + mod_rq_state(req, m, 0, RQ_NET_PENDING); + break; + + case TO_BE_SUBMITTED: /* locally */ + /* reached via __drbd_make_request */ + D_ASSERT(device, !(req->rq_state & RQ_LOCAL_MASK)); + mod_rq_state(req, m, 0, RQ_LOCAL_PENDING); + break; + + case COMPLETED_OK: + if (req->rq_state & RQ_WRITE) + device->writ_cnt += req->i.size >> 9; + else + device->read_cnt += req->i.size >> 9; + + mod_rq_state(req, m, RQ_LOCAL_PENDING, + RQ_LOCAL_COMPLETED|RQ_LOCAL_OK); + break; + + case ABORT_DISK_IO: + mod_rq_state(req, m, 0, RQ_LOCAL_ABORTED); + break; + + case WRITE_COMPLETED_WITH_ERROR: + drbd_report_io_error(device, req); + __drbd_chk_io_error(device, DRBD_WRITE_ERROR); + mod_rq_state(req, m, RQ_LOCAL_PENDING, RQ_LOCAL_COMPLETED); + break; + + case READ_COMPLETED_WITH_ERROR: + drbd_set_out_of_sync(device, req->i.sector, req->i.size); + drbd_report_io_error(device, req); + __drbd_chk_io_error(device, DRBD_READ_ERROR); + /* fall through. */ + case READ_AHEAD_COMPLETED_WITH_ERROR: + /* it is legal to fail READA, no __drbd_chk_io_error in that case. */ + mod_rq_state(req, m, RQ_LOCAL_PENDING, RQ_LOCAL_COMPLETED); + break; + + case DISCARD_COMPLETED_NOTSUPP: + case DISCARD_COMPLETED_WITH_ERROR: + /* I'd rather not detach from local disk just because it + * failed a REQ_DISCARD. */ + mod_rq_state(req, m, RQ_LOCAL_PENDING, RQ_LOCAL_COMPLETED); + break; + + case QUEUE_FOR_NET_READ: + /* READ or READA, and + * no local disk, + * or target area marked as invalid, + * or just got an io-error. */ + /* from __drbd_make_request + * or from bio_endio during read io-error recovery */ + + /* So we can verify the handle in the answer packet. + * Corresponding drbd_remove_request_interval is in + * drbd_req_complete() */ + D_ASSERT(device, drbd_interval_empty(&req->i)); + drbd_insert_interval(&device->read_requests, &req->i); + + set_bit(UNPLUG_REMOTE, &device->flags); + + D_ASSERT(device, req->rq_state & RQ_NET_PENDING); + D_ASSERT(device, (req->rq_state & RQ_LOCAL_MASK) == 0); + mod_rq_state(req, m, 0, RQ_NET_QUEUED); + req->w.cb = w_send_read_req; + drbd_queue_work(&connection->sender_work, + &req->w); + break; + + case QUEUE_FOR_NET_WRITE: + /* assert something? */ + /* from __drbd_make_request only */ + + /* Corresponding drbd_remove_request_interval is in + * drbd_req_complete() */ + D_ASSERT(device, drbd_interval_empty(&req->i)); + drbd_insert_interval(&device->write_requests, &req->i); + + /* NOTE + * In case the req ended up on the transfer log before being + * queued on the worker, it could lead to this request being + * missed during cleanup after connection loss. + * So we have to do both operations here, + * within the same lock that protects the transfer log. 
+ * + * _req_add_to_epoch(req); this has to be after the + * _maybe_start_new_epoch(req); which happened in + * __drbd_make_request, because we now may set the bit + * again ourselves to close the current epoch. + * + * Add req to the (now) current epoch (barrier). */ + + /* otherwise we may lose an unplug, which may cause some remote + * io-scheduler timeout to expire, increasing maximum latency, + * hurting performance. */ + set_bit(UNPLUG_REMOTE, &device->flags); + + /* queue work item to send data */ + D_ASSERT(device, req->rq_state & RQ_NET_PENDING); + mod_rq_state(req, m, 0, RQ_NET_QUEUED|RQ_EXP_BARR_ACK); + req->w.cb = w_send_dblock; + drbd_queue_work(&connection->sender_work, + &req->w); + + /* close the epoch, in case it outgrew the limit */ + rcu_read_lock(); + nc = rcu_dereference(connection->net_conf); + p = nc->max_epoch_size; + rcu_read_unlock(); + if (connection->current_tle_writes >= p) + start_new_tl_epoch(connection); + + break; + + case QUEUE_FOR_SEND_OOS: + mod_rq_state(req, m, 0, RQ_NET_QUEUED); + req->w.cb = w_send_out_of_sync; + drbd_queue_work(&connection->sender_work, + &req->w); + break; + + case READ_RETRY_REMOTE_CANCELED: + case SEND_CANCELED: + case SEND_FAILED: + /* real cleanup will be done from tl_clear. just update flags + * so it is no longer marked as on the worker queue */ + mod_rq_state(req, m, RQ_NET_QUEUED, 0); + break; + + case HANDED_OVER_TO_NETWORK: + /* assert something? */ + if (is_pending_write_protocol_A(req)) + /* this is what is dangerous about protocol A: + * pretend it was successfully written on the peer. */ + mod_rq_state(req, m, RQ_NET_QUEUED|RQ_NET_PENDING, + RQ_NET_SENT|RQ_NET_OK); + else + mod_rq_state(req, m, RQ_NET_QUEUED, RQ_NET_SENT); + /* It is still not yet RQ_NET_DONE until the + * corresponding epoch barrier got acked as well, + * so we know what to dirty on connection loss. */ + break; + + case OOS_HANDED_TO_NETWORK: + /* Was not set PENDING, no longer QUEUED, so is now DONE + * as far as this connection is concerned. */ + mod_rq_state(req, m, RQ_NET_QUEUED, RQ_NET_DONE); + break; + + case CONNECTION_LOST_WHILE_PENDING: + /* transfer log cleanup after connection loss */ + mod_rq_state(req, m, + RQ_NET_OK|RQ_NET_PENDING|RQ_COMPLETION_SUSP, + RQ_NET_DONE); + break; + + case CONFLICT_RESOLVED: + /* for superseded conflicting writes of multiple primaries, + * there is no need to keep anything in the tl, potential + * node crashes are covered by the activity log. + * + * If this request had been marked as RQ_POSTPONED before, + * it will actually not be completed, but "restarted", + * resubmitted from the retry worker context. */ + D_ASSERT(device, req->rq_state & RQ_NET_PENDING); + D_ASSERT(device, req->rq_state & RQ_EXP_WRITE_ACK); + mod_rq_state(req, m, RQ_NET_PENDING, RQ_NET_DONE|RQ_NET_OK); + break; + + case WRITE_ACKED_BY_PEER_AND_SIS: + req->rq_state |= RQ_NET_SIS; + case WRITE_ACKED_BY_PEER: + /* Normal operation protocol C: successfully written on peer. + * During resync, even in protocol != C, + * we requested an explicit write ack anyways. + * Which means we cannot even assert anything here. + * Nothing more to do here. + * We want to keep the tl in place for all protocols, to cater + * for volatile write-back caches on lower level devices. */ + goto ack_common; + case RECV_ACKED_BY_PEER: + D_ASSERT(device, req->rq_state & RQ_EXP_RECEIVE_ACK); + /* protocol B; pretends to be successfully written on peer. 
+ * see also notes above in HANDED_OVER_TO_NETWORK about + * protocol != C */ + ack_common: + mod_rq_state(req, m, RQ_NET_PENDING, RQ_NET_OK); + break; + + case POSTPONE_WRITE: + D_ASSERT(device, req->rq_state & RQ_EXP_WRITE_ACK); + /* If this node has already detected the write conflict, the + * worker will be waiting on misc_wait. Wake it up once this + * request has completed locally. + */ + D_ASSERT(device, req->rq_state & RQ_NET_PENDING); + req->rq_state |= RQ_POSTPONED; + if (req->i.waiting) + wake_up(&device->misc_wait); + /* Do not clear RQ_NET_PENDING. This request will make further + * progress via restart_conflicting_writes() or + * fail_postponed_requests(). Hopefully. */ + break; + + case NEG_ACKED: + mod_rq_state(req, m, RQ_NET_OK|RQ_NET_PENDING, 0); + break; + + case FAIL_FROZEN_DISK_IO: + if (!(req->rq_state & RQ_LOCAL_COMPLETED)) + break; + mod_rq_state(req, m, RQ_COMPLETION_SUSP, 0); + break; + + case RESTART_FROZEN_DISK_IO: + if (!(req->rq_state & RQ_LOCAL_COMPLETED)) + break; + + mod_rq_state(req, m, + RQ_COMPLETION_SUSP|RQ_LOCAL_COMPLETED, + RQ_LOCAL_PENDING); + + rv = MR_READ; + if (bio_data_dir(req->master_bio) == WRITE) + rv = MR_WRITE; + + get_ldev(device); /* always succeeds in this call path */ + req->w.cb = w_restart_disk_io; + drbd_queue_work(&connection->sender_work, + &req->w); + break; + + case RESEND: + /* Simply complete (local only) READs. */ + if (!(req->rq_state & RQ_WRITE) && !req->w.cb) { + mod_rq_state(req, m, RQ_COMPLETION_SUSP, 0); + break; + } + + /* If RQ_NET_OK is already set, we got a P_WRITE_ACK or P_RECV_ACK + before the connection loss (B&C only); only P_BARRIER_ACK + (or the local completion?) was missing when we suspended. + Throwing them out of the TL here by pretending we got a BARRIER_ACK. + During connection handshake, we ensure that the peer was not rebooted. */ + if (!(req->rq_state & RQ_NET_OK)) { + /* FIXME could this possibly be a req->dw.cb == w_send_out_of_sync? + * in that case we must not set RQ_NET_PENDING. */ + + mod_rq_state(req, m, RQ_COMPLETION_SUSP, RQ_NET_QUEUED|RQ_NET_PENDING); + if (req->w.cb) { + /* w.cb expected to be w_send_dblock, or w_send_read_req */ + drbd_queue_work(&connection->sender_work, + &req->w); + rv = req->rq_state & RQ_WRITE ? MR_WRITE : MR_READ; + } /* else: FIXME can this happen? */ + break; + } + /* else, fall through to BARRIER_ACKED */ + + case BARRIER_ACKED: + /* barrier ack for READ requests does not make sense */ + if (!(req->rq_state & RQ_WRITE)) + break; + + if (req->rq_state & RQ_NET_PENDING) { + /* barrier came in before all requests were acked. + * this is bad, because if the connection is lost now, + * we won't be able to clean them up... */ + drbd_err(device, "FIXME (BARRIER_ACKED but pending)\n"); + } + /* Allowed to complete requests, even while suspended. + * As this is called for all requests within a matching epoch, + * we need to filter, and only set RQ_NET_DONE for those that + * have actually been on the wire. */ + mod_rq_state(req, m, RQ_COMPLETION_SUSP, + (req->rq_state & RQ_NET_MASK) ? RQ_NET_DONE : 0); + break; + + case DATA_RECEIVED: + D_ASSERT(device, req->rq_state & RQ_NET_PENDING); + mod_rq_state(req, m, RQ_NET_PENDING, RQ_NET_OK|RQ_NET_DONE); + break; + + case QUEUE_AS_DRBD_BARRIER: + start_new_tl_epoch(connection); + mod_rq_state(req, m, 0, RQ_NET_OK|RQ_NET_DONE); + break; + }; + + return rv; +} + +/* we may do a local read if: + * - we are consistent (of course), + * - or we are generally inconsistent, + * BUT we are still/already IN SYNC for this area. 
+ * since size may be bigger than BM_BLOCK_SIZE, + * we may need to check several bits. + */ +static bool drbd_may_do_local_read(struct drbd_device *device, sector_t sector, int size) +{ + unsigned long sbnr, ebnr; + sector_t esector, nr_sectors; + + if (device->state.disk == D_UP_TO_DATE) + return true; + if (device->state.disk != D_INCONSISTENT) + return false; + esector = sector + (size >> 9) - 1; + nr_sectors = drbd_get_capacity(device->this_bdev); + D_ASSERT(device, sector < nr_sectors); + D_ASSERT(device, esector < nr_sectors); + + sbnr = BM_SECT_TO_BIT(sector); + ebnr = BM_SECT_TO_BIT(esector); + + return drbd_bm_count_bits(device, sbnr, ebnr) == 0; +} + +static bool remote_due_to_read_balancing(struct drbd_device *device, sector_t sector, + enum drbd_read_balancing rbm) +{ + struct backing_dev_info *bdi; + int stripe_shift; + + switch (rbm) { + case RB_CONGESTED_REMOTE: + bdi = &device->ldev->backing_bdev->bd_disk->queue->backing_dev_info; + return bdi_read_congested(bdi); + case RB_LEAST_PENDING: + return atomic_read(&device->local_cnt) > + atomic_read(&device->ap_pending_cnt) + atomic_read(&device->rs_pending_cnt); + case RB_32K_STRIPING: /* stripe_shift = 15 */ + case RB_64K_STRIPING: + case RB_128K_STRIPING: + case RB_256K_STRIPING: + case RB_512K_STRIPING: + case RB_1M_STRIPING: /* stripe_shift = 20 */ + stripe_shift = (rbm - RB_32K_STRIPING + 15); + return (sector >> (stripe_shift - 9)) & 1; + case RB_ROUND_ROBIN: + return test_and_change_bit(READ_BALANCE_RR, &device->flags); + case RB_PREFER_REMOTE: + return true; + case RB_PREFER_LOCAL: + default: + return false; + } +} + +/* + * complete_conflicting_writes - wait for any conflicting write requests + * + * The write_requests tree contains all active write requests which we + * currently know about. Wait for any requests to complete which conflict with + * the new one. + * + * Only way out: remove the conflicting intervals from the tree. + */ +static void complete_conflicting_writes(struct drbd_request *req) +{ + DEFINE_WAIT(wait); + struct drbd_device *device = req->device; + struct drbd_interval *i; + sector_t sector = req->i.sector; + int size = req->i.size; + + i = drbd_find_overlap(&device->write_requests, sector, size); + if (!i) + return; + + for (;;) { + prepare_to_wait(&device->misc_wait, &wait, TASK_UNINTERRUPTIBLE); + i = drbd_find_overlap(&device->write_requests, sector, size); + if (!i) + break; + /* Indicate to wake up device->misc_wait on progress. */ + i->waiting = true; + spin_unlock_irq(&device->resource->req_lock); + schedule(); + spin_lock_irq(&device->resource->req_lock); + } + finish_wait(&device->misc_wait, &wait); +} + +/* called within req_lock and rcu_read_lock() */ +static void maybe_pull_ahead(struct drbd_device *device) +{ + struct drbd_connection *connection = first_peer_device(device)->connection; + struct net_conf *nc; + bool congested = false; + enum drbd_on_congestion on_congestion; + + rcu_read_lock(); + nc = rcu_dereference(connection->net_conf); + on_congestion = nc ? nc->on_congestion : OC_BLOCK; + rcu_read_unlock(); + if (on_congestion == OC_BLOCK || + connection->agreed_pro_version < 96) + return; + + if (on_congestion == OC_PULL_AHEAD && device->state.conn == C_AHEAD) + return; /* nothing to do ... */ + + /* If I don't even have good local storage, we can not reasonably try + * to pull ahead of the peer. We also need the local reference to make + * sure device->act_log is there. 
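+	 *
+	 * For reference, a minimal sketch of how the thresholds consulted
+	 * below are usually configured (assuming the commonly documented
+	 * drbd 8.4 net option names; the values are only illustrative):
+	 *
+	 *	net {
+	 *		on-congestion		pull-ahead;
+	 *		congestion-fill		1G;
+	 *		congestion-extents	1000;
+	 *	}
+	 *
+	 * on-congestion maps to nc->on_congestion, congestion-fill to
+	 * nc->cong_fill and congestion-extents to nc->cong_extents.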
+ */ + if (!get_ldev_if_state(device, D_UP_TO_DATE)) + return; + + if (nc->cong_fill && + atomic_read(&device->ap_in_flight) >= nc->cong_fill) { + drbd_info(device, "Congestion-fill threshold reached\n"); + congested = true; + } + + if (device->act_log->used >= nc->cong_extents) { + drbd_info(device, "Congestion-extents threshold reached\n"); + congested = true; + } + + if (congested) { + /* start a new epoch for non-mirrored writes */ + start_new_tl_epoch(first_peer_device(device)->connection); + + if (on_congestion == OC_PULL_AHEAD) + _drbd_set_state(_NS(device, conn, C_AHEAD), 0, NULL); + else /*nc->on_congestion == OC_DISCONNECT */ + _drbd_set_state(_NS(device, conn, C_DISCONNECTING), 0, NULL); + } + put_ldev(device); +} + +/* If this returns false, and req->private_bio is still set, + * this should be submitted locally. + * + * If it returns false, but req->private_bio is not set, + * we do not have access to good data :( + * + * Otherwise, this destroys req->private_bio, if any, + * and returns true. + */ +static bool do_remote_read(struct drbd_request *req) +{ + struct drbd_device *device = req->device; + enum drbd_read_balancing rbm; + + if (req->private_bio) { + if (!drbd_may_do_local_read(device, + req->i.sector, req->i.size)) { + bio_put(req->private_bio); + req->private_bio = NULL; + put_ldev(device); + } + } + + if (device->state.pdsk != D_UP_TO_DATE) + return false; + + if (req->private_bio == NULL) + return true; + + /* TODO: improve read balancing decisions, take into account drbd + * protocol, pending requests etc. */ + + rcu_read_lock(); + rbm = rcu_dereference(device->ldev->disk_conf)->read_balancing; + rcu_read_unlock(); + + if (rbm == RB_PREFER_LOCAL && req->private_bio) + return false; /* submit locally */ + + if (remote_due_to_read_balancing(device, req->i.sector, rbm)) { + if (req->private_bio) { + bio_put(req->private_bio); + req->private_bio = NULL; + put_ldev(device); + } + return true; + } + + return false; +} + +/* returns number of connections (== 1, for drbd 8.4) + * expected to actually write this data, + * which does NOT include those that we are L_AHEAD for. */ +static int drbd_process_write_request(struct drbd_request *req) +{ + struct drbd_device *device = req->device; + int remote, send_oos; + + remote = drbd_should_do_remote(device->state); + send_oos = drbd_should_send_out_of_sync(device->state); + + /* Need to replicate writes. Unless it is an empty flush, + * which is better mapped to a DRBD P_BARRIER packet, + * also for drbd wire protocol compatibility reasons. + * If this was a flush, just start a new epoch. + * Unless the current epoch was empty anyways, or we are not currently + * replicating, in which case there is no point. */ + if (unlikely(req->i.size == 0)) { + /* The only size==0 bios we expect are empty flushes. 
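+		 * (blkdev_issue_flush() and friends submit a bio that carries
+		 * no data but has the flush flag set, which is what the
+		 * REQ_FLUSH assert below checks for.)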
*/ + D_ASSERT(device, req->master_bio->bi_rw & REQ_FLUSH); + if (remote) + _req_mod(req, QUEUE_AS_DRBD_BARRIER); + return remote; + } + + if (!remote && !send_oos) + return 0; + + D_ASSERT(device, !(remote && send_oos)); + + if (remote) { + _req_mod(req, TO_BE_SENT); + _req_mod(req, QUEUE_FOR_NET_WRITE); + } else if (drbd_set_out_of_sync(device, req->i.sector, req->i.size)) + _req_mod(req, QUEUE_FOR_SEND_OOS); + + return remote; +} + +static void +drbd_submit_req_private_bio(struct drbd_request *req) +{ + struct drbd_device *device = req->device; + struct bio *bio = req->private_bio; + const int rw = bio_rw(bio); + + bio->bi_bdev = device->ldev->backing_bdev; + + /* State may have changed since we grabbed our reference on the + * ->ldev member. Double check, and short-circuit to endio. + * In case the last activity log transaction failed to get on + * stable storage, and this is a WRITE, we may not even submit + * this bio. */ + if (get_ldev(device)) { + req->pre_submit_jif = jiffies; + if (drbd_insert_fault(device, + rw == WRITE ? DRBD_FAULT_DT_WR + : rw == READ ? DRBD_FAULT_DT_RD + : DRBD_FAULT_DT_RA)) + bio_endio(bio, -EIO); + else + generic_make_request(bio); + put_ldev(device); + } else + bio_endio(bio, -EIO); +} + +static void drbd_queue_write(struct drbd_device *device, struct drbd_request *req) +{ + spin_lock_irq(&device->resource->req_lock); + list_add_tail(&req->tl_requests, &device->submit.writes); + list_add_tail(&req->req_pending_master_completion, + &device->pending_master_completion[1 /* WRITE */]); + spin_unlock_irq(&device->resource->req_lock); + queue_work(device->submit.wq, &device->submit.worker); + /* do_submit() may sleep internally on al_wait, too */ + wake_up(&device->al_wait); +} + +/* returns the new drbd_request pointer, if the caller is expected to + * drbd_send_and_submit() it (to save latency), or NULL if we queued the + * request on the submitter thread. + * Returns ERR_PTR(-ENOMEM) if we cannot allocate a drbd_request. + */ +static struct drbd_request * +drbd_request_prepare(struct drbd_device *device, struct bio *bio, unsigned long start_jif) +{ + const int rw = bio_data_dir(bio); + struct drbd_request *req; + + /* allocate outside of all locks; */ + req = drbd_req_new(device, bio); + if (!req) { + dec_ap_bio(device); + /* only pass the error to the upper layers. + * if user cannot handle io errors, that's not our business. */ + drbd_err(device, "could not kmalloc() req\n"); + bio_endio(bio, -ENOMEM); + return ERR_PTR(-ENOMEM); + } + req->start_jif = start_jif; + + if (!get_ldev(device)) { + bio_put(req->private_bio); + req->private_bio = NULL; + } + + /* Update disk stats */ + _drbd_start_io_acct(device, req); + + if (rw == WRITE && req->private_bio && req->i.size + && !test_bit(AL_SUSPENDED, &device->flags)) { + if (!drbd_al_begin_io_fastpath(device, &req->i)) { + atomic_inc(&device->ap_actlog_cnt); + drbd_queue_write(device, req); + return NULL; + } + req->rq_state |= RQ_IN_ACT_LOG; + req->in_actlog_jif = jiffies; + } + + return req; +} + +static void drbd_send_and_submit(struct drbd_device *device, struct drbd_request *req) +{ + struct drbd_resource *resource = device->resource; + const int rw = bio_rw(req->master_bio); + struct bio_and_error m = { NULL, }; + bool no_remote = false; + bool submit_private_bio = false; + + spin_lock_irq(&resource->req_lock); + if (rw == WRITE) { + /* This may temporarily give up the req_lock, + * but will re-aquire it before it returns here. 
+ * Needs to be before the check on drbd_suspended() */ + complete_conflicting_writes(req); + /* no more giving up req_lock from now on! */ + + /* check for congestion, and potentially stop sending + * full data updates, but start sending "dirty bits" only. */ + maybe_pull_ahead(device); + } + + + if (drbd_suspended(device)) { + /* push back and retry: */ + req->rq_state |= RQ_POSTPONED; + if (req->private_bio) { + bio_put(req->private_bio); + req->private_bio = NULL; + put_ldev(device); + } + goto out; + } + + /* We fail READ/READA early, if we can not serve it. + * We must do this before req is registered on any lists. + * Otherwise, drbd_req_complete() will queue failed READ for retry. */ + if (rw != WRITE) { + if (!do_remote_read(req) && !req->private_bio) + goto nodata; + } + + /* which transfer log epoch does this belong to? */ + req->epoch = atomic_read(&first_peer_device(device)->connection->current_tle_nr); + + /* no point in adding empty flushes to the transfer log, + * they are mapped to drbd barriers already. */ + if (likely(req->i.size!=0)) { + if (rw == WRITE) + first_peer_device(device)->connection->current_tle_writes++; + + list_add_tail(&req->tl_requests, &first_peer_device(device)->connection->transfer_log); + } + + if (rw == WRITE) { + if (!drbd_process_write_request(req)) + no_remote = true; + } else { + /* We either have a private_bio, or we can read from remote. + * Otherwise we had done the goto nodata above. */ + if (req->private_bio == NULL) { + _req_mod(req, TO_BE_SENT); + _req_mod(req, QUEUE_FOR_NET_READ); + } else + no_remote = true; + } + + /* If it took the fast path in drbd_request_prepare, add it here. + * The slow path has added it already. */ + if (list_empty(&req->req_pending_master_completion)) + list_add_tail(&req->req_pending_master_completion, + &device->pending_master_completion[rw == WRITE]); + if (req->private_bio) { + /* needs to be marked within the same spinlock */ + list_add_tail(&req->req_pending_local, + &device->pending_completion[rw == WRITE]); + _req_mod(req, TO_BE_SUBMITTED); + /* but we need to give up the spinlock to submit */ + submit_private_bio = true; + } else if (no_remote) { +nodata: + if (__ratelimit(&drbd_ratelimit_state)) + drbd_err(device, "IO ERROR: neither local nor remote data, sector %llu+%u\n", + (unsigned long long)req->i.sector, req->i.size >> 9); + /* A write may have been queued for send_oos, however. + * So we can not simply free it, we must go through drbd_req_put_completion_ref() */ + } + +out: + if (drbd_req_put_completion_ref(req, &m, 1)) + kref_put(&req->kref, drbd_req_destroy); + spin_unlock_irq(&resource->req_lock); + + /* Even though above is a kref_put(), this is safe. + * As long as we still need to submit our private bio, + * we hold a completion ref, and the request cannot disappear. + * If however this request did not even have a private bio to submit + * (e.g. remote read), req may already be invalid now. + * That's why we cannot check on req->private_bio. 
*/ + if (submit_private_bio) + drbd_submit_req_private_bio(req); + if (m.bio) + complete_master_bio(device, &m); +} + +void __drbd_make_request(struct drbd_device *device, struct bio *bio, unsigned long start_jif) +{ + struct drbd_request *req = drbd_request_prepare(device, bio, start_jif); + if (IS_ERR_OR_NULL(req)) + return; + drbd_send_and_submit(device, req); +} + +static void submit_fast_path(struct drbd_device *device, struct list_head *incoming) +{ + struct drbd_request *req, *tmp; + list_for_each_entry_safe(req, tmp, incoming, tl_requests) { + const int rw = bio_data_dir(req->master_bio); + + if (rw == WRITE /* rw != WRITE should not even end up here! */ + && req->private_bio && req->i.size + && !test_bit(AL_SUSPENDED, &device->flags)) { + if (!drbd_al_begin_io_fastpath(device, &req->i)) + continue; + + req->rq_state |= RQ_IN_ACT_LOG; + req->in_actlog_jif = jiffies; + atomic_dec(&device->ap_actlog_cnt); + } + + list_del_init(&req->tl_requests); + drbd_send_and_submit(device, req); + } +} + +static bool prepare_al_transaction_nonblock(struct drbd_device *device, + struct list_head *incoming, + struct list_head *pending, + struct list_head *later) +{ + struct drbd_request *req, *tmp; + int wake = 0; + int err; + + spin_lock_irq(&device->al_lock); + list_for_each_entry_safe(req, tmp, incoming, tl_requests) { + err = drbd_al_begin_io_nonblock(device, &req->i); + if (err == -ENOBUFS) + break; + if (err == -EBUSY) + wake = 1; + if (err) + list_move_tail(&req->tl_requests, later); + else + list_move_tail(&req->tl_requests, pending); + } + spin_unlock_irq(&device->al_lock); + if (wake) + wake_up(&device->al_wait); + return !list_empty(pending); +} + +void send_and_submit_pending(struct drbd_device *device, struct list_head *pending) +{ + struct drbd_request *req, *tmp; + + list_for_each_entry_safe(req, tmp, pending, tl_requests) { + req->rq_state |= RQ_IN_ACT_LOG; + req->in_actlog_jif = jiffies; + atomic_dec(&device->ap_actlog_cnt); + list_del_init(&req->tl_requests); + drbd_send_and_submit(device, req); + } +} + +void do_submit(struct work_struct *ws) +{ + struct drbd_device *device = container_of(ws, struct drbd_device, submit.worker); + LIST_HEAD(incoming); /* from drbd_make_request() */ + LIST_HEAD(pending); /* to be submitted after next AL-transaction commit */ + LIST_HEAD(busy); /* blocked by resync requests */ + + /* grab new incoming requests */ + spin_lock_irq(&device->resource->req_lock); + list_splice_tail_init(&device->submit.writes, &incoming); + spin_unlock_irq(&device->resource->req_lock); + + for (;;) { + DEFINE_WAIT(wait); + + /* move used-to-be-busy back to front of incoming */ + list_splice_init(&busy, &incoming); + submit_fast_path(device, &incoming); + if (list_empty(&incoming)) + break; + + for (;;) { + prepare_to_wait(&device->al_wait, &wait, TASK_UNINTERRUPTIBLE); + + list_splice_init(&busy, &incoming); + prepare_al_transaction_nonblock(device, &incoming, &pending, &busy); + if (!list_empty(&pending)) + break; + + schedule(); + + /* If all currently "hot" activity log extents are kept busy by + * incoming requests, we still must not totally starve new + * requests to "cold" extents. + * Something left on &incoming means there had not been + * enough update slots available, and the activity log + * has been marked as "starving". + * + * Try again now, without looking for new requests, + * effectively blocking all new requests until we made + * at least _some_ progress with what we currently have. 
+ */ + if (!list_empty(&incoming)) + continue; + + /* Nothing moved to pending, but nothing left + * on incoming: all moved to busy! + * Grab new and iterate. */ + spin_lock_irq(&device->resource->req_lock); + list_splice_tail_init(&device->submit.writes, &incoming); + spin_unlock_irq(&device->resource->req_lock); + } + finish_wait(&device->al_wait, &wait); + + /* If the transaction was full, before all incoming requests + * had been processed, skip ahead to commit, and iterate + * without splicing in more incoming requests from upper layers. + * + * Else, if all incoming have been processed, + * they have become either "pending" (to be submitted after + * next transaction commit) or "busy" (blocked by resync). + * + * Maybe more was queued, while we prepared the transaction? + * Try to stuff those into this transaction as well. + * Be strictly non-blocking here, + * we already have something to commit. + * + * Commit if we don't make any more progres. + */ + + while (list_empty(&incoming)) { + LIST_HEAD(more_pending); + LIST_HEAD(more_incoming); + bool made_progress; + + /* It is ok to look outside the lock, + * it's only an optimization anyways */ + if (list_empty(&device->submit.writes)) + break; + + spin_lock_irq(&device->resource->req_lock); + list_splice_tail_init(&device->submit.writes, &more_incoming); + spin_unlock_irq(&device->resource->req_lock); + + if (list_empty(&more_incoming)) + break; + + made_progress = prepare_al_transaction_nonblock(device, &more_incoming, &more_pending, &busy); + + list_splice_tail_init(&more_pending, &pending); + list_splice_tail_init(&more_incoming, &incoming); + if (!made_progress) + break; + } + + drbd_al_begin_io_commit(device); + send_and_submit_pending(device, &pending); + } +} + +void drbd_make_request(struct request_queue *q, struct bio *bio) +{ + struct drbd_device *device = (struct drbd_device *) q->queuedata; + unsigned long start_jif; + + start_jif = jiffies; + + /* + * what we "blindly" assume: + */ + D_ASSERT(device, IS_ALIGNED(bio->bi_iter.bi_size, 512)); + + inc_ap_bio(device); + __drbd_make_request(device, bio, start_jif); +} + +/* This is called by bio_add_page(). + * + * q->max_hw_sectors and other global limits are already enforced there. + * + * We need to call down to our lower level device, + * in case it has special restrictions. + * + * We also may need to enforce configured max-bio-bvecs limits. + * + * As long as the BIO is empty we have to allow at least one bvec, + * regardless of size and offset, so no need to ask lower levels. 
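+ *
+ * The limit computed below is in bytes: start from DRBD_MAX_BIO_SIZE,
+ * honour the backing queue's merge_bvec_fn if it provides one, and
+ * finally clamp to this queue's max_hw_sectors (converted to bytes).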
+ */ +int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct bio_vec *bvec) +{ + struct drbd_device *device = (struct drbd_device *) q->queuedata; + unsigned int bio_size = bvm->bi_size; + int limit = DRBD_MAX_BIO_SIZE; + int backing_limit; + + if (bio_size && get_ldev(device)) { + unsigned int max_hw_sectors = queue_max_hw_sectors(q); + struct request_queue * const b = + device->ldev->backing_bdev->bd_disk->queue; + if (b->merge_bvec_fn) { + bvm->bi_bdev = device->ldev->backing_bdev; + backing_limit = b->merge_bvec_fn(b, bvm, bvec); + limit = min(limit, backing_limit); + } + put_ldev(device); + if ((limit >> 9) > max_hw_sectors) + limit = max_hw_sectors << 9; + } + return limit; +} + +void request_timer_fn(unsigned long data) +{ + struct drbd_device *device = (struct drbd_device *) data; + struct drbd_connection *connection = first_peer_device(device)->connection; + struct drbd_request *req_read, *req_write, *req_peer; /* oldest request */ + struct net_conf *nc; + unsigned long oldest_submit_jif; + unsigned long ent = 0, dt = 0, et, nt; /* effective timeout = ko_count * timeout */ + unsigned long now; + + rcu_read_lock(); + nc = rcu_dereference(connection->net_conf); + if (nc && device->state.conn >= C_WF_REPORT_PARAMS) + ent = nc->timeout * HZ/10 * nc->ko_count; + + if (get_ldev(device)) { /* implicit state.disk >= D_INCONSISTENT */ + dt = rcu_dereference(device->ldev->disk_conf)->disk_timeout * HZ / 10; + put_ldev(device); + } + rcu_read_unlock(); + + et = min_not_zero(dt, ent); + + if (!et) + return; /* Recurring timer stopped */ + + now = jiffies; + nt = now + et; + + spin_lock_irq(&device->resource->req_lock); + req_read = list_first_entry_or_null(&device->pending_completion[0], struct drbd_request, req_pending_local); + req_write = list_first_entry_or_null(&device->pending_completion[1], struct drbd_request, req_pending_local); + req_peer = connection->req_not_net_done; + /* maybe the oldest request waiting for the peer is in fact still + * blocking in tcp sendmsg */ + if (!req_peer && connection->req_next && connection->req_next->pre_send_jif) + req_peer = connection->req_next; + + /* evaluate the oldest peer request only in one timer! */ + if (req_peer && req_peer->device != device) + req_peer = NULL; + + /* do we have something to evaluate? */ + if (req_peer == NULL && req_write == NULL && req_read == NULL) + goto out; + + oldest_submit_jif = + (req_write && req_read) + ? ( time_before(req_write->pre_submit_jif, req_read->pre_submit_jif) + ? req_write->pre_submit_jif : req_read->pre_submit_jif ) + : req_write ? req_write->pre_submit_jif + : req_read ? req_read->pre_submit_jif : now; + + /* The request is considered timed out, if + * - we have some effective timeout from the configuration, + * with above state restrictions applied, + * - the oldest request is waiting for a response from the network + * resp. the local disk, + * - the oldest request is in fact older than the effective timeout, + * - the connection was established (resp. disk was attached) + * for longer than the timeout already. + * Note that for 32bit jiffies and very stable connections/disks, + * we may have a wrap around, which is catched by + * !time_in_range(now, last_..._jif, last_..._jif + timeout). + * + * Side effect: once per 32bit wrap-around interval, which means every + * ~198 days with 250 HZ, we have a window where the timeout would need + * to expire twice (worst case) to become effective. Good enough. 
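+	 *
+	 * Worked example (values are illustrative, not necessarily the
+	 * defaults): with net_conf timeout = 60 (unit 0.1s, i.e. 6 seconds)
+	 * and ko-count = 7, ent = 60 * HZ/10 * 7 = 42 seconds worth of
+	 * jiffies; with disk-timeout = 0 the disk check is skipped and
+	 * et = min_not_zero(dt, ent) = ent.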
+ */ + if (ent && req_peer && + time_after(now, req_peer->pre_send_jif + ent) && + !time_in_range(now, connection->last_reconnect_jif, connection->last_reconnect_jif + ent)) { + drbd_warn(device, "Remote failed to finish a request within ko-count * timeout\n"); + _conn_request_state(connection, NS(conn, C_TIMEOUT), CS_VERBOSE | CS_HARD); + } + if (dt && oldest_submit_jif != now && + time_after(now, oldest_submit_jif + dt) && + !time_in_range(now, device->last_reattach_jif, device->last_reattach_jif + dt)) { + drbd_warn(device, "Local backing device failed to meet the disk-timeout\n"); + __drbd_chk_io_error(device, DRBD_FORCE_DETACH); + } + + /* Reschedule timer for the nearest not already expired timeout. + * Fallback to now + min(effective network timeout, disk timeout). */ + ent = (ent && req_peer && time_before(now, req_peer->pre_send_jif + ent)) + ? req_peer->pre_send_jif + ent : now + et; + dt = (dt && oldest_submit_jif != now && time_before(now, oldest_submit_jif + dt)) + ? oldest_submit_jif + dt : now + et; + nt = time_before(ent, dt) ? ent : dt; +out: + spin_unlock_irq(&device->resource->req_lock); + mod_timer(&device->request_timer, nt); +} diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h new file mode 100644 index 000000000..9f6a04080 --- /dev/null +++ b/drivers/block/drbd/drbd_req.h @@ -0,0 +1,351 @@ +/* + drbd_req.h + + This file is part of DRBD by Philipp Reisner and Lars Ellenberg. + + Copyright (C) 2006-2008, LINBIT Information Technologies GmbH. + Copyright (C) 2006-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. + Copyright (C) 2006-2008, Philipp Reisner <philipp.reisner@linbit.com>. + + DRBD is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + DRBD is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#ifndef _DRBD_REQ_H +#define _DRBD_REQ_H + +#include <linux/module.h> + +#include <linux/slab.h> +#include <linux/drbd.h> +#include "drbd_int.h" + +/* The request callbacks will be called in irq context by the IDE drivers, + and in Softirqs/Tasklets/BH context by the SCSI drivers, + and by the receiver and worker in kernel-thread context. + Try to get the locking right :) */ + +/* + * Objects of type struct drbd_request do only exist on a R_PRIMARY node, and are + * associated with IO requests originating from the block layer above us. + * + * There are quite a few things that may happen to a drbd request + * during its lifetime. + * + * It will be created. + * It will be marked with the intention to be + * submitted to local disk and/or + * send via the network. + * + * It has to be placed on the transfer log and other housekeeping lists, + * In case we have a network connection. + * + * It may be identified as a concurrent (write) request + * and be handled accordingly. + * + * It may me handed over to the local disk subsystem. + * It may be completed by the local disk subsystem, + * either successfully or with io-error. 
+ * In case it is a READ request, and it failed locally, + * it may be retried remotely. + * + * It may be queued for sending. + * It may be handed over to the network stack, + * which may fail. + * It may be acknowledged by the "peer" according to the wire_protocol in use. + * this may be a negative ack. + * It may receive a faked ack when the network connection is lost and the + * transfer log is cleaned up. + * Sending may be canceled due to network connection loss. + * When it finally has outlived its time, + * corresponding dirty bits in the resync-bitmap may be cleared or set, + * it will be destroyed, + * and completion will be signalled to the originator, + * with or without "success". + */ + +enum drbd_req_event { + CREATED, + TO_BE_SENT, + TO_BE_SUBMITTED, + + /* XXX yes, now I am inconsistent... + * these are not "events" but "actions" + * oh, well... */ + QUEUE_FOR_NET_WRITE, + QUEUE_FOR_NET_READ, + QUEUE_FOR_SEND_OOS, + + /* An empty flush is queued as P_BARRIER, + * which will cause it to complete "successfully", + * even if the local disk flush failed. + * + * Just like "real" requests, empty flushes (blkdev_issue_flush()) will + * only see an error if neither local nor remote data is reachable. */ + QUEUE_AS_DRBD_BARRIER, + + SEND_CANCELED, + SEND_FAILED, + HANDED_OVER_TO_NETWORK, + OOS_HANDED_TO_NETWORK, + CONNECTION_LOST_WHILE_PENDING, + READ_RETRY_REMOTE_CANCELED, + RECV_ACKED_BY_PEER, + WRITE_ACKED_BY_PEER, + WRITE_ACKED_BY_PEER_AND_SIS, /* and set_in_sync */ + CONFLICT_RESOLVED, + POSTPONE_WRITE, + NEG_ACKED, + BARRIER_ACKED, /* in protocol A and B */ + DATA_RECEIVED, /* (remote read) */ + + COMPLETED_OK, + READ_COMPLETED_WITH_ERROR, + READ_AHEAD_COMPLETED_WITH_ERROR, + WRITE_COMPLETED_WITH_ERROR, + DISCARD_COMPLETED_NOTSUPP, + DISCARD_COMPLETED_WITH_ERROR, + + ABORT_DISK_IO, + RESEND, + FAIL_FROZEN_DISK_IO, + RESTART_FROZEN_DISK_IO, + NOTHING, +}; + +/* encoding of request states for now. we don't actually need that many bits. + * we don't need to do atomic bit operations either, since most of the time we + * need to look at the connection state and/or manipulate some lists at the + * same time, so we should hold the request lock anyways. + */ +enum drbd_req_state_bits { + /* 3210 + * 0000: no local possible + * 0001: to be submitted + * UNUSED, we could map: 011: submitted, completion still pending + * 0110: completed ok + * 0010: completed with error + * 1001: Aborted (before completion) + * 1x10: Aborted and completed -> free + */ + __RQ_LOCAL_PENDING, + __RQ_LOCAL_COMPLETED, + __RQ_LOCAL_OK, + __RQ_LOCAL_ABORTED, + + /* 87654 + * 00000: no network possible + * 00001: to be send + * 00011: to be send, on worker queue + * 00101: sent, expecting recv_ack (B) or write_ack (C) + * 11101: sent, + * recv_ack (B) or implicit "ack" (A), + * still waiting for the barrier ack. + * master_bio may already be completed and invalidated. + * 11100: write acked (C), + * data received (for remote read, any protocol) + * or finally the barrier ack has arrived (B,A)... + * request can be freed + * 01100: neg-acked (write, protocol C) + * or neg-d-acked (read, any protocol) + * or killed from the transfer log + * during cleanup after connection loss + * request can be freed + * 01000: canceled or send failed... + * request can be freed + */ + + /* if "SENT" is not set, yet, this can still fail or be canceled. + * if "SENT" is set already, we still wait for an Ack packet. + * when cleared, the master_bio may be completed. 
+ * in (B,A) the request object may still linger on the transaction log + * until the corresponding barrier ack comes in */ + __RQ_NET_PENDING, + + /* If it is QUEUED, and it is a WRITE, it is also registered in the + * transfer log. Currently we need this flag to avoid conflicts between + * worker canceling the request and tl_clear_barrier killing it from + * transfer log. We should restructure the code so this conflict does + * no longer occur. */ + __RQ_NET_QUEUED, + + /* well, actually only "handed over to the network stack". + * + * TODO can potentially be dropped because of the similar meaning + * of RQ_NET_SENT and ~RQ_NET_QUEUED. + * however it is not exactly the same. before we drop it + * we must ensure that we can tell a request with network part + * from a request without, regardless of what happens to it. */ + __RQ_NET_SENT, + + /* when set, the request may be freed (if RQ_NET_QUEUED is clear). + * basically this means the corresponding P_BARRIER_ACK was received */ + __RQ_NET_DONE, + + /* whether or not we know (C) or pretend (B,A) that the write + * was successfully written on the peer. + */ + __RQ_NET_OK, + + /* peer called drbd_set_in_sync() for this write */ + __RQ_NET_SIS, + + /* keep this last, its for the RQ_NET_MASK */ + __RQ_NET_MAX, + + /* Set when this is a write, clear for a read */ + __RQ_WRITE, + + /* Should call drbd_al_complete_io() for this request... */ + __RQ_IN_ACT_LOG, + + /* The peer has sent a retry ACK */ + __RQ_POSTPONED, + + /* would have been completed, + * but was not, because of drbd_suspended() */ + __RQ_COMPLETION_SUSP, + + /* We expect a receive ACK (wire proto B) */ + __RQ_EXP_RECEIVE_ACK, + + /* We expect a write ACK (wite proto C) */ + __RQ_EXP_WRITE_ACK, + + /* waiting for a barrier ack, did an extra kref_get */ + __RQ_EXP_BARR_ACK, +}; + +#define RQ_LOCAL_PENDING (1UL << __RQ_LOCAL_PENDING) +#define RQ_LOCAL_COMPLETED (1UL << __RQ_LOCAL_COMPLETED) +#define RQ_LOCAL_OK (1UL << __RQ_LOCAL_OK) +#define RQ_LOCAL_ABORTED (1UL << __RQ_LOCAL_ABORTED) + +#define RQ_LOCAL_MASK ((RQ_LOCAL_ABORTED << 1)-1) + +#define RQ_NET_PENDING (1UL << __RQ_NET_PENDING) +#define RQ_NET_QUEUED (1UL << __RQ_NET_QUEUED) +#define RQ_NET_SENT (1UL << __RQ_NET_SENT) +#define RQ_NET_DONE (1UL << __RQ_NET_DONE) +#define RQ_NET_OK (1UL << __RQ_NET_OK) +#define RQ_NET_SIS (1UL << __RQ_NET_SIS) + +/* 0x1f8 */ +#define RQ_NET_MASK (((1UL << __RQ_NET_MAX)-1) & ~RQ_LOCAL_MASK) + +#define RQ_WRITE (1UL << __RQ_WRITE) +#define RQ_IN_ACT_LOG (1UL << __RQ_IN_ACT_LOG) +#define RQ_POSTPONED (1UL << __RQ_POSTPONED) +#define RQ_COMPLETION_SUSP (1UL << __RQ_COMPLETION_SUSP) +#define RQ_EXP_RECEIVE_ACK (1UL << __RQ_EXP_RECEIVE_ACK) +#define RQ_EXP_WRITE_ACK (1UL << __RQ_EXP_WRITE_ACK) +#define RQ_EXP_BARR_ACK (1UL << __RQ_EXP_BARR_ACK) + +/* For waking up the frozen transfer log mod_req() has to return if the request + should be counted in the epoch object*/ +#define MR_WRITE 1 +#define MR_READ 2 + +static inline void drbd_req_make_private_bio(struct drbd_request *req, struct bio *bio_src) +{ + struct bio *bio; + bio = bio_clone(bio_src, GFP_NOIO); /* XXX cannot fail?? */ + + req->private_bio = bio; + + bio->bi_private = req; + bio->bi_end_io = drbd_request_endio; + bio->bi_next = NULL; +} + +/* Short lived temporary struct on the stack. + * We could squirrel the error to be returned into + * bio->bi_iter.bi_size, or similar. But that would be too ugly. 
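+ * Instead, __req_mod() fills in this struct so that the caller can invoke
+ * complete_master_bio() after dropping the req_lock, see req_mod() below.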
*/ +struct bio_and_error { + struct bio *bio; + int error; +}; + +extern void start_new_tl_epoch(struct drbd_connection *connection); +extern void drbd_req_destroy(struct kref *kref); +extern void _req_may_be_done(struct drbd_request *req, + struct bio_and_error *m); +extern int __req_mod(struct drbd_request *req, enum drbd_req_event what, + struct bio_and_error *m); +extern void complete_master_bio(struct drbd_device *device, + struct bio_and_error *m); +extern void request_timer_fn(unsigned long data); +extern void tl_restart(struct drbd_connection *connection, enum drbd_req_event what); +extern void _tl_restart(struct drbd_connection *connection, enum drbd_req_event what); +extern void tl_abort_disk_io(struct drbd_device *device); + +/* this is in drbd_main.c */ +extern void drbd_restart_request(struct drbd_request *req); + +/* use this if you don't want to deal with calling complete_master_bio() + * outside the spinlock, e.g. when walking some list on cleanup. */ +static inline int _req_mod(struct drbd_request *req, enum drbd_req_event what) +{ + struct drbd_device *device = req->device; + struct bio_and_error m; + int rv; + + /* __req_mod possibly frees req, do not touch req after that! */ + rv = __req_mod(req, what, &m); + if (m.bio) + complete_master_bio(device, &m); + + return rv; +} + +/* completion of master bio is outside of our spinlock. + * We still may or may not be inside some irqs disabled section + * of the lower level driver completion callback, so we need to + * spin_lock_irqsave here. */ +static inline int req_mod(struct drbd_request *req, + enum drbd_req_event what) +{ + unsigned long flags; + struct drbd_device *device = req->device; + struct bio_and_error m; + int rv; + + spin_lock_irqsave(&device->resource->req_lock, flags); + rv = __req_mod(req, what, &m); + spin_unlock_irqrestore(&device->resource->req_lock, flags); + + if (m.bio) + complete_master_bio(device, &m); + + return rv; +} + +static inline bool drbd_should_do_remote(union drbd_dev_state s) +{ + return s.pdsk == D_UP_TO_DATE || + (s.pdsk >= D_INCONSISTENT && + s.conn >= C_WF_BITMAP_T && + s.conn < C_AHEAD); + /* Before proto 96 that was >= CONNECTED instead of >= C_WF_BITMAP_T. + That is equivalent since before 96 IO was frozen in the C_WF_BITMAP* + states. */ +} +static inline bool drbd_should_send_out_of_sync(union drbd_dev_state s) +{ + return s.conn == C_AHEAD || s.conn == C_WF_BITMAP_S; + /* pdsk = D_INCONSISTENT as a consequence. Protocol 96 check not necessary + since we enter state C_AHEAD only if proto >= 96 */ +} + +#endif diff --git a/drivers/block/drbd/drbd_state.c b/drivers/block/drbd/drbd_state.c new file mode 100644 index 000000000..2d7dd269b --- /dev/null +++ b/drivers/block/drbd/drbd_state.c @@ -0,0 +1,1918 @@ +/* + drbd_state.c + + This file is part of DRBD by Philipp Reisner and Lars Ellenberg. + + Copyright (C) 2001-2008, LINBIT Information Technologies GmbH. + Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>. + Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. + + Thanks to Carter Burden, Bart Grantham and Gennadiy Nerubayev + from Logicworks, Inc. for making SDP replication support possible. + + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. 
+ + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include <linux/drbd_limits.h> +#include "drbd_int.h" +#include "drbd_protocol.h" +#include "drbd_req.h" + +struct after_state_chg_work { + struct drbd_work w; + struct drbd_device *device; + union drbd_state os; + union drbd_state ns; + enum chg_state_flags flags; + struct completion *done; +}; + +enum sanitize_state_warnings { + NO_WARNING, + ABORTED_ONLINE_VERIFY, + ABORTED_RESYNC, + CONNECTION_LOST_NEGOTIATING, + IMPLICITLY_UPGRADED_DISK, + IMPLICITLY_UPGRADED_PDSK, +}; + +static int w_after_state_ch(struct drbd_work *w, int unused); +static void after_state_ch(struct drbd_device *device, union drbd_state os, + union drbd_state ns, enum chg_state_flags flags); +static enum drbd_state_rv is_valid_state(struct drbd_device *, union drbd_state); +static enum drbd_state_rv is_valid_soft_transition(union drbd_state, union drbd_state, struct drbd_connection *); +static enum drbd_state_rv is_valid_transition(union drbd_state os, union drbd_state ns); +static union drbd_state sanitize_state(struct drbd_device *device, union drbd_state os, + union drbd_state ns, enum sanitize_state_warnings *warn); + +static inline bool is_susp(union drbd_state s) +{ + return s.susp || s.susp_nod || s.susp_fen; +} + +bool conn_all_vols_unconf(struct drbd_connection *connection) +{ + struct drbd_peer_device *peer_device; + bool rv = true; + int vnr; + + rcu_read_lock(); + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { + struct drbd_device *device = peer_device->device; + if (device->state.disk != D_DISKLESS || + device->state.conn != C_STANDALONE || + device->state.role != R_SECONDARY) { + rv = false; + break; + } + } + rcu_read_unlock(); + + return rv; +} + +/* Unfortunately the states where not correctly ordered, when + they where defined. therefore can not use max_t() here. 
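+   (E.g. with R_PRIMARY ordered numerically below R_SECONDARY, max_t()
+   would pick R_SECONDARY as the "higher" role of a Primary/Secondary
+   pair; hence the explicit comparisons below.)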
*/ +static enum drbd_role max_role(enum drbd_role role1, enum drbd_role role2) +{ + if (role1 == R_PRIMARY || role2 == R_PRIMARY) + return R_PRIMARY; + if (role1 == R_SECONDARY || role2 == R_SECONDARY) + return R_SECONDARY; + return R_UNKNOWN; +} +static enum drbd_role min_role(enum drbd_role role1, enum drbd_role role2) +{ + if (role1 == R_UNKNOWN || role2 == R_UNKNOWN) + return R_UNKNOWN; + if (role1 == R_SECONDARY || role2 == R_SECONDARY) + return R_SECONDARY; + return R_PRIMARY; +} + +enum drbd_role conn_highest_role(struct drbd_connection *connection) +{ + enum drbd_role role = R_UNKNOWN; + struct drbd_peer_device *peer_device; + int vnr; + + rcu_read_lock(); + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { + struct drbd_device *device = peer_device->device; + role = max_role(role, device->state.role); + } + rcu_read_unlock(); + + return role; +} + +enum drbd_role conn_highest_peer(struct drbd_connection *connection) +{ + enum drbd_role peer = R_UNKNOWN; + struct drbd_peer_device *peer_device; + int vnr; + + rcu_read_lock(); + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { + struct drbd_device *device = peer_device->device; + peer = max_role(peer, device->state.peer); + } + rcu_read_unlock(); + + return peer; +} + +enum drbd_disk_state conn_highest_disk(struct drbd_connection *connection) +{ + enum drbd_disk_state disk_state = D_DISKLESS; + struct drbd_peer_device *peer_device; + int vnr; + + rcu_read_lock(); + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { + struct drbd_device *device = peer_device->device; + disk_state = max_t(enum drbd_disk_state, disk_state, device->state.disk); + } + rcu_read_unlock(); + + return disk_state; +} + +enum drbd_disk_state conn_lowest_disk(struct drbd_connection *connection) +{ + enum drbd_disk_state disk_state = D_MASK; + struct drbd_peer_device *peer_device; + int vnr; + + rcu_read_lock(); + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { + struct drbd_device *device = peer_device->device; + disk_state = min_t(enum drbd_disk_state, disk_state, device->state.disk); + } + rcu_read_unlock(); + + return disk_state; +} + +enum drbd_disk_state conn_highest_pdsk(struct drbd_connection *connection) +{ + enum drbd_disk_state disk_state = D_DISKLESS; + struct drbd_peer_device *peer_device; + int vnr; + + rcu_read_lock(); + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { + struct drbd_device *device = peer_device->device; + disk_state = max_t(enum drbd_disk_state, disk_state, device->state.pdsk); + } + rcu_read_unlock(); + + return disk_state; +} + +enum drbd_conns conn_lowest_conn(struct drbd_connection *connection) +{ + enum drbd_conns conn = C_MASK; + struct drbd_peer_device *peer_device; + int vnr; + + rcu_read_lock(); + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { + struct drbd_device *device = peer_device->device; + conn = min_t(enum drbd_conns, conn, device->state.conn); + } + rcu_read_unlock(); + + return conn; +} + +static bool no_peer_wf_report_params(struct drbd_connection *connection) +{ + struct drbd_peer_device *peer_device; + int vnr; + bool rv = true; + + rcu_read_lock(); + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) + if (peer_device->device->state.conn == C_WF_REPORT_PARAMS) { + rv = false; + break; + } + rcu_read_unlock(); + + return rv; +} + +static void wake_up_all_devices(struct drbd_connection *connection) +{ + struct drbd_peer_device *peer_device; + int vnr; + + rcu_read_lock(); + 
idr_for_each_entry(&connection->peer_devices, peer_device, vnr) + wake_up(&peer_device->device->state_wait); + rcu_read_unlock(); + +} + + +/** + * cl_wide_st_chg() - true if the state change is a cluster wide one + * @device: DRBD device. + * @os: old (current) state. + * @ns: new (wanted) state. + */ +static int cl_wide_st_chg(struct drbd_device *device, + union drbd_state os, union drbd_state ns) +{ + return (os.conn >= C_CONNECTED && ns.conn >= C_CONNECTED && + ((os.role != R_PRIMARY && ns.role == R_PRIMARY) || + (os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) || + (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S) || + (os.disk != D_FAILED && ns.disk == D_FAILED))) || + (os.conn >= C_CONNECTED && ns.conn == C_DISCONNECTING) || + (os.conn == C_CONNECTED && ns.conn == C_VERIFY_S) || + (os.conn == C_CONNECTED && ns.conn == C_WF_REPORT_PARAMS); +} + +static union drbd_state +apply_mask_val(union drbd_state os, union drbd_state mask, union drbd_state val) +{ + union drbd_state ns; + ns.i = (os.i & ~mask.i) | val.i; + return ns; +} + +enum drbd_state_rv +drbd_change_state(struct drbd_device *device, enum chg_state_flags f, + union drbd_state mask, union drbd_state val) +{ + unsigned long flags; + union drbd_state ns; + enum drbd_state_rv rv; + + spin_lock_irqsave(&device->resource->req_lock, flags); + ns = apply_mask_val(drbd_read_state(device), mask, val); + rv = _drbd_set_state(device, ns, f, NULL); + spin_unlock_irqrestore(&device->resource->req_lock, flags); + + return rv; +} + +/** + * drbd_force_state() - Impose a change which happens outside our control on our state + * @device: DRBD device. + * @mask: mask of state bits to change. + * @val: value of new state bits. + */ +void drbd_force_state(struct drbd_device *device, + union drbd_state mask, union drbd_state val) +{ + drbd_change_state(device, CS_HARD, mask, val); +} + +static enum drbd_state_rv +_req_st_cond(struct drbd_device *device, union drbd_state mask, + union drbd_state val) +{ + union drbd_state os, ns; + unsigned long flags; + enum drbd_state_rv rv; + + if (test_and_clear_bit(CL_ST_CHG_SUCCESS, &device->flags)) + return SS_CW_SUCCESS; + + if (test_and_clear_bit(CL_ST_CHG_FAIL, &device->flags)) + return SS_CW_FAILED_BY_PEER; + + spin_lock_irqsave(&device->resource->req_lock, flags); + os = drbd_read_state(device); + ns = sanitize_state(device, os, apply_mask_val(os, mask, val), NULL); + rv = is_valid_transition(os, ns); + if (rv >= SS_SUCCESS) + rv = SS_UNKNOWN_ERROR; /* cont waiting, otherwise fail. */ + + if (!cl_wide_st_chg(device, os, ns)) + rv = SS_CW_NO_NEED; + if (rv == SS_UNKNOWN_ERROR) { + rv = is_valid_state(device, ns); + if (rv >= SS_SUCCESS) { + rv = is_valid_soft_transition(os, ns, first_peer_device(device)->connection); + if (rv >= SS_SUCCESS) + rv = SS_UNKNOWN_ERROR; /* cont waiting, otherwise fail. */ + } + } + spin_unlock_irqrestore(&device->resource->req_lock, flags); + + return rv; +} + +/** + * drbd_req_state() - Perform an eventually cluster wide state change + * @device: DRBD device. + * @mask: mask of state bits to change. + * @val: value of new state bits. + * @f: flags + * + * Should not be called directly, use drbd_request_state() or + * _drbd_request_state(). 
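+ *
+ * With CS_SERIALIZE set, the change is serialized on device->state_mutex;
+ * with CS_WAIT_COMPLETE it also waits until after_state_ch() has finished.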
+ */ +static enum drbd_state_rv +drbd_req_state(struct drbd_device *device, union drbd_state mask, + union drbd_state val, enum chg_state_flags f) +{ + struct completion done; + unsigned long flags; + union drbd_state os, ns; + enum drbd_state_rv rv; + + init_completion(&done); + + if (f & CS_SERIALIZE) + mutex_lock(device->state_mutex); + + spin_lock_irqsave(&device->resource->req_lock, flags); + os = drbd_read_state(device); + ns = sanitize_state(device, os, apply_mask_val(os, mask, val), NULL); + rv = is_valid_transition(os, ns); + if (rv < SS_SUCCESS) { + spin_unlock_irqrestore(&device->resource->req_lock, flags); + goto abort; + } + + if (cl_wide_st_chg(device, os, ns)) { + rv = is_valid_state(device, ns); + if (rv == SS_SUCCESS) + rv = is_valid_soft_transition(os, ns, first_peer_device(device)->connection); + spin_unlock_irqrestore(&device->resource->req_lock, flags); + + if (rv < SS_SUCCESS) { + if (f & CS_VERBOSE) + print_st_err(device, os, ns, rv); + goto abort; + } + + if (drbd_send_state_req(first_peer_device(device), mask, val)) { + rv = SS_CW_FAILED_BY_PEER; + if (f & CS_VERBOSE) + print_st_err(device, os, ns, rv); + goto abort; + } + + wait_event(device->state_wait, + (rv = _req_st_cond(device, mask, val))); + + if (rv < SS_SUCCESS) { + if (f & CS_VERBOSE) + print_st_err(device, os, ns, rv); + goto abort; + } + spin_lock_irqsave(&device->resource->req_lock, flags); + ns = apply_mask_val(drbd_read_state(device), mask, val); + rv = _drbd_set_state(device, ns, f, &done); + } else { + rv = _drbd_set_state(device, ns, f, &done); + } + + spin_unlock_irqrestore(&device->resource->req_lock, flags); + + if (f & CS_WAIT_COMPLETE && rv == SS_SUCCESS) { + D_ASSERT(device, current != first_peer_device(device)->connection->worker.task); + wait_for_completion(&done); + } + +abort: + if (f & CS_SERIALIZE) + mutex_unlock(device->state_mutex); + + return rv; +} + +/** + * _drbd_request_state() - Request a state change (with flags) + * @device: DRBD device. + * @mask: mask of state bits to change. + * @val: value of new state bits. + * @f: flags + * + * Cousin of drbd_request_state(), useful with the CS_WAIT_COMPLETE + * flag, or when logging of failed state change requests is not desired. + */ +enum drbd_state_rv +_drbd_request_state(struct drbd_device *device, union drbd_state mask, + union drbd_state val, enum chg_state_flags f) +{ + enum drbd_state_rv rv; + + wait_event(device->state_wait, + (rv = drbd_req_state(device, mask, val, f)) != SS_IN_TRANSIENT_STATE); + + return rv; +} + +enum drbd_state_rv +_drbd_request_state_holding_state_mutex(struct drbd_device *device, union drbd_state mask, + union drbd_state val, enum chg_state_flags f) +{ + enum drbd_state_rv rv; + + BUG_ON(f & CS_SERIALIZE); + + wait_event_cmd(device->state_wait, + (rv = drbd_req_state(device, mask, val, f)) != SS_IN_TRANSIENT_STATE, + mutex_unlock(device->state_mutex), + mutex_lock(device->state_mutex)); + + return rv; +} + +static void print_st(struct drbd_device *device, const char *name, union drbd_state ns) +{ + drbd_err(device, " %s = { cs:%s ro:%s/%s ds:%s/%s %c%c%c%c%c%c }\n", + name, + drbd_conn_str(ns.conn), + drbd_role_str(ns.role), + drbd_role_str(ns.peer), + drbd_disk_str(ns.disk), + drbd_disk_str(ns.pdsk), + is_susp(ns) ? 's' : 'r', + ns.aftr_isp ? 'a' : '-', + ns.peer_isp ? 'p' : '-', + ns.user_isp ? 'u' : '-', + ns.susp_fen ? 'F' : '-', + ns.susp_nod ? 
'N' : '-' + ); +} + +void print_st_err(struct drbd_device *device, union drbd_state os, + union drbd_state ns, enum drbd_state_rv err) +{ + if (err == SS_IN_TRANSIENT_STATE) + return; + drbd_err(device, "State change failed: %s\n", drbd_set_st_err_str(err)); + print_st(device, " state", os); + print_st(device, "wanted", ns); +} + +static long print_state_change(char *pb, union drbd_state os, union drbd_state ns, + enum chg_state_flags flags) +{ + char *pbp; + pbp = pb; + *pbp = 0; + + if (ns.role != os.role && flags & CS_DC_ROLE) + pbp += sprintf(pbp, "role( %s -> %s ) ", + drbd_role_str(os.role), + drbd_role_str(ns.role)); + if (ns.peer != os.peer && flags & CS_DC_PEER) + pbp += sprintf(pbp, "peer( %s -> %s ) ", + drbd_role_str(os.peer), + drbd_role_str(ns.peer)); + if (ns.conn != os.conn && flags & CS_DC_CONN) + pbp += sprintf(pbp, "conn( %s -> %s ) ", + drbd_conn_str(os.conn), + drbd_conn_str(ns.conn)); + if (ns.disk != os.disk && flags & CS_DC_DISK) + pbp += sprintf(pbp, "disk( %s -> %s ) ", + drbd_disk_str(os.disk), + drbd_disk_str(ns.disk)); + if (ns.pdsk != os.pdsk && flags & CS_DC_PDSK) + pbp += sprintf(pbp, "pdsk( %s -> %s ) ", + drbd_disk_str(os.pdsk), + drbd_disk_str(ns.pdsk)); + + return pbp - pb; +} + +static void drbd_pr_state_change(struct drbd_device *device, union drbd_state os, union drbd_state ns, + enum chg_state_flags flags) +{ + char pb[300]; + char *pbp = pb; + + pbp += print_state_change(pbp, os, ns, flags ^ CS_DC_MASK); + + if (ns.aftr_isp != os.aftr_isp) + pbp += sprintf(pbp, "aftr_isp( %d -> %d ) ", + os.aftr_isp, + ns.aftr_isp); + if (ns.peer_isp != os.peer_isp) + pbp += sprintf(pbp, "peer_isp( %d -> %d ) ", + os.peer_isp, + ns.peer_isp); + if (ns.user_isp != os.user_isp) + pbp += sprintf(pbp, "user_isp( %d -> %d ) ", + os.user_isp, + ns.user_isp); + + if (pbp != pb) + drbd_info(device, "%s\n", pb); +} + +static void conn_pr_state_change(struct drbd_connection *connection, union drbd_state os, union drbd_state ns, + enum chg_state_flags flags) +{ + char pb[300]; + char *pbp = pb; + + pbp += print_state_change(pbp, os, ns, flags); + + if (is_susp(ns) != is_susp(os) && flags & CS_DC_SUSP) + pbp += sprintf(pbp, "susp( %d -> %d ) ", + is_susp(os), + is_susp(ns)); + + if (pbp != pb) + drbd_info(connection, "%s\n", pb); +} + + +/** + * is_valid_state() - Returns an SS_ error code if ns is not valid + * @device: DRBD device. + * @ns: State to consider. 
+ */ +static enum drbd_state_rv +is_valid_state(struct drbd_device *device, union drbd_state ns) +{ + /* See drbd_state_sw_errors in drbd_strings.c */ + + enum drbd_fencing_p fp; + enum drbd_state_rv rv = SS_SUCCESS; + struct net_conf *nc; + + rcu_read_lock(); + fp = FP_DONT_CARE; + if (get_ldev(device)) { + fp = rcu_dereference(device->ldev->disk_conf)->fencing; + put_ldev(device); + } + + nc = rcu_dereference(first_peer_device(device)->connection->net_conf); + if (nc) { + if (!nc->two_primaries && ns.role == R_PRIMARY) { + if (ns.peer == R_PRIMARY) + rv = SS_TWO_PRIMARIES; + else if (conn_highest_peer(first_peer_device(device)->connection) == R_PRIMARY) + rv = SS_O_VOL_PEER_PRI; + } + } + + if (rv <= 0) + /* already found a reason to abort */; + else if (ns.role == R_SECONDARY && device->open_cnt) + rv = SS_DEVICE_IN_USE; + + else if (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.disk < D_UP_TO_DATE) + rv = SS_NO_UP_TO_DATE_DISK; + + else if (fp >= FP_RESOURCE && + ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk >= D_UNKNOWN) + rv = SS_PRIMARY_NOP; + + else if (ns.role == R_PRIMARY && ns.disk <= D_INCONSISTENT && ns.pdsk <= D_INCONSISTENT) + rv = SS_NO_UP_TO_DATE_DISK; + + else if (ns.conn > C_CONNECTED && ns.disk < D_INCONSISTENT) + rv = SS_NO_LOCAL_DISK; + + else if (ns.conn > C_CONNECTED && ns.pdsk < D_INCONSISTENT) + rv = SS_NO_REMOTE_DISK; + + else if (ns.conn > C_CONNECTED && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE) + rv = SS_NO_UP_TO_DATE_DISK; + + else if ((ns.conn == C_CONNECTED || + ns.conn == C_WF_BITMAP_S || + ns.conn == C_SYNC_SOURCE || + ns.conn == C_PAUSED_SYNC_S) && + ns.disk == D_OUTDATED) + rv = SS_CONNECTED_OUTDATES; + + else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && + (nc->verify_alg[0] == 0)) + rv = SS_NO_VERIFY_ALG; + + else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && + first_peer_device(device)->connection->agreed_pro_version < 88) + rv = SS_NOT_SUPPORTED; + + else if (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE) + rv = SS_NO_UP_TO_DATE_DISK; + + else if ((ns.conn == C_STARTING_SYNC_S || ns.conn == C_STARTING_SYNC_T) && + ns.pdsk == D_UNKNOWN) + rv = SS_NEED_CONNECTION; + + else if (ns.conn >= C_CONNECTED && ns.pdsk == D_UNKNOWN) + rv = SS_CONNECTED_OUTDATES; + + rcu_read_unlock(); + + return rv; +} + +/** + * is_valid_soft_transition() - Returns an SS_ error code if the state transition is not possible + * This function limits state transitions that may be declined by DRBD. I.e. + * user requests (aka soft transitions). + * @device: DRBD device. + * @ns: new state. + * @os: old state. + */ +static enum drbd_state_rv +is_valid_soft_transition(union drbd_state os, union drbd_state ns, struct drbd_connection *connection) +{ + enum drbd_state_rv rv = SS_SUCCESS; + + if ((ns.conn == C_STARTING_SYNC_T || ns.conn == C_STARTING_SYNC_S) && + os.conn > C_CONNECTED) + rv = SS_RESYNC_RUNNING; + + if (ns.conn == C_DISCONNECTING && os.conn == C_STANDALONE) + rv = SS_ALREADY_STANDALONE; + + if (ns.disk > D_ATTACHING && os.disk == D_DISKLESS) + rv = SS_IS_DISKLESS; + + if (ns.conn == C_WF_CONNECTION && os.conn < C_UNCONNECTED) + rv = SS_NO_NET_CONFIG; + + if (ns.disk == D_OUTDATED && os.disk < D_OUTDATED && os.disk != D_ATTACHING) + rv = SS_LOWER_THAN_OUTDATED; + + if (ns.conn == C_DISCONNECTING && os.conn == C_UNCONNECTED) + rv = SS_IN_TRANSIENT_STATE; + + /* While establishing a connection only allow cstate to change. + Delay/refuse role changes, detach attach etc... 
(they do not touch cstate) */ + if (test_bit(STATE_SENT, &connection->flags) && + !((ns.conn == C_WF_REPORT_PARAMS && os.conn == C_WF_CONNECTION) || + (ns.conn >= C_CONNECTED && os.conn == C_WF_REPORT_PARAMS))) + rv = SS_IN_TRANSIENT_STATE; + + if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED) + rv = SS_NEED_CONNECTION; + + if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && + ns.conn != os.conn && os.conn > C_CONNECTED) + rv = SS_RESYNC_RUNNING; + + if ((ns.conn == C_STARTING_SYNC_S || ns.conn == C_STARTING_SYNC_T) && + os.conn < C_CONNECTED) + rv = SS_NEED_CONNECTION; + + if ((ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE) + && os.conn < C_WF_REPORT_PARAMS) + rv = SS_NEED_CONNECTION; /* No NetworkFailure -> SyncTarget etc... */ + + if (ns.conn == C_DISCONNECTING && ns.pdsk == D_OUTDATED && + os.conn < C_CONNECTED && os.pdsk > D_OUTDATED) + rv = SS_OUTDATE_WO_CONN; + + return rv; +} + +static enum drbd_state_rv +is_valid_conn_transition(enum drbd_conns oc, enum drbd_conns nc) +{ + /* no change -> nothing to do, at least for the connection part */ + if (oc == nc) + return SS_NOTHING_TO_DO; + + /* disconnect of an unconfigured connection does not make sense */ + if (oc == C_STANDALONE && nc == C_DISCONNECTING) + return SS_ALREADY_STANDALONE; + + /* from C_STANDALONE, we start with C_UNCONNECTED */ + if (oc == C_STANDALONE && nc != C_UNCONNECTED) + return SS_NEED_CONNECTION; + + /* When establishing a connection we need to go through WF_REPORT_PARAMS! + Necessary to do the right thing upon invalidate-remote on a disconnected resource */ + if (oc < C_WF_REPORT_PARAMS && nc >= C_CONNECTED) + return SS_NEED_CONNECTION; + + /* After a network error only C_UNCONNECTED or C_DISCONNECTING may follow. */ + if (oc >= C_TIMEOUT && oc <= C_TEAR_DOWN && nc != C_UNCONNECTED && nc != C_DISCONNECTING) + return SS_IN_TRANSIENT_STATE; + + /* After C_DISCONNECTING only C_STANDALONE may follow */ + if (oc == C_DISCONNECTING && nc != C_STANDALONE) + return SS_IN_TRANSIENT_STATE; + + return SS_SUCCESS; +} + + +/** + * is_valid_transition() - Returns an SS_ error code if the state transition is not possible + * This limits hard state transitions. Hard state transitions are facts there are + * imposed on DRBD by the environment. E.g. disk broke or network broke down. + * But those hard state transitions are still not allowed to do everything. + * @ns: new state. + * @os: old state. + */ +static enum drbd_state_rv +is_valid_transition(union drbd_state os, union drbd_state ns) +{ + enum drbd_state_rv rv; + + rv = is_valid_conn_transition(os.conn, ns.conn); + + /* we cannot fail (again) if we already detached */ + if (ns.disk == D_FAILED && os.disk == D_DISKLESS) + rv = SS_IS_DISKLESS; + + return rv; +} + +static void print_sanitize_warnings(struct drbd_device *device, enum sanitize_state_warnings warn) +{ + static const char *msg_table[] = { + [NO_WARNING] = "", + [ABORTED_ONLINE_VERIFY] = "Online-verify aborted.", + [ABORTED_RESYNC] = "Resync aborted.", + [CONNECTION_LOST_NEGOTIATING] = "Connection lost while negotiating, no data!", + [IMPLICITLY_UPGRADED_DISK] = "Implicitly upgraded disk", + [IMPLICITLY_UPGRADED_PDSK] = "Implicitly upgraded pdsk", + }; + + if (warn != NO_WARNING) + drbd_warn(device, "%s\n", msg_table[warn]); +} + +/** + * sanitize_state() - Resolves implicitly necessary additional changes to a state transition + * @device: DRBD device. + * @os: old state. + * @ns: new state. 
+ * @warn_sync_abort: + * + * When we loose connection, we have to set the state of the peers disk (pdsk) + * to D_UNKNOWN. This rule and many more along those lines are in this function. + */ +static union drbd_state sanitize_state(struct drbd_device *device, union drbd_state os, + union drbd_state ns, enum sanitize_state_warnings *warn) +{ + enum drbd_fencing_p fp; + enum drbd_disk_state disk_min, disk_max, pdsk_min, pdsk_max; + + if (warn) + *warn = NO_WARNING; + + fp = FP_DONT_CARE; + if (get_ldev(device)) { + rcu_read_lock(); + fp = rcu_dereference(device->ldev->disk_conf)->fencing; + rcu_read_unlock(); + put_ldev(device); + } + + /* Implications from connection to peer and peer_isp */ + if (ns.conn < C_CONNECTED) { + ns.peer_isp = 0; + ns.peer = R_UNKNOWN; + if (ns.pdsk > D_UNKNOWN || ns.pdsk < D_INCONSISTENT) + ns.pdsk = D_UNKNOWN; + } + + /* Clear the aftr_isp when becoming unconfigured */ + if (ns.conn == C_STANDALONE && ns.disk == D_DISKLESS && ns.role == R_SECONDARY) + ns.aftr_isp = 0; + + /* An implication of the disk states onto the connection state */ + /* Abort resync if a disk fails/detaches */ + if (ns.conn > C_CONNECTED && (ns.disk <= D_FAILED || ns.pdsk <= D_FAILED)) { + if (warn) + *warn = ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T ? + ABORTED_ONLINE_VERIFY : ABORTED_RESYNC; + ns.conn = C_CONNECTED; + } + + /* Connection breaks down before we finished "Negotiating" */ + if (ns.conn < C_CONNECTED && ns.disk == D_NEGOTIATING && + get_ldev_if_state(device, D_NEGOTIATING)) { + if (device->ed_uuid == device->ldev->md.uuid[UI_CURRENT]) { + ns.disk = device->new_state_tmp.disk; + ns.pdsk = device->new_state_tmp.pdsk; + } else { + if (warn) + *warn = CONNECTION_LOST_NEGOTIATING; + ns.disk = D_DISKLESS; + ns.pdsk = D_UNKNOWN; + } + put_ldev(device); + } + + /* D_CONSISTENT and D_OUTDATED vanish when we get connected */ + if (ns.conn >= C_CONNECTED && ns.conn < C_AHEAD) { + if (ns.disk == D_CONSISTENT || ns.disk == D_OUTDATED) + ns.disk = D_UP_TO_DATE; + if (ns.pdsk == D_CONSISTENT || ns.pdsk == D_OUTDATED) + ns.pdsk = D_UP_TO_DATE; + } + + /* Implications of the connection stat on the disk states */ + disk_min = D_DISKLESS; + disk_max = D_UP_TO_DATE; + pdsk_min = D_INCONSISTENT; + pdsk_max = D_UNKNOWN; + switch ((enum drbd_conns)ns.conn) { + case C_WF_BITMAP_T: + case C_PAUSED_SYNC_T: + case C_STARTING_SYNC_T: + case C_WF_SYNC_UUID: + case C_BEHIND: + disk_min = D_INCONSISTENT; + disk_max = D_OUTDATED; + pdsk_min = D_UP_TO_DATE; + pdsk_max = D_UP_TO_DATE; + break; + case C_VERIFY_S: + case C_VERIFY_T: + disk_min = D_UP_TO_DATE; + disk_max = D_UP_TO_DATE; + pdsk_min = D_UP_TO_DATE; + pdsk_max = D_UP_TO_DATE; + break; + case C_CONNECTED: + disk_min = D_DISKLESS; + disk_max = D_UP_TO_DATE; + pdsk_min = D_DISKLESS; + pdsk_max = D_UP_TO_DATE; + break; + case C_WF_BITMAP_S: + case C_PAUSED_SYNC_S: + case C_STARTING_SYNC_S: + case C_AHEAD: + disk_min = D_UP_TO_DATE; + disk_max = D_UP_TO_DATE; + pdsk_min = D_INCONSISTENT; + pdsk_max = D_CONSISTENT; /* D_OUTDATED would be nice. 
But explicit outdate necessary*/ + break; + case C_SYNC_TARGET: + disk_min = D_INCONSISTENT; + disk_max = D_INCONSISTENT; + pdsk_min = D_UP_TO_DATE; + pdsk_max = D_UP_TO_DATE; + break; + case C_SYNC_SOURCE: + disk_min = D_UP_TO_DATE; + disk_max = D_UP_TO_DATE; + pdsk_min = D_INCONSISTENT; + pdsk_max = D_INCONSISTENT; + break; + case C_STANDALONE: + case C_DISCONNECTING: + case C_UNCONNECTED: + case C_TIMEOUT: + case C_BROKEN_PIPE: + case C_NETWORK_FAILURE: + case C_PROTOCOL_ERROR: + case C_TEAR_DOWN: + case C_WF_CONNECTION: + case C_WF_REPORT_PARAMS: + case C_MASK: + break; + } + if (ns.disk > disk_max) + ns.disk = disk_max; + + if (ns.disk < disk_min) { + if (warn) + *warn = IMPLICITLY_UPGRADED_DISK; + ns.disk = disk_min; + } + if (ns.pdsk > pdsk_max) + ns.pdsk = pdsk_max; + + if (ns.pdsk < pdsk_min) { + if (warn) + *warn = IMPLICITLY_UPGRADED_PDSK; + ns.pdsk = pdsk_min; + } + + if (fp == FP_STONITH && + (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk > D_OUTDATED) && + !(os.role == R_PRIMARY && os.conn < C_CONNECTED && os.pdsk > D_OUTDATED)) + ns.susp_fen = 1; /* Suspend IO while fence-peer handler runs (peer lost) */ + + if (device->resource->res_opts.on_no_data == OND_SUSPEND_IO && + (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE) && + !(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE)) + ns.susp_nod = 1; /* Suspend IO while no data available (no accessible data available) */ + + if (ns.aftr_isp || ns.peer_isp || ns.user_isp) { + if (ns.conn == C_SYNC_SOURCE) + ns.conn = C_PAUSED_SYNC_S; + if (ns.conn == C_SYNC_TARGET) + ns.conn = C_PAUSED_SYNC_T; + } else { + if (ns.conn == C_PAUSED_SYNC_S) + ns.conn = C_SYNC_SOURCE; + if (ns.conn == C_PAUSED_SYNC_T) + ns.conn = C_SYNC_TARGET; + } + + return ns; +} + +void drbd_resume_al(struct drbd_device *device) +{ + if (test_and_clear_bit(AL_SUSPENDED, &device->flags)) + drbd_info(device, "Resumed AL updates\n"); +} + +/* helper for __drbd_set_state */ +static void set_ov_position(struct drbd_device *device, enum drbd_conns cs) +{ + if (first_peer_device(device)->connection->agreed_pro_version < 90) + device->ov_start_sector = 0; + device->rs_total = drbd_bm_bits(device); + device->ov_position = 0; + if (cs == C_VERIFY_T) { + /* starting online verify from an arbitrary position + * does not fit well into the existing protocol. + * on C_VERIFY_T, we initialize ov_left and friends + * implicitly in receive_DataRequest once the + * first P_OV_REQUEST is received */ + device->ov_start_sector = ~(sector_t)0; + } else { + unsigned long bit = BM_SECT_TO_BIT(device->ov_start_sector); + if (bit >= device->rs_total) { + device->ov_start_sector = + BM_BIT_TO_SECT(device->rs_total - 1); + device->rs_total = 1; + } else + device->rs_total -= bit; + device->ov_position = device->ov_start_sector; + } + device->ov_left = device->rs_total; +} + +/** + * __drbd_set_state() - Set a new DRBD state + * @device: DRBD device. + * @ns: new state. + * @flags: Flags + * @done: Optional completion, that will get completed after the after_state_ch() finished + * + * Caller needs to hold req_lock, and global_state_lock. Do not call directly. + */ +enum drbd_state_rv +__drbd_set_state(struct drbd_device *device, union drbd_state ns, + enum chg_state_flags flags, struct completion *done) +{ + struct drbd_peer_device *peer_device = first_peer_device(device); + struct drbd_connection *connection = peer_device ? 
peer_device->connection : NULL; + union drbd_state os; + enum drbd_state_rv rv = SS_SUCCESS; + enum sanitize_state_warnings ssw; + struct after_state_chg_work *ascw; + + os = drbd_read_state(device); + + ns = sanitize_state(device, os, ns, &ssw); + if (ns.i == os.i) + return SS_NOTHING_TO_DO; + + rv = is_valid_transition(os, ns); + if (rv < SS_SUCCESS) + return rv; + + if (!(flags & CS_HARD)) { + /* pre-state-change checks ; only look at ns */ + /* See drbd_state_sw_errors in drbd_strings.c */ + + rv = is_valid_state(device, ns); + if (rv < SS_SUCCESS) { + /* If the old state was illegal as well, then let + this happen...*/ + + if (is_valid_state(device, os) == rv) + rv = is_valid_soft_transition(os, ns, connection); + } else + rv = is_valid_soft_transition(os, ns, connection); + } + + if (rv < SS_SUCCESS) { + if (flags & CS_VERBOSE) + print_st_err(device, os, ns, rv); + return rv; + } + + print_sanitize_warnings(device, ssw); + + drbd_pr_state_change(device, os, ns, flags); + + /* Display changes to the susp* flags that where caused by the call to + sanitize_state(). Only display it here if we where not called from + _conn_request_state() */ + if (!(flags & CS_DC_SUSP)) + conn_pr_state_change(connection, os, ns, + (flags & ~CS_DC_MASK) | CS_DC_SUSP); + + /* if we are going -> D_FAILED or D_DISKLESS, grab one extra reference + * on the ldev here, to be sure the transition -> D_DISKLESS resp. + * drbd_ldev_destroy() won't happen before our corresponding + * after_state_ch works run, where we put_ldev again. */ + if ((os.disk != D_FAILED && ns.disk == D_FAILED) || + (os.disk != D_DISKLESS && ns.disk == D_DISKLESS)) + atomic_inc(&device->local_cnt); + + if (!is_sync_state(os.conn) && is_sync_state(ns.conn)) + clear_bit(RS_DONE, &device->flags); + + /* changes to local_cnt and device flags should be visible before + * changes to state, which again should be visible before anything else + * depending on that change happens. */ + smp_wmb(); + device->state.i = ns.i; + device->resource->susp = ns.susp; + device->resource->susp_nod = ns.susp_nod; + device->resource->susp_fen = ns.susp_fen; + smp_wmb(); + + /* put replicated vs not-replicated requests in seperate epochs */ + if (drbd_should_do_remote((union drbd_dev_state)os.i) != + drbd_should_do_remote((union drbd_dev_state)ns.i)) + start_new_tl_epoch(connection); + + if (os.disk == D_ATTACHING && ns.disk >= D_NEGOTIATING) + drbd_print_uuids(device, "attached to UUIDs"); + + /* Wake up role changes, that were delayed because of connection establishing */ + if (os.conn == C_WF_REPORT_PARAMS && ns.conn != C_WF_REPORT_PARAMS && + no_peer_wf_report_params(connection)) { + clear_bit(STATE_SENT, &connection->flags); + wake_up_all_devices(connection); + } + + wake_up(&device->misc_wait); + wake_up(&device->state_wait); + wake_up(&connection->ping_wait); + + /* Aborted verify run, or we reached the stop sector. + * Log the last position, unless end-of-device. 
*/ + if ((os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) && + ns.conn <= C_CONNECTED) { + device->ov_start_sector = + BM_BIT_TO_SECT(drbd_bm_bits(device) - device->ov_left); + if (device->ov_left) + drbd_info(device, "Online Verify reached sector %llu\n", + (unsigned long long)device->ov_start_sector); + } + + if ((os.conn == C_PAUSED_SYNC_T || os.conn == C_PAUSED_SYNC_S) && + (ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE)) { + drbd_info(device, "Syncer continues.\n"); + device->rs_paused += (long)jiffies + -(long)device->rs_mark_time[device->rs_last_mark]; + if (ns.conn == C_SYNC_TARGET) + mod_timer(&device->resync_timer, jiffies); + } + + if ((os.conn == C_SYNC_TARGET || os.conn == C_SYNC_SOURCE) && + (ns.conn == C_PAUSED_SYNC_T || ns.conn == C_PAUSED_SYNC_S)) { + drbd_info(device, "Resync suspended\n"); + device->rs_mark_time[device->rs_last_mark] = jiffies; + } + + if (os.conn == C_CONNECTED && + (ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T)) { + unsigned long now = jiffies; + int i; + + set_ov_position(device, ns.conn); + device->rs_start = now; + device->rs_last_sect_ev = 0; + device->ov_last_oos_size = 0; + device->ov_last_oos_start = 0; + + for (i = 0; i < DRBD_SYNC_MARKS; i++) { + device->rs_mark_left[i] = device->ov_left; + device->rs_mark_time[i] = now; + } + + drbd_rs_controller_reset(device); + + if (ns.conn == C_VERIFY_S) { + drbd_info(device, "Starting Online Verify from sector %llu\n", + (unsigned long long)device->ov_position); + mod_timer(&device->resync_timer, jiffies); + } + } + + if (get_ldev(device)) { + u32 mdf = device->ldev->md.flags & ~(MDF_CONSISTENT|MDF_PRIMARY_IND| + MDF_CONNECTED_IND|MDF_WAS_UP_TO_DATE| + MDF_PEER_OUT_DATED|MDF_CRASHED_PRIMARY); + + mdf &= ~MDF_AL_CLEAN; + if (test_bit(CRASHED_PRIMARY, &device->flags)) + mdf |= MDF_CRASHED_PRIMARY; + if (device->state.role == R_PRIMARY || + (device->state.pdsk < D_INCONSISTENT && device->state.peer == R_PRIMARY)) + mdf |= MDF_PRIMARY_IND; + if (device->state.conn > C_WF_REPORT_PARAMS) + mdf |= MDF_CONNECTED_IND; + if (device->state.disk > D_INCONSISTENT) + mdf |= MDF_CONSISTENT; + if (device->state.disk > D_OUTDATED) + mdf |= MDF_WAS_UP_TO_DATE; + if (device->state.pdsk <= D_OUTDATED && device->state.pdsk >= D_INCONSISTENT) + mdf |= MDF_PEER_OUT_DATED; + if (mdf != device->ldev->md.flags) { + device->ldev->md.flags = mdf; + drbd_md_mark_dirty(device); + } + if (os.disk < D_CONSISTENT && ns.disk >= D_CONSISTENT) + drbd_set_ed_uuid(device, device->ldev->md.uuid[UI_CURRENT]); + put_ldev(device); + } + + /* Peer was forced D_UP_TO_DATE & R_PRIMARY, consider to resync */ + if (os.disk == D_INCONSISTENT && os.pdsk == D_INCONSISTENT && + os.peer == R_SECONDARY && ns.peer == R_PRIMARY) + set_bit(CONSIDER_RESYNC, &device->flags); + + /* Receiver should clean up itself */ + if (os.conn != C_DISCONNECTING && ns.conn == C_DISCONNECTING) + drbd_thread_stop_nowait(&connection->receiver); + + /* Now the receiver finished cleaning up itself, it should die */ + if (os.conn != C_STANDALONE && ns.conn == C_STANDALONE) + drbd_thread_stop_nowait(&connection->receiver); + + /* Upon network failure, we need to restart the receiver. 
*/ + if (os.conn > C_WF_CONNECTION && + ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT) + drbd_thread_restart_nowait(&connection->receiver); + + /* Resume AL writing if we get a connection */ + if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) { + drbd_resume_al(device); + connection->connect_cnt++; + } + + /* remember last attach time so request_timer_fn() won't + * kill newly established sessions while we are still trying to thaw + * previously frozen IO */ + if ((os.disk == D_ATTACHING || os.disk == D_NEGOTIATING) && + ns.disk > D_NEGOTIATING) + device->last_reattach_jif = jiffies; + + ascw = kmalloc(sizeof(*ascw), GFP_ATOMIC); + if (ascw) { + ascw->os = os; + ascw->ns = ns; + ascw->flags = flags; + ascw->w.cb = w_after_state_ch; + ascw->device = device; + ascw->done = done; + drbd_queue_work(&connection->sender_work, + &ascw->w); + } else { + drbd_err(device, "Could not kmalloc an ascw\n"); + } + + return rv; +} + +static int w_after_state_ch(struct drbd_work *w, int unused) +{ + struct after_state_chg_work *ascw = + container_of(w, struct after_state_chg_work, w); + struct drbd_device *device = ascw->device; + + after_state_ch(device, ascw->os, ascw->ns, ascw->flags); + if (ascw->flags & CS_WAIT_COMPLETE) + complete(ascw->done); + kfree(ascw); + + return 0; +} + +static void abw_start_sync(struct drbd_device *device, int rv) +{ + if (rv) { + drbd_err(device, "Writing the bitmap failed not starting resync.\n"); + _drbd_request_state(device, NS(conn, C_CONNECTED), CS_VERBOSE); + return; + } + + switch (device->state.conn) { + case C_STARTING_SYNC_T: + _drbd_request_state(device, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE); + break; + case C_STARTING_SYNC_S: + drbd_start_resync(device, C_SYNC_SOURCE); + break; + } +} + +int drbd_bitmap_io_from_worker(struct drbd_device *device, + int (*io_fn)(struct drbd_device *), + char *why, enum bm_flag flags) +{ + int rv; + + D_ASSERT(device, current == first_peer_device(device)->connection->worker.task); + + /* open coded non-blocking drbd_suspend_io(device); */ + set_bit(SUSPEND_IO, &device->flags); + + drbd_bm_lock(device, why, flags); + rv = io_fn(device); + drbd_bm_unlock(device); + + drbd_resume_io(device); + + return rv; +} + +/** + * after_state_ch() - Perform after state change actions that may sleep + * @device: DRBD device. + * @os: old state. + * @ns: new state. + * @flags: Flags + */ +static void after_state_ch(struct drbd_device *device, union drbd_state os, + union drbd_state ns, enum chg_state_flags flags) +{ + struct drbd_resource *resource = device->resource; + struct drbd_peer_device *peer_device = first_peer_device(device); + struct drbd_connection *connection = peer_device ? peer_device->connection : NULL; + struct sib_info sib; + + sib.sib_reason = SIB_STATE_CHANGE; + sib.os = os; + sib.ns = ns; + + if ((os.disk != D_UP_TO_DATE || os.pdsk != D_UP_TO_DATE) + && (ns.disk == D_UP_TO_DATE && ns.pdsk == D_UP_TO_DATE)) { + clear_bit(CRASHED_PRIMARY, &device->flags); + if (device->p_uuid) + device->p_uuid[UI_FLAGS] &= ~((u64)2); + } + + /* Inform userspace about the change... */ + drbd_bcast_event(device, &sib); + + if (!(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE) && + (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE)) + drbd_khelper(device, "pri-on-incon-degr"); + + /* Here we have the actions that are performed after a + state change. 
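For a caller that must not proceed before this (sleeping) after-state work has run, the @done completion is the hand-off point: w_after_state_ch() above completes it once after_state_ch() has finished, but only when CS_WAIT_COMPLETE is set. Below is a minimal sketch of the contract implied by the kernel-doc of __drbd_set_state(); the request-path wrapper that normally drives this is not part of this hunk, and the global_state_lock handling the kernel-doc also asks for is omitted here for brevity, so treat this as the shape of the calling convention rather than a call site from the patch:

	struct completion done;
	union drbd_state ns;
	enum drbd_state_rv rv;

	init_completion(&done);

	spin_lock_irq(&device->resource->req_lock);
	ns = drbd_read_state(device);
	ns.disk = D_OUTDATED;				/* example target state */
	rv = __drbd_set_state(device, ns, CS_VERBOSE | CS_WAIT_COMPLETE, &done);
	spin_unlock_irq(&device->resource->req_lock);

	if (rv == SS_SUCCESS)
		wait_for_completion(&done);		/* after_state_ch() has run */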
This function might sleep */ + + if (ns.susp_nod) { + enum drbd_req_event what = NOTHING; + + spin_lock_irq(&device->resource->req_lock); + if (os.conn < C_CONNECTED && conn_lowest_conn(connection) >= C_CONNECTED) + what = RESEND; + + if ((os.disk == D_ATTACHING || os.disk == D_NEGOTIATING) && + conn_lowest_disk(connection) > D_NEGOTIATING) + what = RESTART_FROZEN_DISK_IO; + + if (resource->susp_nod && what != NOTHING) { + _tl_restart(connection, what); + _conn_request_state(connection, + (union drbd_state) { { .susp_nod = 1 } }, + (union drbd_state) { { .susp_nod = 0 } }, + CS_VERBOSE); + } + spin_unlock_irq(&device->resource->req_lock); + } + + if (ns.susp_fen) { + spin_lock_irq(&device->resource->req_lock); + if (resource->susp_fen && conn_lowest_conn(connection) >= C_CONNECTED) { + /* case2: The connection was established again: */ + struct drbd_peer_device *peer_device; + int vnr; + + rcu_read_lock(); + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) + clear_bit(NEW_CUR_UUID, &peer_device->device->flags); + rcu_read_unlock(); + _tl_restart(connection, RESEND); + _conn_request_state(connection, + (union drbd_state) { { .susp_fen = 1 } }, + (union drbd_state) { { .susp_fen = 0 } }, + CS_VERBOSE); + } + spin_unlock_irq(&device->resource->req_lock); + } + + /* Became sync source. With protocol >= 96, we still need to send out + * the sync uuid now. Need to do that before any drbd_send_state, or + * the other side may go "paused sync" before receiving the sync uuids, + * which is unexpected. */ + if ((os.conn != C_SYNC_SOURCE && os.conn != C_PAUSED_SYNC_S) && + (ns.conn == C_SYNC_SOURCE || ns.conn == C_PAUSED_SYNC_S) && + connection->agreed_pro_version >= 96 && get_ldev(device)) { + drbd_gen_and_send_sync_uuid(peer_device); + put_ldev(device); + } + + /* Do not change the order of the if above and the two below... */ + if (os.pdsk == D_DISKLESS && + ns.pdsk > D_DISKLESS && ns.pdsk != D_UNKNOWN) { /* attach on the peer */ + /* we probably will start a resync soon. + * make sure those things are properly reset. */ + device->rs_total = 0; + device->rs_failed = 0; + atomic_set(&device->rs_pending_cnt, 0); + drbd_rs_cancel_all(device); + + drbd_send_uuids(peer_device); + drbd_send_state(peer_device, ns); + } + /* No point in queuing send_bitmap if we don't have a connection + * anymore, so check also the _current_ state, not only the new state + * at the time this work was queued. 
*/ + if (os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S && + device->state.conn == C_WF_BITMAP_S) + drbd_queue_bitmap_io(device, &drbd_send_bitmap, NULL, + "send_bitmap (WFBitMapS)", + BM_LOCKED_TEST_ALLOWED); + + /* Lost contact to peer's copy of the data */ + if ((os.pdsk >= D_INCONSISTENT && + os.pdsk != D_UNKNOWN && + os.pdsk != D_OUTDATED) + && (ns.pdsk < D_INCONSISTENT || + ns.pdsk == D_UNKNOWN || + ns.pdsk == D_OUTDATED)) { + if (get_ldev(device)) { + if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) && + device->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) { + if (drbd_suspended(device)) { + set_bit(NEW_CUR_UUID, &device->flags); + } else { + drbd_uuid_new_current(device); + drbd_send_uuids(peer_device); + } + } + put_ldev(device); + } + } + + if (ns.pdsk < D_INCONSISTENT && get_ldev(device)) { + if (os.peer == R_SECONDARY && ns.peer == R_PRIMARY && + device->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) { + drbd_uuid_new_current(device); + drbd_send_uuids(peer_device); + } + /* D_DISKLESS Peer becomes secondary */ + if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY) + /* We may still be Primary ourselves. + * No harm done if the bitmap still changes, + * redirtied pages will follow later. */ + drbd_bitmap_io_from_worker(device, &drbd_bm_write, + "demote diskless peer", BM_LOCKED_SET_ALLOWED); + put_ldev(device); + } + + /* Write out all changed bits on demote. + * Though, no need to da that just yet + * if there is a resync going on still */ + if (os.role == R_PRIMARY && ns.role == R_SECONDARY && + device->state.conn <= C_CONNECTED && get_ldev(device)) { + /* No changes to the bitmap expected this time, so assert that, + * even though no harm was done if it did change. */ + drbd_bitmap_io_from_worker(device, &drbd_bm_write, + "demote", BM_LOCKED_TEST_ALLOWED); + put_ldev(device); + } + + /* Last part of the attaching process ... */ + if (ns.conn >= C_CONNECTED && + os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) { + drbd_send_sizes(peer_device, 0, 0); /* to start sync... */ + drbd_send_uuids(peer_device); + drbd_send_state(peer_device, ns); + } + + /* We want to pause/continue resync, tell peer. */ + if (ns.conn >= C_CONNECTED && + ((os.aftr_isp != ns.aftr_isp) || + (os.user_isp != ns.user_isp))) + drbd_send_state(peer_device, ns); + + /* In case one of the isp bits got set, suspend other devices. */ + if ((!os.aftr_isp && !os.peer_isp && !os.user_isp) && + (ns.aftr_isp || ns.peer_isp || ns.user_isp)) + suspend_other_sg(device); + + /* Make sure the peer gets informed about eventual state + changes (ISP bits) while we were in WFReportParams. */ + if (os.conn == C_WF_REPORT_PARAMS && ns.conn >= C_CONNECTED) + drbd_send_state(peer_device, ns); + + if (os.conn != C_AHEAD && ns.conn == C_AHEAD) + drbd_send_state(peer_device, ns); + + /* We are in the progress to start a full sync... */ + if ((os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) || + (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S)) + /* no other bitmap changes expected during this phase */ + drbd_queue_bitmap_io(device, + &drbd_bmio_set_n_write, &abw_start_sync, + "set_n_write from StartingSync", BM_LOCKED_TEST_ALLOWED); + + /* first half of local IO error, failure to attach, + * or administrative detach */ + if (os.disk != D_FAILED && ns.disk == D_FAILED) { + enum drbd_io_error_p eh = EP_PASS_ON; + int was_io_error = 0; + /* corresponding get_ldev was in __drbd_set_state, to serialize + * our cleanup here with the transition to D_DISKLESS. 
+ * But is is still not save to dreference ldev here, since + * we might come from an failed Attach before ldev was set. */ + if (device->ldev) { + rcu_read_lock(); + eh = rcu_dereference(device->ldev->disk_conf)->on_io_error; + rcu_read_unlock(); + + was_io_error = test_and_clear_bit(WAS_IO_ERROR, &device->flags); + + if (was_io_error && eh == EP_CALL_HELPER) + drbd_khelper(device, "local-io-error"); + + /* Immediately allow completion of all application IO, + * that waits for completion from the local disk, + * if this was a force-detach due to disk_timeout + * or administrator request (drbdsetup detach --force). + * Do NOT abort otherwise. + * Aborting local requests may cause serious problems, + * if requests are completed to upper layers already, + * and then later the already submitted local bio completes. + * This can cause DMA into former bio pages that meanwhile + * have been re-used for other things. + * So aborting local requests may cause crashes, + * or even worse, silent data corruption. + */ + if (test_and_clear_bit(FORCE_DETACH, &device->flags)) + tl_abort_disk_io(device); + + /* current state still has to be D_FAILED, + * there is only one way out: to D_DISKLESS, + * and that may only happen after our put_ldev below. */ + if (device->state.disk != D_FAILED) + drbd_err(device, + "ASSERT FAILED: disk is %s during detach\n", + drbd_disk_str(device->state.disk)); + + if (ns.conn >= C_CONNECTED) + drbd_send_state(peer_device, ns); + + drbd_rs_cancel_all(device); + + /* In case we want to get something to stable storage still, + * this may be the last chance. + * Following put_ldev may transition to D_DISKLESS. */ + drbd_md_sync(device); + } + put_ldev(device); + } + + /* second half of local IO error, failure to attach, + * or administrative detach, + * after local_cnt references have reached zero again */ + if (os.disk != D_DISKLESS && ns.disk == D_DISKLESS) { + /* We must still be diskless, + * re-attach has to be serialized with this! */ + if (device->state.disk != D_DISKLESS) + drbd_err(device, + "ASSERT FAILED: disk is %s while going diskless\n", + drbd_disk_str(device->state.disk)); + + if (ns.conn >= C_CONNECTED) + drbd_send_state(peer_device, ns); + /* corresponding get_ldev in __drbd_set_state + * this may finally trigger drbd_ldev_destroy. */ + put_ldev(device); + } + + /* Notify peer that I had a local IO error, and did not detached.. */ + if (os.disk == D_UP_TO_DATE && ns.disk == D_INCONSISTENT && ns.conn >= C_CONNECTED) + drbd_send_state(peer_device, ns); + + /* Disks got bigger while they were detached */ + if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING && + test_and_clear_bit(RESYNC_AFTER_NEG, &device->flags)) { + if (ns.conn == C_CONNECTED) + resync_after_online_grow(device); + } + + /* A resync finished or aborted, wake paused devices... */ + if ((os.conn > C_CONNECTED && ns.conn <= C_CONNECTED) || + (os.peer_isp && !ns.peer_isp) || + (os.user_isp && !ns.user_isp)) + resume_next_sg(device); + + /* sync target done with resync. Explicitly notify peer, even though + * it should (at least for non-empty resyncs) already know itself. */ + if (os.disk < D_UP_TO_DATE && os.conn >= C_SYNC_SOURCE && ns.conn == C_CONNECTED) + drbd_send_state(peer_device, ns); + + /* Verify finished, or reached stop sector. Peer did not know about + * the stop sector, and we may even have changed the stop sector during + * verify to interrupt/stop early. Send the new state. 
*/ + if (os.conn == C_VERIFY_S && ns.conn == C_CONNECTED + && verify_can_do_stop_sector(device)) + drbd_send_state(peer_device, ns); + + /* This triggers bitmap writeout of potentially still unwritten pages + * if the resync finished cleanly, or aborted because of peer disk + * failure, or because of connection loss. + * For resync aborted because of local disk failure, we cannot do + * any bitmap writeout anymore. + * No harm done if some bits change during this phase. + */ + if (os.conn > C_CONNECTED && ns.conn <= C_CONNECTED && get_ldev(device)) { + drbd_queue_bitmap_io(device, &drbd_bm_write_copy_pages, NULL, + "write from resync_finished", BM_LOCKED_CHANGE_ALLOWED); + put_ldev(device); + } + + if (ns.disk == D_DISKLESS && + ns.conn == C_STANDALONE && + ns.role == R_SECONDARY) { + if (os.aftr_isp != ns.aftr_isp) + resume_next_sg(device); + } + + drbd_md_sync(device); +} + +struct after_conn_state_chg_work { + struct drbd_work w; + enum drbd_conns oc; + union drbd_state ns_min; + union drbd_state ns_max; /* new, max state, over all devices */ + enum chg_state_flags flags; + struct drbd_connection *connection; +}; + +static int w_after_conn_state_ch(struct drbd_work *w, int unused) +{ + struct after_conn_state_chg_work *acscw = + container_of(w, struct after_conn_state_chg_work, w); + struct drbd_connection *connection = acscw->connection; + enum drbd_conns oc = acscw->oc; + union drbd_state ns_max = acscw->ns_max; + struct drbd_peer_device *peer_device; + int vnr; + + kfree(acscw); + + /* Upon network configuration, we need to start the receiver */ + if (oc == C_STANDALONE && ns_max.conn == C_UNCONNECTED) + drbd_thread_start(&connection->receiver); + + if (oc == C_DISCONNECTING && ns_max.conn == C_STANDALONE) { + struct net_conf *old_conf; + + mutex_lock(&connection->resource->conf_update); + old_conf = connection->net_conf; + connection->my_addr_len = 0; + connection->peer_addr_len = 0; + RCU_INIT_POINTER(connection->net_conf, NULL); + conn_free_crypto(connection); + mutex_unlock(&connection->resource->conf_update); + + synchronize_rcu(); + kfree(old_conf); + } + + if (ns_max.susp_fen) { + /* case1: The outdate peer handler is successful: */ + if (ns_max.pdsk <= D_OUTDATED) { + rcu_read_lock(); + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { + struct drbd_device *device = peer_device->device; + if (test_bit(NEW_CUR_UUID, &device->flags)) { + drbd_uuid_new_current(device); + clear_bit(NEW_CUR_UUID, &device->flags); + } + } + rcu_read_unlock(); + spin_lock_irq(&connection->resource->req_lock); + _tl_restart(connection, CONNECTION_LOST_WHILE_PENDING); + _conn_request_state(connection, + (union drbd_state) { { .susp_fen = 1 } }, + (union drbd_state) { { .susp_fen = 0 } }, + CS_VERBOSE); + spin_unlock_irq(&connection->resource->req_lock); + } + } + kref_put(&connection->kref, drbd_destroy_connection); + + conn_md_sync(connection); + + return 0; +} + +static void conn_old_common_state(struct drbd_connection *connection, union drbd_state *pcs, enum chg_state_flags *pf) +{ + enum chg_state_flags flags = ~0; + struct drbd_peer_device *peer_device; + int vnr, first_vol = 1; + union drbd_dev_state os, cs = { + { .role = R_SECONDARY, + .peer = R_UNKNOWN, + .conn = connection->cstate, + .disk = D_DISKLESS, + .pdsk = D_UNKNOWN, + } }; + + rcu_read_lock(); + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { + struct drbd_device *device = peer_device->device; + os = device->state; + + if (first_vol) { + cs = os; + first_vol = 0; + continue; + } + + if (cs.role != 
os.role) + flags &= ~CS_DC_ROLE; + + if (cs.peer != os.peer) + flags &= ~CS_DC_PEER; + + if (cs.conn != os.conn) + flags &= ~CS_DC_CONN; + + if (cs.disk != os.disk) + flags &= ~CS_DC_DISK; + + if (cs.pdsk != os.pdsk) + flags &= ~CS_DC_PDSK; + } + rcu_read_unlock(); + + *pf |= CS_DC_MASK; + *pf &= flags; + (*pcs).i = cs.i; +} + +static enum drbd_state_rv +conn_is_valid_transition(struct drbd_connection *connection, union drbd_state mask, union drbd_state val, + enum chg_state_flags flags) +{ + enum drbd_state_rv rv = SS_SUCCESS; + union drbd_state ns, os; + struct drbd_peer_device *peer_device; + int vnr; + + rcu_read_lock(); + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { + struct drbd_device *device = peer_device->device; + os = drbd_read_state(device); + ns = sanitize_state(device, os, apply_mask_val(os, mask, val), NULL); + + if (flags & CS_IGN_OUTD_FAIL && ns.disk == D_OUTDATED && os.disk < D_OUTDATED) + ns.disk = os.disk; + + if (ns.i == os.i) + continue; + + rv = is_valid_transition(os, ns); + + if (rv >= SS_SUCCESS && !(flags & CS_HARD)) { + rv = is_valid_state(device, ns); + if (rv < SS_SUCCESS) { + if (is_valid_state(device, os) == rv) + rv = is_valid_soft_transition(os, ns, connection); + } else + rv = is_valid_soft_transition(os, ns, connection); + } + + if (rv < SS_SUCCESS) { + if (flags & CS_VERBOSE) + print_st_err(device, os, ns, rv); + break; + } + } + rcu_read_unlock(); + + return rv; +} + +static void +conn_set_state(struct drbd_connection *connection, union drbd_state mask, union drbd_state val, + union drbd_state *pns_min, union drbd_state *pns_max, enum chg_state_flags flags) +{ + union drbd_state ns, os, ns_max = { }; + union drbd_state ns_min = { + { .role = R_MASK, + .peer = R_MASK, + .conn = val.conn, + .disk = D_MASK, + .pdsk = D_MASK + } }; + struct drbd_peer_device *peer_device; + enum drbd_state_rv rv; + int vnr, number_of_volumes = 0; + + if (mask.conn == C_MASK) { + /* remember last connect time so request_timer_fn() won't + * kill newly established sessions while we are still trying to thaw + * previously frozen IO */ + if (connection->cstate != C_WF_REPORT_PARAMS && val.conn == C_WF_REPORT_PARAMS) + connection->last_reconnect_jif = jiffies; + + connection->cstate = val.conn; + } + + rcu_read_lock(); + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { + struct drbd_device *device = peer_device->device; + number_of_volumes++; + os = drbd_read_state(device); + ns = apply_mask_val(os, mask, val); + ns = sanitize_state(device, os, ns, NULL); + + if (flags & CS_IGN_OUTD_FAIL && ns.disk == D_OUTDATED && os.disk < D_OUTDATED) + ns.disk = os.disk; + + rv = __drbd_set_state(device, ns, flags, NULL); + if (rv < SS_SUCCESS) + BUG(); + + ns.i = device->state.i; + ns_max.role = max_role(ns.role, ns_max.role); + ns_max.peer = max_role(ns.peer, ns_max.peer); + ns_max.conn = max_t(enum drbd_conns, ns.conn, ns_max.conn); + ns_max.disk = max_t(enum drbd_disk_state, ns.disk, ns_max.disk); + ns_max.pdsk = max_t(enum drbd_disk_state, ns.pdsk, ns_max.pdsk); + + ns_min.role = min_role(ns.role, ns_min.role); + ns_min.peer = min_role(ns.peer, ns_min.peer); + ns_min.conn = min_t(enum drbd_conns, ns.conn, ns_min.conn); + ns_min.disk = min_t(enum drbd_disk_state, ns.disk, ns_min.disk); + ns_min.pdsk = min_t(enum drbd_disk_state, ns.pdsk, ns_min.pdsk); + } + rcu_read_unlock(); + + if (number_of_volumes == 0) { + ns_min = ns_max = (union drbd_state) { { + .role = R_SECONDARY, + .peer = R_UNKNOWN, + .conn = val.conn, + .disk = D_DISKLESS, + .pdsk = 
D_UNKNOWN + } }; + } + + ns_min.susp = ns_max.susp = connection->resource->susp; + ns_min.susp_nod = ns_max.susp_nod = connection->resource->susp_nod; + ns_min.susp_fen = ns_max.susp_fen = connection->resource->susp_fen; + + *pns_min = ns_min; + *pns_max = ns_max; +} + +static enum drbd_state_rv +_conn_rq_cond(struct drbd_connection *connection, union drbd_state mask, union drbd_state val) +{ + enum drbd_state_rv err, rv = SS_UNKNOWN_ERROR; /* continue waiting */; + + if (test_and_clear_bit(CONN_WD_ST_CHG_OKAY, &connection->flags)) + rv = SS_CW_SUCCESS; + + if (test_and_clear_bit(CONN_WD_ST_CHG_FAIL, &connection->flags)) + rv = SS_CW_FAILED_BY_PEER; + + err = conn_is_valid_transition(connection, mask, val, 0); + if (err == SS_SUCCESS && connection->cstate == C_WF_REPORT_PARAMS) + return rv; + + return err; +} + +enum drbd_state_rv +_conn_request_state(struct drbd_connection *connection, union drbd_state mask, union drbd_state val, + enum chg_state_flags flags) +{ + enum drbd_state_rv rv = SS_SUCCESS; + struct after_conn_state_chg_work *acscw; + enum drbd_conns oc = connection->cstate; + union drbd_state ns_max, ns_min, os; + bool have_mutex = false; + + if (mask.conn) { + rv = is_valid_conn_transition(oc, val.conn); + if (rv < SS_SUCCESS) + goto abort; + } + + rv = conn_is_valid_transition(connection, mask, val, flags); + if (rv < SS_SUCCESS) + goto abort; + + if (oc == C_WF_REPORT_PARAMS && val.conn == C_DISCONNECTING && + !(flags & (CS_LOCAL_ONLY | CS_HARD))) { + + /* This will be a cluster-wide state change. + * Need to give up the spinlock, grab the mutex, + * then send the state change request, ... */ + spin_unlock_irq(&connection->resource->req_lock); + mutex_lock(&connection->cstate_mutex); + have_mutex = true; + + set_bit(CONN_WD_ST_CHG_REQ, &connection->flags); + if (conn_send_state_req(connection, mask, val)) { + /* sending failed. */ + clear_bit(CONN_WD_ST_CHG_REQ, &connection->flags); + rv = SS_CW_FAILED_BY_PEER; + /* need to re-aquire the spin lock, though */ + goto abort_unlocked; + } + + if (val.conn == C_DISCONNECTING) + set_bit(DISCONNECT_SENT, &connection->flags); + + /* ... and re-aquire the spinlock. + * If _conn_rq_cond() returned >= SS_SUCCESS, we must call + * conn_set_state() within the same spinlock. */ + spin_lock_irq(&connection->resource->req_lock); + wait_event_lock_irq(connection->ping_wait, + (rv = _conn_rq_cond(connection, mask, val)), + connection->resource->req_lock); + clear_bit(CONN_WD_ST_CHG_REQ, &connection->flags); + if (rv < SS_SUCCESS) + goto abort; + } + + conn_old_common_state(connection, &os, &flags); + flags |= CS_DC_SUSP; + conn_set_state(connection, mask, val, &ns_min, &ns_max, flags); + conn_pr_state_change(connection, os, ns_max, flags); + + acscw = kmalloc(sizeof(*acscw), GFP_ATOMIC); + if (acscw) { + acscw->oc = os.conn; + acscw->ns_min = ns_min; + acscw->ns_max = ns_max; + acscw->flags = flags; + acscw->w.cb = w_after_conn_state_ch; + kref_get(&connection->kref); + acscw->connection = connection; + drbd_queue_work(&connection->sender_work, &acscw->w); + } else { + drbd_err(connection, "Could not kmalloc an acscw\n"); + } + + abort: + if (have_mutex) { + /* mutex_unlock() "... 
must not be used in interrupt context.", + * so give up the spinlock, then re-aquire it */ + spin_unlock_irq(&connection->resource->req_lock); + abort_unlocked: + mutex_unlock(&connection->cstate_mutex); + spin_lock_irq(&connection->resource->req_lock); + } + if (rv < SS_SUCCESS && flags & CS_VERBOSE) { + drbd_err(connection, "State change failed: %s\n", drbd_set_st_err_str(rv)); + drbd_err(connection, " mask = 0x%x val = 0x%x\n", mask.i, val.i); + drbd_err(connection, " old_conn:%s wanted_conn:%s\n", drbd_conn_str(oc), drbd_conn_str(val.conn)); + } + return rv; +} + +enum drbd_state_rv +conn_request_state(struct drbd_connection *connection, union drbd_state mask, union drbd_state val, + enum chg_state_flags flags) +{ + enum drbd_state_rv rv; + + spin_lock_irq(&connection->resource->req_lock); + rv = _conn_request_state(connection, mask, val, flags); + spin_unlock_irq(&connection->resource->req_lock); + + return rv; +} diff --git a/drivers/block/drbd/drbd_state.h b/drivers/block/drbd/drbd_state.h new file mode 100644 index 000000000..7f53c4082 --- /dev/null +++ b/drivers/block/drbd/drbd_state.h @@ -0,0 +1,166 @@ +#ifndef DRBD_STATE_H +#define DRBD_STATE_H + +struct drbd_device; +struct drbd_connection; + +/** + * DOC: DRBD State macros + * + * These macros are used to express state changes in easily readable form. + * + * The NS macros expand to a mask and a value, that can be bit ored onto the + * current state as soon as the spinlock (req_lock) was taken. + * + * The _NS macros are used for state functions that get called with the + * spinlock. These macros expand directly to the new state value. + * + * Besides the basic forms NS() and _NS() additional _?NS[23] are defined + * to express state changes that affect more than one aspect of the state. + * + * E.g. NS2(conn, C_CONNECTED, peer, R_SECONDARY) + * Means that the network connection was established and that the peer + * is in secondary role. 
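 *
 * For illustration only (these are not specific call sites from this patch),
 * the state change entry points declared below take such a (mask, val) pair
 * directly, e.g.
 *
 *	drbd_request_state(device, NS(role, R_PRIMARY));
 *	drbd_force_state(device, NS(disk, D_FAILED));
 *	conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_ORDERED);
 *
 * while the _NS() forms build the complete new state for functions like
 * __drbd_set_state() that already run with the req_lock held.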
+ */ +#define role_MASK R_MASK +#define peer_MASK R_MASK +#define disk_MASK D_MASK +#define pdsk_MASK D_MASK +#define conn_MASK C_MASK +#define susp_MASK 1 +#define user_isp_MASK 1 +#define aftr_isp_MASK 1 +#define susp_nod_MASK 1 +#define susp_fen_MASK 1 + +#define NS(T, S) \ + ({ union drbd_state mask; mask.i = 0; mask.T = T##_MASK; mask; }), \ + ({ union drbd_state val; val.i = 0; val.T = (S); val; }) +#define NS2(T1, S1, T2, S2) \ + ({ union drbd_state mask; mask.i = 0; mask.T1 = T1##_MASK; \ + mask.T2 = T2##_MASK; mask; }), \ + ({ union drbd_state val; val.i = 0; val.T1 = (S1); \ + val.T2 = (S2); val; }) +#define NS3(T1, S1, T2, S2, T3, S3) \ + ({ union drbd_state mask; mask.i = 0; mask.T1 = T1##_MASK; \ + mask.T2 = T2##_MASK; mask.T3 = T3##_MASK; mask; }), \ + ({ union drbd_state val; val.i = 0; val.T1 = (S1); \ + val.T2 = (S2); val.T3 = (S3); val; }) + +#define _NS(D, T, S) \ + D, ({ union drbd_state __ns; __ns = drbd_read_state(D); __ns.T = (S); __ns; }) +#define _NS2(D, T1, S1, T2, S2) \ + D, ({ union drbd_state __ns; __ns = drbd_read_state(D); __ns.T1 = (S1); \ + __ns.T2 = (S2); __ns; }) +#define _NS3(D, T1, S1, T2, S2, T3, S3) \ + D, ({ union drbd_state __ns; __ns = drbd_read_state(D); __ns.T1 = (S1); \ + __ns.T2 = (S2); __ns.T3 = (S3); __ns; }) + +enum chg_state_flags { + CS_HARD = 1 << 0, + CS_VERBOSE = 1 << 1, + CS_WAIT_COMPLETE = 1 << 2, + CS_SERIALIZE = 1 << 3, + CS_ORDERED = CS_WAIT_COMPLETE + CS_SERIALIZE, + CS_LOCAL_ONLY = 1 << 4, /* Do not consider a device pair wide state change */ + CS_DC_ROLE = 1 << 5, /* DC = display as connection state change */ + CS_DC_PEER = 1 << 6, + CS_DC_CONN = 1 << 7, + CS_DC_DISK = 1 << 8, + CS_DC_PDSK = 1 << 9, + CS_DC_SUSP = 1 << 10, + CS_DC_MASK = CS_DC_ROLE + CS_DC_PEER + CS_DC_CONN + CS_DC_DISK + CS_DC_PDSK, + CS_IGN_OUTD_FAIL = 1 << 11, +}; + +/* drbd_dev_state and drbd_state are different types. This is to stress the + small difference. There is no suspended flag (.susp), and no suspended + while fence handler runs flas (susp_fen). */ +union drbd_dev_state { + struct { +#if defined(__LITTLE_ENDIAN_BITFIELD) + unsigned role:2 ; /* 3/4 primary/secondary/unknown */ + unsigned peer:2 ; /* 3/4 primary/secondary/unknown */ + unsigned conn:5 ; /* 17/32 cstates */ + unsigned disk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */ + unsigned pdsk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */ + unsigned _unused:1 ; + unsigned aftr_isp:1 ; /* isp .. imposed sync pause */ + unsigned peer_isp:1 ; + unsigned user_isp:1 ; + unsigned _pad:11; /* 0 unused */ +#elif defined(__BIG_ENDIAN_BITFIELD) + unsigned _pad:11; + unsigned user_isp:1 ; + unsigned peer_isp:1 ; + unsigned aftr_isp:1 ; /* isp .. 
imposed sync pause */ + unsigned _unused:1 ; + unsigned pdsk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */ + unsigned disk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */ + unsigned conn:5 ; /* 17/32 cstates */ + unsigned peer:2 ; /* 3/4 primary/secondary/unknown */ + unsigned role:2 ; /* 3/4 primary/secondary/unknown */ +#else +# error "this endianess is not supported" +#endif + }; + unsigned int i; +}; + +extern enum drbd_state_rv drbd_change_state(struct drbd_device *device, + enum chg_state_flags f, + union drbd_state mask, + union drbd_state val); +extern void drbd_force_state(struct drbd_device *, union drbd_state, + union drbd_state); +extern enum drbd_state_rv _drbd_request_state(struct drbd_device *, + union drbd_state, + union drbd_state, + enum chg_state_flags); + +extern enum drbd_state_rv +_drbd_request_state_holding_state_mutex(struct drbd_device *, union drbd_state, + union drbd_state, enum chg_state_flags); + +extern enum drbd_state_rv __drbd_set_state(struct drbd_device *, union drbd_state, + enum chg_state_flags, + struct completion *done); +extern void print_st_err(struct drbd_device *, union drbd_state, + union drbd_state, int); + +enum drbd_state_rv +_conn_request_state(struct drbd_connection *connection, union drbd_state mask, union drbd_state val, + enum chg_state_flags flags); + +enum drbd_state_rv +conn_request_state(struct drbd_connection *connection, union drbd_state mask, union drbd_state val, + enum chg_state_flags flags); + +extern void drbd_resume_al(struct drbd_device *device); +extern bool conn_all_vols_unconf(struct drbd_connection *connection); + +/** + * drbd_request_state() - Reqest a state change + * @device: DRBD device. + * @mask: mask of state bits to change. + * @val: value of new state bits. + * + * This is the most graceful way of requesting a state change. It is verbose + * quite verbose in case the state change is not possible, and all those + * state changes are globally serialized. + */ +static inline int drbd_request_state(struct drbd_device *device, + union drbd_state mask, + union drbd_state val) +{ + return _drbd_request_state(device, mask, val, CS_VERBOSE + CS_ORDERED); +} + +enum drbd_role conn_highest_role(struct drbd_connection *connection); +enum drbd_role conn_highest_peer(struct drbd_connection *connection); +enum drbd_disk_state conn_highest_disk(struct drbd_connection *connection); +enum drbd_disk_state conn_lowest_disk(struct drbd_connection *connection); +enum drbd_disk_state conn_highest_pdsk(struct drbd_connection *connection); +enum drbd_conns conn_lowest_conn(struct drbd_connection *connection); + +#endif diff --git a/drivers/block/drbd/drbd_strings.c b/drivers/block/drbd/drbd_strings.c new file mode 100644 index 000000000..80b0f63c7 --- /dev/null +++ b/drivers/block/drbd/drbd_strings.c @@ -0,0 +1,118 @@ +/* + drbd.h + + This file is part of DRBD by Philipp Reisner and Lars Ellenberg. + + Copyright (C) 2003-2008, LINBIT Information Technologies GmbH. + Copyright (C) 2003-2008, Philipp Reisner <philipp.reisner@linbit.com>. + Copyright (C) 2003-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. + + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + +*/ + +#include <linux/drbd.h> +#include "drbd_strings.h" + +static const char *drbd_conn_s_names[] = { + [C_STANDALONE] = "StandAlone", + [C_DISCONNECTING] = "Disconnecting", + [C_UNCONNECTED] = "Unconnected", + [C_TIMEOUT] = "Timeout", + [C_BROKEN_PIPE] = "BrokenPipe", + [C_NETWORK_FAILURE] = "NetworkFailure", + [C_PROTOCOL_ERROR] = "ProtocolError", + [C_WF_CONNECTION] = "WFConnection", + [C_WF_REPORT_PARAMS] = "WFReportParams", + [C_TEAR_DOWN] = "TearDown", + [C_CONNECTED] = "Connected", + [C_STARTING_SYNC_S] = "StartingSyncS", + [C_STARTING_SYNC_T] = "StartingSyncT", + [C_WF_BITMAP_S] = "WFBitMapS", + [C_WF_BITMAP_T] = "WFBitMapT", + [C_WF_SYNC_UUID] = "WFSyncUUID", + [C_SYNC_SOURCE] = "SyncSource", + [C_SYNC_TARGET] = "SyncTarget", + [C_PAUSED_SYNC_S] = "PausedSyncS", + [C_PAUSED_SYNC_T] = "PausedSyncT", + [C_VERIFY_S] = "VerifyS", + [C_VERIFY_T] = "VerifyT", + [C_AHEAD] = "Ahead", + [C_BEHIND] = "Behind", +}; + +static const char *drbd_role_s_names[] = { + [R_PRIMARY] = "Primary", + [R_SECONDARY] = "Secondary", + [R_UNKNOWN] = "Unknown" +}; + +static const char *drbd_disk_s_names[] = { + [D_DISKLESS] = "Diskless", + [D_ATTACHING] = "Attaching", + [D_FAILED] = "Failed", + [D_NEGOTIATING] = "Negotiating", + [D_INCONSISTENT] = "Inconsistent", + [D_OUTDATED] = "Outdated", + [D_UNKNOWN] = "DUnknown", + [D_CONSISTENT] = "Consistent", + [D_UP_TO_DATE] = "UpToDate", +}; + +static const char *drbd_state_sw_errors[] = { + [-SS_TWO_PRIMARIES] = "Multiple primaries not allowed by config", + [-SS_NO_UP_TO_DATE_DISK] = "Need access to UpToDate data", + [-SS_NO_LOCAL_DISK] = "Can not resync without local disk", + [-SS_NO_REMOTE_DISK] = "Can not resync without remote disk", + [-SS_CONNECTED_OUTDATES] = "Refusing to be Outdated while Connected", + [-SS_PRIMARY_NOP] = "Refusing to be Primary while peer is not outdated", + [-SS_RESYNC_RUNNING] = "Can not start OV/resync since it is already active", + [-SS_ALREADY_STANDALONE] = "Can not disconnect a StandAlone device", + [-SS_CW_FAILED_BY_PEER] = "State change was refused by peer node", + [-SS_IS_DISKLESS] = "Device is diskless, the requested operation requires a disk", + [-SS_DEVICE_IN_USE] = "Device is held open by someone", + [-SS_NO_NET_CONFIG] = "Have no net/connection configuration", + [-SS_NO_VERIFY_ALG] = "Need a verify algorithm to start online verify", + [-SS_NEED_CONNECTION] = "Need a connection to start verify or resync", + [-SS_NOT_SUPPORTED] = "Peer does not support protocol", + [-SS_LOWER_THAN_OUTDATED] = "Disk state is lower than outdated", + [-SS_IN_TRANSIENT_STATE] = "In transient state, retry after next state change", + [-SS_CONCURRENT_ST_CHG] = "Concurrent state changes detected and aborted", + [-SS_OUTDATE_WO_CONN] = "Need a connection for a graceful disconnect/outdate peer", + [-SS_O_VOL_PEER_PRI] = "Other vol primary on peer not allowed by config", +}; + +const char *drbd_conn_str(enum drbd_conns s) +{ + /* enums are unsigned... */ + return s > C_BEHIND ? "TOO_LARGE" : drbd_conn_s_names[s]; +} + +const char *drbd_role_str(enum drbd_role s) +{ + return s > R_SECONDARY ? "TOO_LARGE" : drbd_role_s_names[s]; +} + +const char *drbd_disk_str(enum drbd_disk_state s) +{ + return s > D_UP_TO_DATE ? 
"TOO_LARGE" : drbd_disk_s_names[s]; +} + +const char *drbd_set_st_err_str(enum drbd_state_rv err) +{ + return err <= SS_AFTER_LAST_ERROR ? "TOO_SMALL" : + err > SS_TWO_PRIMARIES ? "TOO_LARGE" + : drbd_state_sw_errors[-err]; +} diff --git a/drivers/block/drbd/drbd_strings.h b/drivers/block/drbd/drbd_strings.h new file mode 100644 index 000000000..f9923cc88 --- /dev/null +++ b/drivers/block/drbd/drbd_strings.h @@ -0,0 +1,9 @@ +#ifndef __DRBD_STRINGS_H +#define __DRBD_STRINGS_H + +extern const char *drbd_conn_str(enum drbd_conns); +extern const char *drbd_role_str(enum drbd_role); +extern const char *drbd_disk_str(enum drbd_disk_state); +extern const char *drbd_set_st_err_str(enum drbd_state_rv); + +#endif /* __DRBD_STRINGS_H */ diff --git a/drivers/block/drbd/drbd_vli.h b/drivers/block/drbd/drbd_vli.h new file mode 100644 index 000000000..8cb1532a3 --- /dev/null +++ b/drivers/block/drbd/drbd_vli.h @@ -0,0 +1,351 @@ +/* +-*- linux-c -*- + drbd_receiver.c + This file is part of DRBD by Philipp Reisner and Lars Ellenberg. + + Copyright (C) 2001-2008, LINBIT Information Technologies GmbH. + Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>. + Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. + + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#ifndef _DRBD_VLI_H +#define _DRBD_VLI_H + +/* + * At a granularity of 4KiB storage represented per bit, + * and stroage sizes of several TiB, + * and possibly small-bandwidth replication, + * the bitmap transfer time can take much too long, + * if transmitted in plain text. + * + * We try to reduce the transferred bitmap information + * by encoding runlengths of bit polarity. + * + * We never actually need to encode a "zero" (runlengths are positive). + * But then we have to store the value of the first bit. + * The first bit of information thus shall encode if the first runlength + * gives the number of set or unset bits. + * + * We assume that large areas are either completely set or unset, + * which gives good compression with any runlength method, + * even when encoding the runlength as fixed size 32bit/64bit integers. + * + * Still, there may be areas where the polarity flips every few bits, + * and encoding the runlength sequence of those areas with fix size + * integers would be much worse than plaintext. + * + * We want to encode small runlength values with minimum code length, + * while still being able to encode a Huge run of all zeros. + * + * Thus we need a Variable Length Integer encoding, VLI. + * + * For some cases, we produce more code bits than plaintext input. + * We need to send incompressible chunks as plaintext, skip over them + * and then see if the next chunk compresses better. + * + * We don't care too much about "excellent" compression ratio for large + * runlengths (all set/all clear): whether we achieve a factor of 100 + * or 1000 is not that much of an issue. 
+ * We do not want to waste too much on short runlengths in the "noisy" + * parts of the bitmap, though. + * + * There are endless variants of VLI, we experimented with: + * * simple byte-based + * * various bit based with different code word length. + * + * To avoid yet an other configuration parameter (choice of bitmap compression + * algorithm) which was difficult to explain and tune, we just chose the one + * variant that turned out best in all test cases. + * Based on real world usage patterns, with device sizes ranging from a few GiB + * to several TiB, file server/mailserver/webserver/mysql/postgress, + * mostly idle to really busy, the all time winner (though sometimes only + * marginally better) is: + */ + +/* + * encoding is "visualised" as + * __little endian__ bitstream, least significant bit first (left most) + * + * this particular encoding is chosen so that the prefix code + * starts as unary encoding the level, then modified so that + * 10 levels can be described in 8bit, with minimal overhead + * for the smaller levels. + * + * Number of data bits follow fibonacci sequence, with the exception of the + * last level (+1 data bit, so it makes 64bit total). The only worse code when + * encoding bit polarity runlength is 1 plain bits => 2 code bits. +prefix data bits max val Nº data bits +0 x 0x2 1 +10 x 0x4 1 +110 xx 0x8 2 +1110 xxx 0x10 3 +11110 xxx xx 0x30 5 +111110 xx xxxxxx 0x130 8 +11111100 xxxxxxxx xxxxx 0x2130 13 +11111110 xxxxxxxx xxxxxxxx xxxxx 0x202130 21 +11111101 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xx 0x400202130 34 +11111111 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx 56 + * maximum encodable value: 0x100000400202130 == 2**56 + some */ + +/* compression "table": + transmitted x 0.29 + as plaintext x ........................ + x ........................ + x ........................ + x 0.59 0.21........................ + x ........................................................ + x .. c ................................................... + x 0.44.. o ................................................... + x .......... d ................................................... + x .......... e ................................................... + X............. ................................................... + x.............. b ................................................... +2.0x............... i ................................................... + #X................ t ................................................... + #................. s ........................... plain bits .......... +-+----------------------------------------------------------------------- + 1 16 32 64 +*/ + +/* LEVEL: (total bits, prefix bits, prefix value), + * sorted ascending by number of total bits. + * The rest of the code table is calculated at compiletime from this. */ + +/* fibonacci data 1, 1, ... */ +#define VLI_L_1_1() do { \ + LEVEL( 2, 1, 0x00); \ + LEVEL( 3, 2, 0x01); \ + LEVEL( 5, 3, 0x03); \ + LEVEL( 7, 4, 0x07); \ + LEVEL(10, 5, 0x0f); \ + LEVEL(14, 6, 0x1f); \ + LEVEL(21, 8, 0x3f); \ + LEVEL(29, 8, 0x7f); \ + LEVEL(42, 8, 0xbf); \ + LEVEL(64, 8, 0xff); \ + } while (0) + +/* finds a suitable level to decode the least significant part of in. + * returns number of bits consumed. + * + * BUG() for bad input, as that would mean a buggy code table. 
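A worked example makes the level table easier to check. The following stand-alone user-space sketch (an editorial illustration, not part of the driver) mirrors the arithmetic that __vli_encode_bits() below performs, for the first four levels only, so you can see run lengths 1..2 costing 2 code bits, 3..4 costing 3, 5..8 costing 5 and 9..16 costing 7, exactly as the prefix/data-bits table above promises:

#include <stdint.h>
#include <stdio.h>

/* (total bits, prefix bits, prefix value): first four levels of VLI_L_1_1 */
struct vli_level { int total, prefix; uint64_t pfx_val; };

static const struct vli_level levels[] = {
	{ 2, 1, 0x00 }, { 3, 2, 0x01 }, { 5, 3, 0x03 }, { 7, 4, 0x07 },
};

/* same math as __vli_encode_bits(): returns the code length in bits,
 * or -1 if the value would need one of the larger levels left out here */
static int toy_encode(uint64_t in, uint64_t *code)
{
	uint64_t max = 0, adj = 1;
	unsigned int i;

	for (i = 0; i < sizeof(levels) / sizeof(levels[0]); i++) {
		max += 1ULL << (levels[i].total - levels[i].prefix);
		if (in <= max) {
			*code = ((in - adj) << levels[i].prefix) | levels[i].pfx_val;
			return levels[i].total;
		}
		adj = max + 1;
	}
	return -1;
}

int main(void)
{
	uint64_t rl, code;

	for (rl = 1; rl <= 16; rl++) {
		int bits = toy_encode(rl, &code);

		printf("runlength %2llu -> %2d code bits (code 0x%llx)\n",
		       (unsigned long long)rl, bits, (unsigned long long)code);
	}
	return 0;
}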
*/ +static inline int vli_decode_bits(u64 *out, const u64 in) +{ + u64 adj = 1; + +#define LEVEL(t,b,v) \ + do { \ + if ((in & ((1 << b) -1)) == v) { \ + *out = ((in & ((~0ULL) >> (64-t))) >> b) + adj; \ + return t; \ + } \ + adj += 1ULL << (t - b); \ + } while (0) + + VLI_L_1_1(); + + /* NOT REACHED, if VLI_LEVELS code table is defined properly */ + BUG(); +#undef LEVEL +} + +/* return number of code bits needed, + * or negative error number */ +static inline int __vli_encode_bits(u64 *out, const u64 in) +{ + u64 max = 0; + u64 adj = 1; + + if (in == 0) + return -EINVAL; + +#define LEVEL(t,b,v) do { \ + max += 1ULL << (t - b); \ + if (in <= max) { \ + if (out) \ + *out = ((in - adj) << b) | v; \ + return t; \ + } \ + adj = max + 1; \ + } while (0) + + VLI_L_1_1(); + + return -EOVERFLOW; +#undef LEVEL +} + +#undef VLI_L_1_1 + +/* code from here down is independend of actually used bit code */ + +/* + * Code length is determined by some unique (e.g. unary) prefix. + * This encodes arbitrary bit length, not whole bytes: we have a bit-stream, + * not a byte stream. + */ + +/* for the bitstream, we need a cursor */ +struct bitstream_cursor { + /* the current byte */ + u8 *b; + /* the current bit within *b, nomalized: 0..7 */ + unsigned int bit; +}; + +/* initialize cursor to point to first bit of stream */ +static inline void bitstream_cursor_reset(struct bitstream_cursor *cur, void *s) +{ + cur->b = s; + cur->bit = 0; +} + +/* advance cursor by that many bits; maximum expected input value: 64, + * but depending on VLI implementation, it may be more. */ +static inline void bitstream_cursor_advance(struct bitstream_cursor *cur, unsigned int bits) +{ + bits += cur->bit; + cur->b = cur->b + (bits >> 3); + cur->bit = bits & 7; +} + +/* the bitstream itself knows its length */ +struct bitstream { + struct bitstream_cursor cur; + unsigned char *buf; + size_t buf_len; /* in bytes */ + + /* for input stream: + * number of trailing 0 bits for padding + * total number of valid bits in stream: buf_len * 8 - pad_bits */ + unsigned int pad_bits; +}; + +static inline void bitstream_init(struct bitstream *bs, void *s, size_t len, unsigned int pad_bits) +{ + bs->buf = s; + bs->buf_len = len; + bs->pad_bits = pad_bits; + bitstream_cursor_reset(&bs->cur, bs->buf); +} + +static inline void bitstream_rewind(struct bitstream *bs) +{ + bitstream_cursor_reset(&bs->cur, bs->buf); + memset(bs->buf, 0, bs->buf_len); +} + +/* Put (at most 64) least significant bits of val into bitstream, and advance cursor. + * Ignores "pad_bits". + * Returns zero if bits == 0 (nothing to do). + * Returns number of bits used if successful. + * + * If there is not enough room left in bitstream, + * leaves bitstream unchanged and returns -ENOBUFS. + */ +static inline int bitstream_put_bits(struct bitstream *bs, u64 val, const unsigned int bits) +{ + unsigned char *b = bs->cur.b; + unsigned int tmp; + + if (bits == 0) + return 0; + + if ((bs->cur.b + ((bs->cur.bit + bits -1) >> 3)) - bs->buf >= bs->buf_len) + return -ENOBUFS; + + /* paranoia: strip off hi bits; they should not be set anyways. */ + if (bits < 64) + val &= ~0ULL >> (64 - bits); + + *b++ |= (val & 0xff) << bs->cur.bit; + + for (tmp = 8 - bs->cur.bit; tmp < bits; tmp += 8) + *b++ |= (val >> tmp) & 0xff; + + bitstream_cursor_advance(&bs->cur, bits); + return bits; +} + +/* Fetch (at most 64) bits from bitstream into *out, and advance cursor. + * + * If more than 64 bits are requested, returns -EINVAL and leave *out unchanged. 
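On the encoder side, the helpers above are meant to be combined the way the header comment of this file describes: emit code words for one run length after another until the output chunk is full, then hand that chunk off and start over (falling back to plaintext when the code ends up larger than the plain bits). A hedged sketch of that loop follows; vli_encode_bits() is defined a few lines further down, and next_runlength() is a hypothetical stand-in for the caller's bitmap scan, not a function from this patch:

	unsigned char buf[4096];
	struct bitstream bs;
	u64 rl;
	int bits;

	bitstream_init(&bs, buf, sizeof(buf), 0);

	while (next_runlength(&rl)) {	/* hypothetical iterator over bit runs */
		bits = vli_encode_bits(&bs, rl);
		if (bits == -ENOBUFS) {
			/* chunk full: transmit it (or the plaintext, if that
			 * turned out smaller), bitstream_rewind(&bs), go on */
			break;
		}
		if (bits < 0) {
			/* -EINVAL: a zero run length would be a caller bug */
			break;
		}
	}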
+ * + * If there are less than the requested number of valid bits left in the + * bitstream, still fetches all available bits. + * + * Returns number of actually fetched bits. + */ +static inline int bitstream_get_bits(struct bitstream *bs, u64 *out, int bits) +{ + u64 val; + unsigned int n; + + if (bits > 64) + return -EINVAL; + + if (bs->cur.b + ((bs->cur.bit + bs->pad_bits + bits -1) >> 3) - bs->buf >= bs->buf_len) + bits = ((bs->buf_len - (bs->cur.b - bs->buf)) << 3) + - bs->cur.bit - bs->pad_bits; + + if (bits == 0) { + *out = 0; + return 0; + } + + /* get the high bits */ + val = 0; + n = (bs->cur.bit + bits + 7) >> 3; + /* n may be at most 9, if cur.bit + bits > 64 */ + /* which means this copies at most 8 byte */ + if (n) { + memcpy(&val, bs->cur.b+1, n - 1); + val = le64_to_cpu(val) << (8 - bs->cur.bit); + } + + /* we still need the low bits */ + val |= bs->cur.b[0] >> bs->cur.bit; + + /* and mask out bits we don't want */ + val &= ~0ULL >> (64 - bits); + + bitstream_cursor_advance(&bs->cur, bits); + *out = val; + + return bits; +} + +/* encodes @in as vli into @bs; + + * return values + * > 0: number of bits successfully stored in bitstream + * -ENOBUFS @bs is full + * -EINVAL input zero (invalid) + * -EOVERFLOW input too large for this vli code (invalid) + */ +static inline int vli_encode_bits(struct bitstream *bs, u64 in) +{ + u64 code = code; + int bits = __vli_encode_bits(&code, in); + + if (bits <= 0) + return bits; + + return bitstream_put_bits(bs, code, bits); +} + +#endif diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c new file mode 100644 index 000000000..d0fae55d8 --- /dev/null +++ b/drivers/block/drbd/drbd_worker.c @@ -0,0 +1,2156 @@ +/* + drbd_worker.c + + This file is part of DRBD by Philipp Reisner and Lars Ellenberg. + + Copyright (C) 2001-2008, LINBIT Information Technologies GmbH. + Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>. + Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. + + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + +*/ + +#include <linux/module.h> +#include <linux/drbd.h> +#include <linux/sched.h> +#include <linux/wait.h> +#include <linux/mm.h> +#include <linux/memcontrol.h> +#include <linux/mm_inline.h> +#include <linux/slab.h> +#include <linux/random.h> +#include <linux/string.h> +#include <linux/scatterlist.h> + +#include "drbd_int.h" +#include "drbd_protocol.h" +#include "drbd_req.h" + +static int make_ov_request(struct drbd_device *, int); +static int make_resync_request(struct drbd_device *, int); + +/* endio handlers: + * drbd_md_endio (defined here) + * drbd_request_endio (defined here) + * drbd_peer_request_endio (defined here) + * drbd_bm_endio (defined in drbd_bitmap.c) + * + * For all these callbacks, note the following: + * The callbacks will be called in irq context by the IDE drivers, + * and in Softirqs/Tasklets/BH context by the SCSI drivers. 
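+ * (hence the spin_lock_irqsave() on the resource req_lock in the
+ * completion paths below)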
+ * Try to get the locking right :) + * + */ + + +/* About the global_state_lock + Each state transition on an device holds a read lock. In case we have + to evaluate the resync after dependencies, we grab a write lock, because + we need stable states on all devices for that. */ +rwlock_t global_state_lock; + +/* used for synchronous meta data and bitmap IO + * submitted by drbd_md_sync_page_io() + */ +void drbd_md_endio(struct bio *bio, int error) +{ + struct drbd_device *device; + + device = bio->bi_private; + device->md_io.error = error; + + /* We grabbed an extra reference in _drbd_md_sync_page_io() to be able + * to timeout on the lower level device, and eventually detach from it. + * If this io completion runs after that timeout expired, this + * drbd_md_put_buffer() may allow us to finally try and re-attach. + * During normal operation, this only puts that extra reference + * down to 1 again. + * Make sure we first drop the reference, and only then signal + * completion, or we may (in drbd_al_read_log()) cycle so fast into the + * next drbd_md_sync_page_io(), that we trigger the + * ASSERT(atomic_read(&device->md_io_in_use) == 1) there. + */ + drbd_md_put_buffer(device); + device->md_io.done = 1; + wake_up(&device->misc_wait); + bio_put(bio); + if (device->ldev) /* special case: drbd_md_read() during drbd_adm_attach() */ + put_ldev(device); +} + +/* reads on behalf of the partner, + * "submitted" by the receiver + */ +static void drbd_endio_read_sec_final(struct drbd_peer_request *peer_req) __releases(local) +{ + unsigned long flags = 0; + struct drbd_peer_device *peer_device = peer_req->peer_device; + struct drbd_device *device = peer_device->device; + + spin_lock_irqsave(&device->resource->req_lock, flags); + device->read_cnt += peer_req->i.size >> 9; + list_del(&peer_req->w.list); + if (list_empty(&device->read_ee)) + wake_up(&device->ee_wait); + if (test_bit(__EE_WAS_ERROR, &peer_req->flags)) + __drbd_chk_io_error(device, DRBD_READ_ERROR); + spin_unlock_irqrestore(&device->resource->req_lock, flags); + + drbd_queue_work(&peer_device->connection->sender_work, &peer_req->w); + put_ldev(device); +} + +/* writes on behalf of the partner, or resync writes, + * "submitted" by the receiver, final stage. */ +void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req) __releases(local) +{ + unsigned long flags = 0; + struct drbd_peer_device *peer_device = peer_req->peer_device; + struct drbd_device *device = peer_device->device; + struct drbd_interval i; + int do_wake; + u64 block_id; + int do_al_complete_io; + + /* after we moved peer_req to done_ee, + * we may no longer access it, + * it may be freed/reused already! + * (as soon as we release the req_lock) */ + i = peer_req->i; + do_al_complete_io = peer_req->flags & EE_CALL_AL_COMPLETE_IO; + block_id = peer_req->block_id; + peer_req->flags &= ~EE_CALL_AL_COMPLETE_IO; + + spin_lock_irqsave(&device->resource->req_lock, flags); + device->writ_cnt += peer_req->i.size >> 9; + list_move_tail(&peer_req->w.list, &device->done_ee); + + /* + * Do not remove from the write_requests tree here: we did not send the + * Ack yet and did not wake possibly waiting conflicting requests. + * Removed from the tree from "drbd_process_done_ee" within the + * appropriate dw.cb (e_end_block/e_end_resync_block) or from + * _drbd_clear_done_ee. + */ + + do_wake = list_empty(block_id == ID_SYNCER ? &device->sync_ee : &device->active_ee); + + /* FIXME do we want to detach for failed REQ_DISCARD? 
+ * ((peer_req->flags & (EE_WAS_ERROR|EE_IS_TRIM)) == EE_WAS_ERROR) */ + if (peer_req->flags & EE_WAS_ERROR) + __drbd_chk_io_error(device, DRBD_WRITE_ERROR); + spin_unlock_irqrestore(&device->resource->req_lock, flags); + + if (block_id == ID_SYNCER) + drbd_rs_complete_io(device, i.sector); + + if (do_wake) + wake_up(&device->ee_wait); + + if (do_al_complete_io) + drbd_al_complete_io(device, &i); + + wake_asender(peer_device->connection); + put_ldev(device); +} + +/* writes on behalf of the partner, or resync writes, + * "submitted" by the receiver. + */ +void drbd_peer_request_endio(struct bio *bio, int error) +{ + struct drbd_peer_request *peer_req = bio->bi_private; + struct drbd_device *device = peer_req->peer_device->device; + int uptodate = bio_flagged(bio, BIO_UPTODATE); + int is_write = bio_data_dir(bio) == WRITE; + int is_discard = !!(bio->bi_rw & REQ_DISCARD); + + if (error && __ratelimit(&drbd_ratelimit_state)) + drbd_warn(device, "%s: error=%d s=%llus\n", + is_write ? (is_discard ? "discard" : "write") + : "read", error, + (unsigned long long)peer_req->i.sector); + if (!error && !uptodate) { + if (__ratelimit(&drbd_ratelimit_state)) + drbd_warn(device, "%s: setting error to -EIO s=%llus\n", + is_write ? "write" : "read", + (unsigned long long)peer_req->i.sector); + /* strange behavior of some lower level drivers... + * fail the request by clearing the uptodate flag, + * but do not return any error?! */ + error = -EIO; + } + + if (error) + set_bit(__EE_WAS_ERROR, &peer_req->flags); + + bio_put(bio); /* no need for the bio anymore */ + if (atomic_dec_and_test(&peer_req->pending_bios)) { + if (is_write) + drbd_endio_write_sec_final(peer_req); + else + drbd_endio_read_sec_final(peer_req); + } +} + +/* read, readA or write requests on R_PRIMARY coming from drbd_make_request + */ +void drbd_request_endio(struct bio *bio, int error) +{ + unsigned long flags; + struct drbd_request *req = bio->bi_private; + struct drbd_device *device = req->device; + struct bio_and_error m; + enum drbd_req_event what; + int uptodate = bio_flagged(bio, BIO_UPTODATE); + + if (!error && !uptodate) { + drbd_warn(device, "p %s: setting error to -EIO\n", + bio_data_dir(bio) == WRITE ? "write" : "read"); + /* strange behavior of some lower level drivers... + * fail the request by clearing the uptodate flag, + * but do not return any error?! */ + error = -EIO; + } + + + /* If this request was aborted locally before, + * but now was completed "successfully", + * chances are that this caused arbitrary data corruption. + * + * "aborting" requests, or force-detaching the disk, is intended for + * completely blocked/hung local backing devices which do no longer + * complete requests at all, not even do error completions. In this + * situation, usually a hard-reset and failover is the only way out. + * + * By "aborting", basically faking a local error-completion, + * we allow for a more graceful swichover by cleanly migrating services. + * Still the affected node has to be rebooted "soon". + * + * By completing these requests, we allow the upper layers to re-use + * the associated data pages. + * + * If later the local backing device "recovers", and now DMAs some data + * from disk into the original request pages, in the best case it will + * just put random data into unused pages; but typically it will corrupt + * meanwhile completely unrelated data, causing all sorts of damage. + * + * Which means delayed successful completion, + * especially for READ requests, + * is a reason to panic(). 
+ * + * We assume that a delayed *error* completion is OK, + * though we still will complain noisily about it. + */ + if (unlikely(req->rq_state & RQ_LOCAL_ABORTED)) { + if (__ratelimit(&drbd_ratelimit_state)) + drbd_emerg(device, "delayed completion of aborted local request; disk-timeout may be too aggressive\n"); + + if (!error) + panic("possible random memory corruption caused by delayed completion of aborted local request\n"); + } + + /* to avoid recursion in __req_mod */ + if (unlikely(error)) { + if (bio->bi_rw & REQ_DISCARD) + what = (error == -EOPNOTSUPP) + ? DISCARD_COMPLETED_NOTSUPP + : DISCARD_COMPLETED_WITH_ERROR; + else + what = (bio_data_dir(bio) == WRITE) + ? WRITE_COMPLETED_WITH_ERROR + : (bio_rw(bio) == READ) + ? READ_COMPLETED_WITH_ERROR + : READ_AHEAD_COMPLETED_WITH_ERROR; + } else + what = COMPLETED_OK; + + bio_put(req->private_bio); + req->private_bio = ERR_PTR(error); + + /* not req_mod(), we need irqsave here! */ + spin_lock_irqsave(&device->resource->req_lock, flags); + __req_mod(req, what, &m); + spin_unlock_irqrestore(&device->resource->req_lock, flags); + put_ldev(device); + + if (m.bio) + complete_master_bio(device, &m); +} + +void drbd_csum_ee(struct crypto_hash *tfm, struct drbd_peer_request *peer_req, void *digest) +{ + struct hash_desc desc; + struct scatterlist sg; + struct page *page = peer_req->pages; + struct page *tmp; + unsigned len; + + desc.tfm = tfm; + desc.flags = 0; + + sg_init_table(&sg, 1); + crypto_hash_init(&desc); + + while ((tmp = page_chain_next(page))) { + /* all but the last page will be fully used */ + sg_set_page(&sg, page, PAGE_SIZE, 0); + crypto_hash_update(&desc, &sg, sg.length); + page = tmp; + } + /* and now the last, possibly only partially used page */ + len = peer_req->i.size & (PAGE_SIZE - 1); + sg_set_page(&sg, page, len ?: PAGE_SIZE, 0); + crypto_hash_update(&desc, &sg, sg.length); + crypto_hash_final(&desc, digest); +} + +void drbd_csum_bio(struct crypto_hash *tfm, struct bio *bio, void *digest) +{ + struct hash_desc desc; + struct scatterlist sg; + struct bio_vec bvec; + struct bvec_iter iter; + + desc.tfm = tfm; + desc.flags = 0; + + sg_init_table(&sg, 1); + crypto_hash_init(&desc); + + bio_for_each_segment(bvec, bio, iter) { + sg_set_page(&sg, bvec.bv_page, bvec.bv_len, bvec.bv_offset); + crypto_hash_update(&desc, &sg, sg.length); + } + crypto_hash_final(&desc, digest); +} + +/* MAYBE merge common code with w_e_end_ov_req */ +static int w_e_send_csum(struct drbd_work *w, int cancel) +{ + struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w); + struct drbd_peer_device *peer_device = peer_req->peer_device; + struct drbd_device *device = peer_device->device; + int digest_size; + void *digest; + int err = 0; + + if (unlikely(cancel)) + goto out; + + if (unlikely((peer_req->flags & EE_WAS_ERROR) != 0)) + goto out; + + digest_size = crypto_hash_digestsize(peer_device->connection->csums_tfm); + digest = kmalloc(digest_size, GFP_NOIO); + if (digest) { + sector_t sector = peer_req->i.sector; + unsigned int size = peer_req->i.size; + drbd_csum_ee(peer_device->connection->csums_tfm, peer_req, digest); + /* Free peer_req and pages before send. + * In case we block on congestion, we could otherwise run into + * some distributed deadlock, if the other side blocks on + * congestion as well, because our receiver blocks in + * drbd_alloc_pages due to pp_in_use > max_buffers. 
*/ + drbd_free_peer_req(device, peer_req); + peer_req = NULL; + inc_rs_pending(device); + err = drbd_send_drequest_csum(peer_device, sector, size, + digest, digest_size, + P_CSUM_RS_REQUEST); + kfree(digest); + } else { + drbd_err(device, "kmalloc() of digest failed.\n"); + err = -ENOMEM; + } + +out: + if (peer_req) + drbd_free_peer_req(device, peer_req); + + if (unlikely(err)) + drbd_err(device, "drbd_send_drequest(..., csum) failed\n"); + return err; +} + +#define GFP_TRY (__GFP_HIGHMEM | __GFP_NOWARN) + +static int read_for_csum(struct drbd_peer_device *peer_device, sector_t sector, int size) +{ + struct drbd_device *device = peer_device->device; + struct drbd_peer_request *peer_req; + + if (!get_ldev(device)) + return -EIO; + + /* GFP_TRY, because if there is no memory available right now, this may + * be rescheduled for later. It is "only" background resync, after all. */ + peer_req = drbd_alloc_peer_req(peer_device, ID_SYNCER /* unused */, sector, + size, true /* has real payload */, GFP_TRY); + if (!peer_req) + goto defer; + + peer_req->w.cb = w_e_send_csum; + spin_lock_irq(&device->resource->req_lock); + list_add_tail(&peer_req->w.list, &device->read_ee); + spin_unlock_irq(&device->resource->req_lock); + + atomic_add(size >> 9, &device->rs_sect_ev); + if (drbd_submit_peer_request(device, peer_req, READ, DRBD_FAULT_RS_RD) == 0) + return 0; + + /* If it failed because of ENOMEM, retry should help. If it failed + * because bio_add_page failed (probably broken lower level driver), + * retry may or may not help. + * If it does not, you may need to force disconnect. */ + spin_lock_irq(&device->resource->req_lock); + list_del(&peer_req->w.list); + spin_unlock_irq(&device->resource->req_lock); + + drbd_free_peer_req(device, peer_req); +defer: + put_ldev(device); + return -EAGAIN; +} + +int w_resync_timer(struct drbd_work *w, int cancel) +{ + struct drbd_device *device = + container_of(w, struct drbd_device, resync_work); + + switch (device->state.conn) { + case C_VERIFY_S: + make_ov_request(device, cancel); + break; + case C_SYNC_TARGET: + make_resync_request(device, cancel); + break; + } + + return 0; +} + +void resync_timer_fn(unsigned long data) +{ + struct drbd_device *device = (struct drbd_device *) data; + + drbd_queue_work_if_unqueued( + &first_peer_device(device)->connection->sender_work, + &device->resync_work); +} + +static void fifo_set(struct fifo_buffer *fb, int value) +{ + int i; + + for (i = 0; i < fb->size; i++) + fb->values[i] = value; +} + +static int fifo_push(struct fifo_buffer *fb, int value) +{ + int ov; + + ov = fb->values[fb->head_index]; + fb->values[fb->head_index++] = value; + + if (fb->head_index >= fb->size) + fb->head_index = 0; + + return ov; +} + +static void fifo_add_val(struct fifo_buffer *fb, int value) +{ + int i; + + for (i = 0; i < fb->size; i++) + fb->values[i] += value; +} + +struct fifo_buffer *fifo_alloc(int fifo_size) +{ + struct fifo_buffer *fb; + + fb = kzalloc(sizeof(struct fifo_buffer) + sizeof(int) * fifo_size, GFP_NOIO); + if (!fb) + return NULL; + + fb->head_index = 0; + fb->size = fifo_size; + fb->total = 0; + + return fb; +} + +static int drbd_rs_controller(struct drbd_device *device, unsigned int sect_in) +{ + struct disk_conf *dc; + unsigned int want; /* The number of sectors we want in-flight */ + int req_sect; /* Number of sectors to request in this turn */ + int correction; /* Number of sectors more we need in-flight */ + int cps; /* correction per invocation of drbd_rs_controller() */ + int steps; /* Number of time steps to plan 
ahead */ + int curr_corr; + int max_sect; + struct fifo_buffer *plan; + + dc = rcu_dereference(device->ldev->disk_conf); + plan = rcu_dereference(device->rs_plan_s); + + steps = plan->size; /* (dc->c_plan_ahead * 10 * SLEEP_TIME) / HZ; */ + + if (device->rs_in_flight + sect_in == 0) { /* At start of resync */ + want = ((dc->resync_rate * 2 * SLEEP_TIME) / HZ) * steps; + } else { /* normal path */ + want = dc->c_fill_target ? dc->c_fill_target : + sect_in * dc->c_delay_target * HZ / (SLEEP_TIME * 10); + } + + correction = want - device->rs_in_flight - plan->total; + + /* Plan ahead */ + cps = correction / steps; + fifo_add_val(plan, cps); + plan->total += cps * steps; + + /* What we do in this step */ + curr_corr = fifo_push(plan, 0); + plan->total -= curr_corr; + + req_sect = sect_in + curr_corr; + if (req_sect < 0) + req_sect = 0; + + max_sect = (dc->c_max_rate * 2 * SLEEP_TIME) / HZ; + if (req_sect > max_sect) + req_sect = max_sect; + + /* + drbd_warn(device, "si=%u if=%d wa=%u co=%d st=%d cps=%d pl=%d cc=%d rs=%d\n", + sect_in, device->rs_in_flight, want, correction, + steps, cps, device->rs_planed, curr_corr, req_sect); + */ + + return req_sect; +} + +static int drbd_rs_number_requests(struct drbd_device *device) +{ + unsigned int sect_in; /* Number of sectors that came in since the last turn */ + int number, mxb; + + sect_in = atomic_xchg(&device->rs_sect_in, 0); + device->rs_in_flight -= sect_in; + + rcu_read_lock(); + mxb = drbd_get_max_buffers(device) / 2; + if (rcu_dereference(device->rs_plan_s)->size) { + number = drbd_rs_controller(device, sect_in) >> (BM_BLOCK_SHIFT - 9); + device->c_sync_rate = number * HZ * (BM_BLOCK_SIZE / 1024) / SLEEP_TIME; + } else { + device->c_sync_rate = rcu_dereference(device->ldev->disk_conf)->resync_rate; + number = SLEEP_TIME * device->c_sync_rate / ((BM_BLOCK_SIZE / 1024) * HZ); + } + rcu_read_unlock(); + + /* Don't have more than "max-buffers"/2 in-flight. + * Otherwise we may cause the remote site to stall on drbd_alloc_pages(), + * potentially causing a distributed deadlock on congestion during + * online-verify or (checksum-based) resync, if max-buffers, + * socket buffer sizes and resync rate settings are mis-configured. */ + + /* note that "number" is in units of "BM_BLOCK_SIZE" (which is 4k), + * mxb (as used here, and in drbd_alloc_pages on the peer) is + * "number of pages" (typically also 4k), + * but "rs_in_flight" is in "sectors" (512 Byte). */ + if (mxb - device->rs_in_flight/8 < number) + number = mxb - device->rs_in_flight/8; + + return number; +} + +static int make_resync_request(struct drbd_device *const device, int cancel) +{ + struct drbd_peer_device *const peer_device = first_peer_device(device); + struct drbd_connection *const connection = peer_device ? peer_device->connection : NULL; + unsigned long bit; + sector_t sector; + const sector_t capacity = drbd_get_capacity(device->this_bdev); + int max_bio_size; + int number, rollback_i, size; + int align, requeue = 0; + int i = 0; + + if (unlikely(cancel)) + return 0; + + if (device->rs_total == 0) { + /* empty resync? 
*/ + drbd_resync_finished(device); + return 0; + } + + if (!get_ldev(device)) { + /* Since we only need to access device->rsync a + get_ldev_if_state(device,D_FAILED) would be sufficient, but + to continue resync with a broken disk makes no sense at + all */ + drbd_err(device, "Disk broke down during resync!\n"); + return 0; + } + + max_bio_size = queue_max_hw_sectors(device->rq_queue) << 9; + number = drbd_rs_number_requests(device); + if (number <= 0) + goto requeue; + + for (i = 0; i < number; i++) { + /* Stop generating RS requests when half of the send buffer is filled, + * but notify TCP that we'd like to have more space. */ + mutex_lock(&connection->data.mutex); + if (connection->data.socket) { + struct sock *sk = connection->data.socket->sk; + int queued = sk->sk_wmem_queued; + int sndbuf = sk->sk_sndbuf; + if (queued > sndbuf / 2) { + requeue = 1; + if (sk->sk_socket) + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + } + } else + requeue = 1; + mutex_unlock(&connection->data.mutex); + if (requeue) + goto requeue; + +next_sector: + size = BM_BLOCK_SIZE; + bit = drbd_bm_find_next(device, device->bm_resync_fo); + + if (bit == DRBD_END_OF_BITMAP) { + device->bm_resync_fo = drbd_bm_bits(device); + put_ldev(device); + return 0; + } + + sector = BM_BIT_TO_SECT(bit); + + if (drbd_try_rs_begin_io(device, sector)) { + device->bm_resync_fo = bit; + goto requeue; + } + device->bm_resync_fo = bit + 1; + + if (unlikely(drbd_bm_test_bit(device, bit) == 0)) { + drbd_rs_complete_io(device, sector); + goto next_sector; + } + +#if DRBD_MAX_BIO_SIZE > BM_BLOCK_SIZE + /* try to find some adjacent bits. + * we stop if we have already the maximum req size. + * + * Additionally always align bigger requests, in order to + * be prepared for all stripe sizes of software RAIDs. + */ + align = 1; + rollback_i = i; + while (i < number) { + if (size + BM_BLOCK_SIZE > max_bio_size) + break; + + /* Be always aligned */ + if (sector & ((1<<(align+3))-1)) + break; + + /* do not cross extent boundaries */ + if (((bit+1) & BM_BLOCKS_PER_BM_EXT_MASK) == 0) + break; + /* now, is it actually dirty, after all? 
+ * caution, drbd_bm_test_bit is tri-state for some + * obscure reason; ( b == 0 ) would get the out-of-band + * only accidentally right because of the "oddly sized" + * adjustment below */ + if (drbd_bm_test_bit(device, bit+1) != 1) + break; + bit++; + size += BM_BLOCK_SIZE; + if ((BM_BLOCK_SIZE << align) <= size) + align++; + i++; + } + /* if we merged some, + * reset the offset to start the next drbd_bm_find_next from */ + if (size > BM_BLOCK_SIZE) + device->bm_resync_fo = bit + 1; +#endif + + /* adjust very last sectors, in case we are oddly sized */ + if (sector + (size>>9) > capacity) + size = (capacity-sector)<<9; + + if (device->use_csums) { + switch (read_for_csum(peer_device, sector, size)) { + case -EIO: /* Disk failure */ + put_ldev(device); + return -EIO; + case -EAGAIN: /* allocation failed, or ldev busy */ + drbd_rs_complete_io(device, sector); + device->bm_resync_fo = BM_SECT_TO_BIT(sector); + i = rollback_i; + goto requeue; + case 0: + /* everything ok */ + break; + default: + BUG(); + } + } else { + int err; + + inc_rs_pending(device); + err = drbd_send_drequest(peer_device, P_RS_DATA_REQUEST, + sector, size, ID_SYNCER); + if (err) { + drbd_err(device, "drbd_send_drequest() failed, aborting...\n"); + dec_rs_pending(device); + put_ldev(device); + return err; + } + } + } + + if (device->bm_resync_fo >= drbd_bm_bits(device)) { + /* last syncer _request_ was sent, + * but the P_RS_DATA_REPLY not yet received. sync will end (and + * next sync group will resume), as soon as we receive the last + * resync data block, and the last bit is cleared. + * until then resync "work" is "inactive" ... + */ + put_ldev(device); + return 0; + } + + requeue: + device->rs_in_flight += (i << (BM_BLOCK_SHIFT - 9)); + mod_timer(&device->resync_timer, jiffies + SLEEP_TIME); + put_ldev(device); + return 0; +} + +static int make_ov_request(struct drbd_device *device, int cancel) +{ + int number, i, size; + sector_t sector; + const sector_t capacity = drbd_get_capacity(device->this_bdev); + bool stop_sector_reached = false; + + if (unlikely(cancel)) + return 1; + + number = drbd_rs_number_requests(device); + + sector = device->ov_position; + for (i = 0; i < number; i++) { + if (sector >= capacity) + return 1; + + /* We check for "finished" only in the reply path: + * w_e_end_ov_reply(). + * We need to send at least one request out. 
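+		 * (hence the "i > 0" in the stop_sector check just below)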
*/ + stop_sector_reached = i > 0 + && verify_can_do_stop_sector(device) + && sector >= device->ov_stop_sector; + if (stop_sector_reached) + break; + + size = BM_BLOCK_SIZE; + + if (drbd_try_rs_begin_io(device, sector)) { + device->ov_position = sector; + goto requeue; + } + + if (sector + (size>>9) > capacity) + size = (capacity-sector)<<9; + + inc_rs_pending(device); + if (drbd_send_ov_request(first_peer_device(device), sector, size)) { + dec_rs_pending(device); + return 0; + } + sector += BM_SECT_PER_BIT; + } + device->ov_position = sector; + + requeue: + device->rs_in_flight += (i << (BM_BLOCK_SHIFT - 9)); + if (i == 0 || !stop_sector_reached) + mod_timer(&device->resync_timer, jiffies + SLEEP_TIME); + return 1; +} + +int w_ov_finished(struct drbd_work *w, int cancel) +{ + struct drbd_device_work *dw = + container_of(w, struct drbd_device_work, w); + struct drbd_device *device = dw->device; + kfree(dw); + ov_out_of_sync_print(device); + drbd_resync_finished(device); + + return 0; +} + +static int w_resync_finished(struct drbd_work *w, int cancel) +{ + struct drbd_device_work *dw = + container_of(w, struct drbd_device_work, w); + struct drbd_device *device = dw->device; + kfree(dw); + + drbd_resync_finished(device); + + return 0; +} + +static void ping_peer(struct drbd_device *device) +{ + struct drbd_connection *connection = first_peer_device(device)->connection; + + clear_bit(GOT_PING_ACK, &connection->flags); + request_ping(connection); + wait_event(connection->ping_wait, + test_bit(GOT_PING_ACK, &connection->flags) || device->state.conn < C_CONNECTED); +} + +int drbd_resync_finished(struct drbd_device *device) +{ + unsigned long db, dt, dbdt; + unsigned long n_oos; + union drbd_state os, ns; + struct drbd_device_work *dw; + char *khelper_cmd = NULL; + int verify_done = 0; + + /* Remove all elements from the resync LRU. Since future actions + * might set bits in the (main) bitmap, then the entries in the + * resync LRU would be wrong. */ + if (drbd_rs_del_all(device)) { + /* In case this is not possible now, most probably because + * there are P_RS_DATA_REPLY Packets lingering on the worker's + * queue (or even the read operations for those packets + * is not finished by now). Retry in 100ms. */ + + schedule_timeout_interruptible(HZ / 10); + dw = kmalloc(sizeof(struct drbd_device_work), GFP_ATOMIC); + if (dw) { + dw->w.cb = w_resync_finished; + dw->device = device; + drbd_queue_work(&first_peer_device(device)->connection->sender_work, + &dw->w); + return 1; + } + drbd_err(device, "Warn failed to drbd_rs_del_all() and to kmalloc(dw).\n"); + } + + dt = (jiffies - device->rs_start - device->rs_paused) / HZ; + if (dt <= 0) + dt = 1; + + db = device->rs_total; + /* adjust for verify start and stop sectors, respective reached position */ + if (device->state.conn == C_VERIFY_S || device->state.conn == C_VERIFY_T) + db -= device->ov_left; + + dbdt = Bit2KB(db/dt); + device->rs_paused /= HZ; + + if (!get_ldev(device)) + goto out; + + ping_peer(device); + + spin_lock_irq(&device->resource->req_lock); + os = drbd_read_state(device); + + verify_done = (os.conn == C_VERIFY_S || os.conn == C_VERIFY_T); + + /* This protects us against multiple calls (that can happen in the presence + of application IO), and against connectivity loss just before we arrive here. */ + if (os.conn <= C_CONNECTED) + goto out_unlock; + + ns = os; + ns.conn = C_CONNECTED; + + drbd_info(device, "%s done (total %lu sec; paused %lu sec; %lu K/sec)\n", + verify_done ? 
"Online verify" : "Resync", + dt + device->rs_paused, device->rs_paused, dbdt); + + n_oos = drbd_bm_total_weight(device); + + if (os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) { + if (n_oos) { + drbd_alert(device, "Online verify found %lu %dk block out of sync!\n", + n_oos, Bit2KB(1)); + khelper_cmd = "out-of-sync"; + } + } else { + D_ASSERT(device, (n_oos - device->rs_failed) == 0); + + if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T) + khelper_cmd = "after-resync-target"; + + if (device->use_csums && device->rs_total) { + const unsigned long s = device->rs_same_csum; + const unsigned long t = device->rs_total; + const int ratio = + (t == 0) ? 0 : + (t < 100000) ? ((s*100)/t) : (s/(t/100)); + drbd_info(device, "%u %% had equal checksums, eliminated: %luK; " + "transferred %luK total %luK\n", + ratio, + Bit2KB(device->rs_same_csum), + Bit2KB(device->rs_total - device->rs_same_csum), + Bit2KB(device->rs_total)); + } + } + + if (device->rs_failed) { + drbd_info(device, " %lu failed blocks\n", device->rs_failed); + + if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T) { + ns.disk = D_INCONSISTENT; + ns.pdsk = D_UP_TO_DATE; + } else { + ns.disk = D_UP_TO_DATE; + ns.pdsk = D_INCONSISTENT; + } + } else { + ns.disk = D_UP_TO_DATE; + ns.pdsk = D_UP_TO_DATE; + + if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T) { + if (device->p_uuid) { + int i; + for (i = UI_BITMAP ; i <= UI_HISTORY_END ; i++) + _drbd_uuid_set(device, i, device->p_uuid[i]); + drbd_uuid_set(device, UI_BITMAP, device->ldev->md.uuid[UI_CURRENT]); + _drbd_uuid_set(device, UI_CURRENT, device->p_uuid[UI_CURRENT]); + } else { + drbd_err(device, "device->p_uuid is NULL! BUG\n"); + } + } + + if (!(os.conn == C_VERIFY_S || os.conn == C_VERIFY_T)) { + /* for verify runs, we don't update uuids here, + * so there would be nothing to report. */ + drbd_uuid_set_bm(device, 0UL); + drbd_print_uuids(device, "updated UUIDs"); + if (device->p_uuid) { + /* Now the two UUID sets are equal, update what we + * know of the peer. */ + int i; + for (i = UI_CURRENT ; i <= UI_HISTORY_END ; i++) + device->p_uuid[i] = device->ldev->md.uuid[i]; + } + } + } + + _drbd_set_state(device, ns, CS_VERBOSE, NULL); +out_unlock: + spin_unlock_irq(&device->resource->req_lock); + put_ldev(device); +out: + device->rs_total = 0; + device->rs_failed = 0; + device->rs_paused = 0; + + /* reset start sector, if we reached end of device */ + if (verify_done && device->ov_left == 0) + device->ov_start_sector = 0; + + drbd_md_sync(device); + + if (khelper_cmd) + drbd_khelper(device, khelper_cmd); + + return 1; +} + +/* helper */ +static void move_to_net_ee_or_free(struct drbd_device *device, struct drbd_peer_request *peer_req) +{ + if (drbd_peer_req_has_active_page(peer_req)) { + /* This might happen if sendpage() has not finished */ + int i = (peer_req->i.size + PAGE_SIZE -1) >> PAGE_SHIFT; + atomic_add(i, &device->pp_in_use_by_net); + atomic_sub(i, &device->pp_in_use); + spin_lock_irq(&device->resource->req_lock); + list_add_tail(&peer_req->w.list, &device->net_ee); + spin_unlock_irq(&device->resource->req_lock); + wake_up(&drbd_pp_wait); + } else + drbd_free_peer_req(device, peer_req); +} + +/** + * w_e_end_data_req() - Worker callback, to send a P_DATA_REPLY packet in response to a P_DATA_REQUEST + * @device: DRBD device. + * @w: work object. 
+ * @cancel: The connection will be closed anyways + */ +int w_e_end_data_req(struct drbd_work *w, int cancel) +{ + struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w); + struct drbd_peer_device *peer_device = peer_req->peer_device; + struct drbd_device *device = peer_device->device; + int err; + + if (unlikely(cancel)) { + drbd_free_peer_req(device, peer_req); + dec_unacked(device); + return 0; + } + + if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) { + err = drbd_send_block(peer_device, P_DATA_REPLY, peer_req); + } else { + if (__ratelimit(&drbd_ratelimit_state)) + drbd_err(device, "Sending NegDReply. sector=%llus.\n", + (unsigned long long)peer_req->i.sector); + + err = drbd_send_ack(peer_device, P_NEG_DREPLY, peer_req); + } + + dec_unacked(device); + + move_to_net_ee_or_free(device, peer_req); + + if (unlikely(err)) + drbd_err(device, "drbd_send_block() failed\n"); + return err; +} + +/** + * w_e_end_rsdata_req() - Worker callback to send a P_RS_DATA_REPLY packet in response to a P_RS_DATA_REQUEST + * @w: work object. + * @cancel: The connection will be closed anyways + */ +int w_e_end_rsdata_req(struct drbd_work *w, int cancel) +{ + struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w); + struct drbd_peer_device *peer_device = peer_req->peer_device; + struct drbd_device *device = peer_device->device; + int err; + + if (unlikely(cancel)) { + drbd_free_peer_req(device, peer_req); + dec_unacked(device); + return 0; + } + + if (get_ldev_if_state(device, D_FAILED)) { + drbd_rs_complete_io(device, peer_req->i.sector); + put_ldev(device); + } + + if (device->state.conn == C_AHEAD) { + err = drbd_send_ack(peer_device, P_RS_CANCEL, peer_req); + } else if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) { + if (likely(device->state.pdsk >= D_INCONSISTENT)) { + inc_rs_pending(device); + err = drbd_send_block(peer_device, P_RS_DATA_REPLY, peer_req); + } else { + if (__ratelimit(&drbd_ratelimit_state)) + drbd_err(device, "Not sending RSDataReply, " + "partner DISKLESS!\n"); + err = 0; + } + } else { + if (__ratelimit(&drbd_ratelimit_state)) + drbd_err(device, "Sending NegRSDReply. sector %llus.\n", + (unsigned long long)peer_req->i.sector); + + err = drbd_send_ack(peer_device, P_NEG_RS_DREPLY, peer_req); + + /* update resync data with failure */ + drbd_rs_failed_io(device, peer_req->i.sector, peer_req->i.size); + } + + dec_unacked(device); + + move_to_net_ee_or_free(device, peer_req); + + if (unlikely(err)) + drbd_err(device, "drbd_send_block() failed\n"); + return err; +} + +int w_e_end_csum_rs_req(struct drbd_work *w, int cancel) +{ + struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w); + struct drbd_peer_device *peer_device = peer_req->peer_device; + struct drbd_device *device = peer_device->device; + struct digest_info *di; + int digest_size; + void *digest = NULL; + int err, eq = 0; + + if (unlikely(cancel)) { + drbd_free_peer_req(device, peer_req); + dec_unacked(device); + return 0; + } + + if (get_ldev(device)) { + drbd_rs_complete_io(device, peer_req->i.sector); + put_ldev(device); + } + + di = peer_req->digest; + + if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) { + /* quick hack to try to avoid a race against reconfiguration. 
+ * a real fix would be much more involved, + * introducing more locking mechanisms */ + if (peer_device->connection->csums_tfm) { + digest_size = crypto_hash_digestsize(peer_device->connection->csums_tfm); + D_ASSERT(device, digest_size == di->digest_size); + digest = kmalloc(digest_size, GFP_NOIO); + } + if (digest) { + drbd_csum_ee(peer_device->connection->csums_tfm, peer_req, digest); + eq = !memcmp(digest, di->digest, digest_size); + kfree(digest); + } + + if (eq) { + drbd_set_in_sync(device, peer_req->i.sector, peer_req->i.size); + /* rs_same_csums unit is BM_BLOCK_SIZE */ + device->rs_same_csum += peer_req->i.size >> BM_BLOCK_SHIFT; + err = drbd_send_ack(peer_device, P_RS_IS_IN_SYNC, peer_req); + } else { + inc_rs_pending(device); + peer_req->block_id = ID_SYNCER; /* By setting block_id, digest pointer becomes invalid! */ + peer_req->flags &= ~EE_HAS_DIGEST; /* This peer request no longer has a digest pointer */ + kfree(di); + err = drbd_send_block(peer_device, P_RS_DATA_REPLY, peer_req); + } + } else { + err = drbd_send_ack(peer_device, P_NEG_RS_DREPLY, peer_req); + if (__ratelimit(&drbd_ratelimit_state)) + drbd_err(device, "Sending NegDReply. I guess it gets messy.\n"); + } + + dec_unacked(device); + move_to_net_ee_or_free(device, peer_req); + + if (unlikely(err)) + drbd_err(device, "drbd_send_block/ack() failed\n"); + return err; +} + +int w_e_end_ov_req(struct drbd_work *w, int cancel) +{ + struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w); + struct drbd_peer_device *peer_device = peer_req->peer_device; + struct drbd_device *device = peer_device->device; + sector_t sector = peer_req->i.sector; + unsigned int size = peer_req->i.size; + int digest_size; + void *digest; + int err = 0; + + if (unlikely(cancel)) + goto out; + + digest_size = crypto_hash_digestsize(peer_device->connection->verify_tfm); + digest = kmalloc(digest_size, GFP_NOIO); + if (!digest) { + err = 1; /* terminate the connection in case the allocation failed */ + goto out; + } + + if (likely(!(peer_req->flags & EE_WAS_ERROR))) + drbd_csum_ee(peer_device->connection->verify_tfm, peer_req, digest); + else + memset(digest, 0, digest_size); + + /* Free e and pages before send. + * In case we block on congestion, we could otherwise run into + * some distributed deadlock, if the other side blocks on + * congestion as well, because our receiver blocks in + * drbd_alloc_pages due to pp_in_use > max_buffers. 
*/ + drbd_free_peer_req(device, peer_req); + peer_req = NULL; + inc_rs_pending(device); + err = drbd_send_drequest_csum(peer_device, sector, size, digest, digest_size, P_OV_REPLY); + if (err) + dec_rs_pending(device); + kfree(digest); + +out: + if (peer_req) + drbd_free_peer_req(device, peer_req); + dec_unacked(device); + return err; +} + +void drbd_ov_out_of_sync_found(struct drbd_device *device, sector_t sector, int size) +{ + if (device->ov_last_oos_start + device->ov_last_oos_size == sector) { + device->ov_last_oos_size += size>>9; + } else { + device->ov_last_oos_start = sector; + device->ov_last_oos_size = size>>9; + } + drbd_set_out_of_sync(device, sector, size); +} + +int w_e_end_ov_reply(struct drbd_work *w, int cancel) +{ + struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w); + struct drbd_peer_device *peer_device = peer_req->peer_device; + struct drbd_device *device = peer_device->device; + struct digest_info *di; + void *digest; + sector_t sector = peer_req->i.sector; + unsigned int size = peer_req->i.size; + int digest_size; + int err, eq = 0; + bool stop_sector_reached = false; + + if (unlikely(cancel)) { + drbd_free_peer_req(device, peer_req); + dec_unacked(device); + return 0; + } + + /* after "cancel", because after drbd_disconnect/drbd_rs_cancel_all + * the resync lru has been cleaned up already */ + if (get_ldev(device)) { + drbd_rs_complete_io(device, peer_req->i.sector); + put_ldev(device); + } + + di = peer_req->digest; + + if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) { + digest_size = crypto_hash_digestsize(peer_device->connection->verify_tfm); + digest = kmalloc(digest_size, GFP_NOIO); + if (digest) { + drbd_csum_ee(peer_device->connection->verify_tfm, peer_req, digest); + + D_ASSERT(device, digest_size == di->digest_size); + eq = !memcmp(digest, di->digest, digest_size); + kfree(digest); + } + } + + /* Free peer_req and pages before send. + * In case we block on congestion, we could otherwise run into + * some distributed deadlock, if the other side blocks on + * congestion as well, because our receiver blocks in + * drbd_alloc_pages due to pp_in_use > max_buffers. */ + drbd_free_peer_req(device, peer_req); + if (!eq) + drbd_ov_out_of_sync_found(device, sector, size); + else + ov_out_of_sync_print(device); + + err = drbd_send_ack_ex(peer_device, P_OV_RESULT, sector, size, + eq ? ID_IN_SYNC : ID_OUT_OF_SYNC); + + dec_unacked(device); + + --device->ov_left; + + /* let's advance progress step marks only for every other megabyte */ + if ((device->ov_left & 0x200) == 0x200) + drbd_advance_rs_marks(device, device->ov_left); + + stop_sector_reached = verify_can_do_stop_sector(device) && + (sector + (size>>9)) >= device->ov_stop_sector; + + if (device->ov_left == 0 || stop_sector_reached) { + ov_out_of_sync_print(device); + drbd_resync_finished(device); + } + + return err; +} + +/* FIXME + * We need to track the number of pending barrier acks, + * and to be able to wait for them. + * See also comment in drbd_adm_attach before drbd_suspend_io. 
+ */ +static int drbd_send_barrier(struct drbd_connection *connection) +{ + struct p_barrier *p; + struct drbd_socket *sock; + + sock = &connection->data; + p = conn_prepare_command(connection, sock); + if (!p) + return -EIO; + p->barrier = connection->send.current_epoch_nr; + p->pad = 0; + connection->send.current_epoch_writes = 0; + + return conn_send_command(connection, sock, P_BARRIER, sizeof(*p), NULL, 0); +} + +int w_send_write_hint(struct drbd_work *w, int cancel) +{ + struct drbd_device *device = + container_of(w, struct drbd_device, unplug_work); + struct drbd_socket *sock; + + if (cancel) + return 0; + sock = &first_peer_device(device)->connection->data; + if (!drbd_prepare_command(first_peer_device(device), sock)) + return -EIO; + return drbd_send_command(first_peer_device(device), sock, P_UNPLUG_REMOTE, 0, NULL, 0); +} + +static void re_init_if_first_write(struct drbd_connection *connection, unsigned int epoch) +{ + if (!connection->send.seen_any_write_yet) { + connection->send.seen_any_write_yet = true; + connection->send.current_epoch_nr = epoch; + connection->send.current_epoch_writes = 0; + } +} + +static void maybe_send_barrier(struct drbd_connection *connection, unsigned int epoch) +{ + /* re-init if first write on this connection */ + if (!connection->send.seen_any_write_yet) + return; + if (connection->send.current_epoch_nr != epoch) { + if (connection->send.current_epoch_writes) + drbd_send_barrier(connection); + connection->send.current_epoch_nr = epoch; + } +} + +int w_send_out_of_sync(struct drbd_work *w, int cancel) +{ + struct drbd_request *req = container_of(w, struct drbd_request, w); + struct drbd_device *device = req->device; + struct drbd_peer_device *const peer_device = first_peer_device(device); + struct drbd_connection *const connection = peer_device->connection; + int err; + + if (unlikely(cancel)) { + req_mod(req, SEND_CANCELED); + return 0; + } + req->pre_send_jif = jiffies; + + /* this time, no connection->send.current_epoch_writes++; + * If it was sent, it was the closing barrier for the last + * replicated epoch, before we went into AHEAD mode. + * No more barriers will be sent, until we leave AHEAD mode again. */ + maybe_send_barrier(connection, req->epoch); + + err = drbd_send_out_of_sync(peer_device, req); + req_mod(req, OOS_HANDED_TO_NETWORK); + + return err; +} + +/** + * w_send_dblock() - Worker callback to send a P_DATA packet in order to mirror a write request + * @w: work object. + * @cancel: The connection will be closed anyways + */ +int w_send_dblock(struct drbd_work *w, int cancel) +{ + struct drbd_request *req = container_of(w, struct drbd_request, w); + struct drbd_device *device = req->device; + struct drbd_peer_device *const peer_device = first_peer_device(device); + struct drbd_connection *connection = peer_device->connection; + int err; + + if (unlikely(cancel)) { + req_mod(req, SEND_CANCELED); + return 0; + } + req->pre_send_jif = jiffies; + + re_init_if_first_write(connection, req->epoch); + maybe_send_barrier(connection, req->epoch); + connection->send.current_epoch_writes++; + + err = drbd_send_dblock(peer_device, req); + req_mod(req, err ? SEND_FAILED : HANDED_OVER_TO_NETWORK); + + return err; +} + +/** + * w_send_read_req() - Worker callback to send a read request (P_DATA_REQUEST) packet + * @w: work object. 
+ * @cancel: The connection will be closed anyways + */ +int w_send_read_req(struct drbd_work *w, int cancel) +{ + struct drbd_request *req = container_of(w, struct drbd_request, w); + struct drbd_device *device = req->device; + struct drbd_peer_device *const peer_device = first_peer_device(device); + struct drbd_connection *connection = peer_device->connection; + int err; + + if (unlikely(cancel)) { + req_mod(req, SEND_CANCELED); + return 0; + } + req->pre_send_jif = jiffies; + + /* Even read requests may close a write epoch, + * if there was any yet. */ + maybe_send_barrier(connection, req->epoch); + + err = drbd_send_drequest(peer_device, P_DATA_REQUEST, req->i.sector, req->i.size, + (unsigned long)req); + + req_mod(req, err ? SEND_FAILED : HANDED_OVER_TO_NETWORK); + + return err; +} + +int w_restart_disk_io(struct drbd_work *w, int cancel) +{ + struct drbd_request *req = container_of(w, struct drbd_request, w); + struct drbd_device *device = req->device; + + if (bio_data_dir(req->master_bio) == WRITE && req->rq_state & RQ_IN_ACT_LOG) + drbd_al_begin_io(device, &req->i); + + drbd_req_make_private_bio(req, req->master_bio); + req->private_bio->bi_bdev = device->ldev->backing_bdev; + generic_make_request(req->private_bio); + + return 0; +} + +static int _drbd_may_sync_now(struct drbd_device *device) +{ + struct drbd_device *odev = device; + int resync_after; + + while (1) { + if (!odev->ldev || odev->state.disk == D_DISKLESS) + return 1; + rcu_read_lock(); + resync_after = rcu_dereference(odev->ldev->disk_conf)->resync_after; + rcu_read_unlock(); + if (resync_after == -1) + return 1; + odev = minor_to_device(resync_after); + if (!odev) + return 1; + if ((odev->state.conn >= C_SYNC_SOURCE && + odev->state.conn <= C_PAUSED_SYNC_T) || + odev->state.aftr_isp || odev->state.peer_isp || + odev->state.user_isp) + return 0; + } +} + +/** + * _drbd_pause_after() - Pause resync on all devices that may not resync now + * @device: DRBD device. + * + * Called from process context only (admin command and after_state_ch). + */ +static int _drbd_pause_after(struct drbd_device *device) +{ + struct drbd_device *odev; + int i, rv = 0; + + rcu_read_lock(); + idr_for_each_entry(&drbd_devices, odev, i) { + if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS) + continue; + if (!_drbd_may_sync_now(odev)) + rv |= (__drbd_set_state(_NS(odev, aftr_isp, 1), CS_HARD, NULL) + != SS_NOTHING_TO_DO); + } + rcu_read_unlock(); + + return rv; +} + +/** + * _drbd_resume_next() - Resume resync on all devices that may resync now + * @device: DRBD device. + * + * Called from process context only (admin command and worker). 
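+ * (callers serialize via global_state_lock; see resume_next_sg() and
+ * drbd_resync_after_changed() below)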
+ */ +static int _drbd_resume_next(struct drbd_device *device) +{ + struct drbd_device *odev; + int i, rv = 0; + + rcu_read_lock(); + idr_for_each_entry(&drbd_devices, odev, i) { + if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS) + continue; + if (odev->state.aftr_isp) { + if (_drbd_may_sync_now(odev)) + rv |= (__drbd_set_state(_NS(odev, aftr_isp, 0), + CS_HARD, NULL) + != SS_NOTHING_TO_DO) ; + } + } + rcu_read_unlock(); + return rv; +} + +void resume_next_sg(struct drbd_device *device) +{ + write_lock_irq(&global_state_lock); + _drbd_resume_next(device); + write_unlock_irq(&global_state_lock); +} + +void suspend_other_sg(struct drbd_device *device) +{ + write_lock_irq(&global_state_lock); + _drbd_pause_after(device); + write_unlock_irq(&global_state_lock); +} + +/* caller must hold global_state_lock */ +enum drbd_ret_code drbd_resync_after_valid(struct drbd_device *device, int o_minor) +{ + struct drbd_device *odev; + int resync_after; + + if (o_minor == -1) + return NO_ERROR; + if (o_minor < -1 || o_minor > MINORMASK) + return ERR_RESYNC_AFTER; + + /* check for loops */ + odev = minor_to_device(o_minor); + while (1) { + if (odev == device) + return ERR_RESYNC_AFTER_CYCLE; + + /* You are free to depend on diskless, non-existing, + * or not yet/no longer existing minors. + * We only reject dependency loops. + * We cannot follow the dependency chain beyond a detached or + * missing minor. + */ + if (!odev || !odev->ldev || odev->state.disk == D_DISKLESS) + return NO_ERROR; + + rcu_read_lock(); + resync_after = rcu_dereference(odev->ldev->disk_conf)->resync_after; + rcu_read_unlock(); + /* dependency chain ends here, no cycles. */ + if (resync_after == -1) + return NO_ERROR; + + /* follow the dependency chain */ + odev = minor_to_device(resync_after); + } +} + +/* caller must hold global_state_lock */ +void drbd_resync_after_changed(struct drbd_device *device) +{ + int changes; + + do { + changes = _drbd_pause_after(device); + changes |= _drbd_resume_next(device); + } while (changes); +} + +void drbd_rs_controller_reset(struct drbd_device *device) +{ + struct gendisk *disk = device->ldev->backing_bdev->bd_contains->bd_disk; + struct fifo_buffer *plan; + + atomic_set(&device->rs_sect_in, 0); + atomic_set(&device->rs_sect_ev, 0); + device->rs_in_flight = 0; + device->rs_last_events = + (int)part_stat_read(&disk->part0, sectors[0]) + + (int)part_stat_read(&disk->part0, sectors[1]); + + /* Updating the RCU protected object in place is necessary since + this function gets called from atomic context. 
+ It is valid since all other updates also lead to an completely + empty fifo */ + rcu_read_lock(); + plan = rcu_dereference(device->rs_plan_s); + plan->total = 0; + fifo_set(plan, 0); + rcu_read_unlock(); +} + +void start_resync_timer_fn(unsigned long data) +{ + struct drbd_device *device = (struct drbd_device *) data; + drbd_device_post_work(device, RS_START); +} + +static void do_start_resync(struct drbd_device *device) +{ + if (atomic_read(&device->unacked_cnt) || atomic_read(&device->rs_pending_cnt)) { + drbd_warn(device, "postponing start_resync ...\n"); + device->start_resync_timer.expires = jiffies + HZ/10; + add_timer(&device->start_resync_timer); + return; + } + + drbd_start_resync(device, C_SYNC_SOURCE); + clear_bit(AHEAD_TO_SYNC_SOURCE, &device->flags); +} + +static bool use_checksum_based_resync(struct drbd_connection *connection, struct drbd_device *device) +{ + bool csums_after_crash_only; + rcu_read_lock(); + csums_after_crash_only = rcu_dereference(connection->net_conf)->csums_after_crash_only; + rcu_read_unlock(); + return connection->agreed_pro_version >= 89 && /* supported? */ + connection->csums_tfm && /* configured? */ + (csums_after_crash_only == 0 /* use for each resync? */ + || test_bit(CRASHED_PRIMARY, &device->flags)); /* or only after Primary crash? */ +} + +/** + * drbd_start_resync() - Start the resync process + * @device: DRBD device. + * @side: Either C_SYNC_SOURCE or C_SYNC_TARGET + * + * This function might bring you directly into one of the + * C_PAUSED_SYNC_* states. + */ +void drbd_start_resync(struct drbd_device *device, enum drbd_conns side) +{ + struct drbd_peer_device *peer_device = first_peer_device(device); + struct drbd_connection *connection = peer_device ? peer_device->connection : NULL; + union drbd_state ns; + int r; + + if (device->state.conn >= C_SYNC_SOURCE && device->state.conn < C_AHEAD) { + drbd_err(device, "Resync already running!\n"); + return; + } + + if (!test_bit(B_RS_H_DONE, &device->flags)) { + if (side == C_SYNC_TARGET) { + /* Since application IO was locked out during C_WF_BITMAP_T and + C_WF_SYNC_UUID we are still unmodified. Before going to C_SYNC_TARGET + we check that we might make the data inconsistent. */ + r = drbd_khelper(device, "before-resync-target"); + r = (r >> 8) & 0xff; + if (r > 0) { + drbd_info(device, "before-resync-target handler returned %d, " + "dropping connection.\n", r); + conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD); + return; + } + } else /* C_SYNC_SOURCE */ { + r = drbd_khelper(device, "before-resync-source"); + r = (r >> 8) & 0xff; + if (r > 0) { + if (r == 3) { + drbd_info(device, "before-resync-source handler returned %d, " + "ignoring. 
Old userland tools?", r); + } else { + drbd_info(device, "before-resync-source handler returned %d, " + "dropping connection.\n", r); + conn_request_state(connection, + NS(conn, C_DISCONNECTING), CS_HARD); + return; + } + } + } + } + + if (current == connection->worker.task) { + /* The worker should not sleep waiting for state_mutex, + that can take long */ + if (!mutex_trylock(device->state_mutex)) { + set_bit(B_RS_H_DONE, &device->flags); + device->start_resync_timer.expires = jiffies + HZ/5; + add_timer(&device->start_resync_timer); + return; + } + } else { + mutex_lock(device->state_mutex); + } + clear_bit(B_RS_H_DONE, &device->flags); + + /* req_lock: serialize with drbd_send_and_submit() and others + * global_state_lock: for stable sync-after dependencies */ + spin_lock_irq(&device->resource->req_lock); + write_lock(&global_state_lock); + /* Did some connection breakage or IO error race with us? */ + if (device->state.conn < C_CONNECTED + || !get_ldev_if_state(device, D_NEGOTIATING)) { + write_unlock(&global_state_lock); + spin_unlock_irq(&device->resource->req_lock); + mutex_unlock(device->state_mutex); + return; + } + + ns = drbd_read_state(device); + + ns.aftr_isp = !_drbd_may_sync_now(device); + + ns.conn = side; + + if (side == C_SYNC_TARGET) + ns.disk = D_INCONSISTENT; + else /* side == C_SYNC_SOURCE */ + ns.pdsk = D_INCONSISTENT; + + r = __drbd_set_state(device, ns, CS_VERBOSE, NULL); + ns = drbd_read_state(device); + + if (ns.conn < C_CONNECTED) + r = SS_UNKNOWN_ERROR; + + if (r == SS_SUCCESS) { + unsigned long tw = drbd_bm_total_weight(device); + unsigned long now = jiffies; + int i; + + device->rs_failed = 0; + device->rs_paused = 0; + device->rs_same_csum = 0; + device->rs_last_sect_ev = 0; + device->rs_total = tw; + device->rs_start = now; + for (i = 0; i < DRBD_SYNC_MARKS; i++) { + device->rs_mark_left[i] = tw; + device->rs_mark_time[i] = now; + } + _drbd_pause_after(device); + /* Forget potentially stale cached per resync extent bit-counts. + * Open coded drbd_rs_cancel_all(device), we already have IRQs + * disabled, and know the disk state is ok. */ + spin_lock(&device->al_lock); + lc_reset(device->resync); + device->resync_locked = 0; + device->resync_wenr = LC_FREE; + spin_unlock(&device->al_lock); + } + write_unlock(&global_state_lock); + spin_unlock_irq(&device->resource->req_lock); + + if (r == SS_SUCCESS) { + wake_up(&device->al_wait); /* for lc_reset() above */ + /* reset rs_last_bcast when a resync or verify is started, + * to deal with potential jiffies wrap. */ + device->rs_last_bcast = jiffies - HZ; + + drbd_info(device, "Began resync as %s (will sync %lu KB [%lu bits set]).\n", + drbd_conn_str(ns.conn), + (unsigned long) device->rs_total << (BM_BLOCK_SHIFT-10), + (unsigned long) device->rs_total); + if (side == C_SYNC_TARGET) { + device->bm_resync_fo = 0; + device->use_csums = use_checksum_based_resync(connection, device); + } else { + device->use_csums = 0; + } + + /* Since protocol 96, we must serialize drbd_gen_and_send_sync_uuid + * with w_send_oos, or the sync target will get confused as to + * how much bits to resync. We cannot do that always, because for an + * empty resync and protocol < 95, we need to do it here, as we call + * drbd_resync_finished from here in that case. + * We drbd_gen_and_send_sync_uuid here for protocol < 96, + * and from after_state_ch otherwise. 
*/ + if (side == C_SYNC_SOURCE && connection->agreed_pro_version < 96) + drbd_gen_and_send_sync_uuid(peer_device); + + if (connection->agreed_pro_version < 95 && device->rs_total == 0) { + /* This still has a race (about when exactly the peers + * detect connection loss) that can lead to a full sync + * on next handshake. In 8.3.9 we fixed this with explicit + * resync-finished notifications, but the fix + * introduces a protocol change. Sleeping for some + * time longer than the ping interval + timeout on the + * SyncSource, to give the SyncTarget the chance to + * detect connection loss, then waiting for a ping + * response (implicit in drbd_resync_finished) reduces + * the race considerably, but does not solve it. */ + if (side == C_SYNC_SOURCE) { + struct net_conf *nc; + int timeo; + + rcu_read_lock(); + nc = rcu_dereference(connection->net_conf); + timeo = nc->ping_int * HZ + nc->ping_timeo * HZ / 9; + rcu_read_unlock(); + schedule_timeout_interruptible(timeo); + } + drbd_resync_finished(device); + } + + drbd_rs_controller_reset(device); + /* ns.conn may already be != device->state.conn, + * we may have been paused in between, or become paused until + * the timer triggers. + * No matter, that is handled in resync_timer_fn() */ + if (ns.conn == C_SYNC_TARGET) + mod_timer(&device->resync_timer, jiffies); + + drbd_md_sync(device); + } + put_ldev(device); + mutex_unlock(device->state_mutex); +} + +static void update_on_disk_bitmap(struct drbd_device *device, bool resync_done) +{ + struct sib_info sib = { .sib_reason = SIB_SYNC_PROGRESS, }; + device->rs_last_bcast = jiffies; + + if (!get_ldev(device)) + return; + + drbd_bm_write_lazy(device, 0); + if (resync_done && is_sync_state(device->state.conn)) + drbd_resync_finished(device); + + drbd_bcast_event(device, &sib); + /* update timestamp, in case it took a while to write out stuff */ + device->rs_last_bcast = jiffies; + put_ldev(device); +} + +static void drbd_ldev_destroy(struct drbd_device *device) +{ + lc_destroy(device->resync); + device->resync = NULL; + lc_destroy(device->act_log); + device->act_log = NULL; + + __acquire(local); + drbd_free_ldev(device->ldev); + device->ldev = NULL; + __release(local); + + clear_bit(GOING_DISKLESS, &device->flags); + wake_up(&device->misc_wait); +} + +static void go_diskless(struct drbd_device *device) +{ + D_ASSERT(device, device->state.disk == D_FAILED); + /* we cannot assert local_cnt == 0 here, as get_ldev_if_state will + * inc/dec it frequently. Once we are D_DISKLESS, no one will touch + * the protected members anymore, though, so once put_ldev reaches zero + * again, it will be safe to free them. */ + + /* Try to write changed bitmap pages, read errors may have just + * set some bits outside the area covered by the activity log. + * + * If we have an IO error during the bitmap writeout, + * we will want a full sync next time, just in case. + * (Do we want a specific meta data flag for this?) + * + * If that does not make it to stable storage either, + * we cannot do anything about that anymore. + * + * We still need to check if both bitmap and ldev are present, we may + * end up here after a failed attach, before ldev was even assigned. + */ + if (device->bitmap && device->ldev) { + /* An interrupted resync or similar is allowed to recounts bits + * while we detach. + * Any modifications would not be expected anymore, though. 
+ */ + if (drbd_bitmap_io_from_worker(device, drbd_bm_write, + "detach", BM_LOCKED_TEST_ALLOWED)) { + if (test_bit(WAS_READ_ERROR, &device->flags)) { + drbd_md_set_flag(device, MDF_FULL_SYNC); + drbd_md_sync(device); + } + } + } + + drbd_force_state(device, NS(disk, D_DISKLESS)); +} + +static int do_md_sync(struct drbd_device *device) +{ + drbd_warn(device, "md_sync_timer expired! Worker calls drbd_md_sync().\n"); + drbd_md_sync(device); + return 0; +} + +/* only called from drbd_worker thread, no locking */ +void __update_timing_details( + struct drbd_thread_timing_details *tdp, + unsigned int *cb_nr, + void *cb, + const char *fn, const unsigned int line) +{ + unsigned int i = *cb_nr % DRBD_THREAD_DETAILS_HIST; + struct drbd_thread_timing_details *td = tdp + i; + + td->start_jif = jiffies; + td->cb_addr = cb; + td->caller_fn = fn; + td->line = line; + td->cb_nr = *cb_nr; + + i = (i+1) % DRBD_THREAD_DETAILS_HIST; + td = tdp + i; + memset(td, 0, sizeof(*td)); + + ++(*cb_nr); +} + +static void do_device_work(struct drbd_device *device, const unsigned long todo) +{ + if (test_bit(MD_SYNC, &todo)) + do_md_sync(device); + if (test_bit(RS_DONE, &todo) || + test_bit(RS_PROGRESS, &todo)) + update_on_disk_bitmap(device, test_bit(RS_DONE, &todo)); + if (test_bit(GO_DISKLESS, &todo)) + go_diskless(device); + if (test_bit(DESTROY_DISK, &todo)) + drbd_ldev_destroy(device); + if (test_bit(RS_START, &todo)) + do_start_resync(device); +} + +#define DRBD_DEVICE_WORK_MASK \ + ((1UL << GO_DISKLESS) \ + |(1UL << DESTROY_DISK) \ + |(1UL << MD_SYNC) \ + |(1UL << RS_START) \ + |(1UL << RS_PROGRESS) \ + |(1UL << RS_DONE) \ + ) + +static unsigned long get_work_bits(unsigned long *flags) +{ + unsigned long old, new; + do { + old = *flags; + new = old & ~DRBD_DEVICE_WORK_MASK; + } while (cmpxchg(flags, old, new) != old); + return old & DRBD_DEVICE_WORK_MASK; +} + +static void do_unqueued_work(struct drbd_connection *connection) +{ + struct drbd_peer_device *peer_device; + int vnr; + + rcu_read_lock(); + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { + struct drbd_device *device = peer_device->device; + unsigned long todo = get_work_bits(&device->flags); + if (!todo) + continue; + + kref_get(&device->kref); + rcu_read_unlock(); + do_device_work(device, todo); + kref_put(&device->kref, drbd_destroy_device); + rcu_read_lock(); + } + rcu_read_unlock(); +} + +static bool dequeue_work_batch(struct drbd_work_queue *queue, struct list_head *work_list) +{ + spin_lock_irq(&queue->q_lock); + list_splice_tail_init(&queue->q, work_list); + spin_unlock_irq(&queue->q_lock); + return !list_empty(work_list); +} + +static void wait_for_work(struct drbd_connection *connection, struct list_head *work_list) +{ + DEFINE_WAIT(wait); + struct net_conf *nc; + int uncork, cork; + + dequeue_work_batch(&connection->sender_work, work_list); + if (!list_empty(work_list)) + return; + + /* Still nothing to do? + * Maybe we still need to close the current epoch, + * even if no new requests are queued yet. + * + * Also, poke TCP, just in case. + * Then wait for new work (or signal). */ + rcu_read_lock(); + nc = rcu_dereference(connection->net_conf); + uncork = nc ? 
nc->tcp_cork : 0; + rcu_read_unlock(); + if (uncork) { + mutex_lock(&connection->data.mutex); + if (connection->data.socket) + drbd_tcp_uncork(connection->data.socket); + mutex_unlock(&connection->data.mutex); + } + + for (;;) { + int send_barrier; + prepare_to_wait(&connection->sender_work.q_wait, &wait, TASK_INTERRUPTIBLE); + spin_lock_irq(&connection->resource->req_lock); + spin_lock(&connection->sender_work.q_lock); /* FIXME get rid of this one? */ + if (!list_empty(&connection->sender_work.q)) + list_splice_tail_init(&connection->sender_work.q, work_list); + spin_unlock(&connection->sender_work.q_lock); /* FIXME get rid of this one? */ + if (!list_empty(work_list) || signal_pending(current)) { + spin_unlock_irq(&connection->resource->req_lock); + break; + } + + /* We found nothing new to do, no to-be-communicated request, + * no other work item. We may still need to close the last + * epoch. Next incoming request epoch will be connection -> + * current transfer log epoch number. If that is different + * from the epoch of the last request we communicated, it is + * safe to send the epoch separating barrier now. + */ + send_barrier = + atomic_read(&connection->current_tle_nr) != + connection->send.current_epoch_nr; + spin_unlock_irq(&connection->resource->req_lock); + + if (send_barrier) + maybe_send_barrier(connection, + connection->send.current_epoch_nr + 1); + + if (test_bit(DEVICE_WORK_PENDING, &connection->flags)) + break; + + /* drbd_send() may have called flush_signals() */ + if (get_t_state(&connection->worker) != RUNNING) + break; + + schedule(); + /* may be woken up for other things but new work, too, + * e.g. if the current epoch got closed. + * In which case we send the barrier above. */ + } + finish_wait(&connection->sender_work.q_wait, &wait); + + /* someone may have changed the config while we have been waiting above. */ + rcu_read_lock(); + nc = rcu_dereference(connection->net_conf); + cork = nc ? 
nc->tcp_cork : 0; + rcu_read_unlock(); + mutex_lock(&connection->data.mutex); + if (connection->data.socket) { + if (cork) + drbd_tcp_cork(connection->data.socket); + else if (!uncork) + drbd_tcp_uncork(connection->data.socket); + } + mutex_unlock(&connection->data.mutex); +} + +int drbd_worker(struct drbd_thread *thi) +{ + struct drbd_connection *connection = thi->connection; + struct drbd_work *w = NULL; + struct drbd_peer_device *peer_device; + LIST_HEAD(work_list); + int vnr; + + while (get_t_state(thi) == RUNNING) { + drbd_thread_current_set_cpu(thi); + + if (list_empty(&work_list)) { + update_worker_timing_details(connection, wait_for_work); + wait_for_work(connection, &work_list); + } + + if (test_and_clear_bit(DEVICE_WORK_PENDING, &connection->flags)) { + update_worker_timing_details(connection, do_unqueued_work); + do_unqueued_work(connection); + } + + if (signal_pending(current)) { + flush_signals(current); + if (get_t_state(thi) == RUNNING) { + drbd_warn(connection, "Worker got an unexpected signal\n"); + continue; + } + break; + } + + if (get_t_state(thi) != RUNNING) + break; + + if (!list_empty(&work_list)) { + w = list_first_entry(&work_list, struct drbd_work, list); + list_del_init(&w->list); + update_worker_timing_details(connection, w->cb); + if (w->cb(w, connection->cstate < C_WF_REPORT_PARAMS) == 0) + continue; + if (connection->cstate >= C_WF_REPORT_PARAMS) + conn_request_state(connection, NS(conn, C_NETWORK_FAILURE), CS_HARD); + } + } + + do { + if (test_and_clear_bit(DEVICE_WORK_PENDING, &connection->flags)) { + update_worker_timing_details(connection, do_unqueued_work); + do_unqueued_work(connection); + } + if (!list_empty(&work_list)) { + w = list_first_entry(&work_list, struct drbd_work, list); + list_del_init(&w->list); + update_worker_timing_details(connection, w->cb); + w->cb(w, 1); + } else + dequeue_work_batch(&connection->sender_work, &work_list); + } while (!list_empty(&work_list) || test_bit(DEVICE_WORK_PENDING, &connection->flags)); + + rcu_read_lock(); + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { + struct drbd_device *device = peer_device->device; + D_ASSERT(device, device->state.disk == D_DISKLESS && device->state.conn == C_STANDALONE); + kref_get(&device->kref); + rcu_read_unlock(); + drbd_device_cleanup(device); + kref_put(&device->kref, drbd_destroy_device); + rcu_read_lock(); + } + rcu_read_unlock(); + + return 0; +} |
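
A note on get_work_bits() above: it moves all DRBD_DEVICE_WORK_MASK bits out of device->flags in a single atomic step, so a bit set concurrently by another context is either harvested in this pass or left intact for the next one, but never lost. The following user-space sketch shows the same compare-and-swap loop with C11 atomics instead of the kernel's cmpxchg(); WORK_MASK, harvest_work_bits() and the example values are illustrative, not DRBD identifiers.

/* Minimal user-space analog of the get_work_bits() pattern: atomically
 * transfer the pending work bits out of a shared flags word. */
#include <stdatomic.h>
#include <stdio.h>

#define WORK_MASK 0x3fUL	/* six hypothetical work bits */

static unsigned long harvest_work_bits(_Atomic unsigned long *flags)
{
	unsigned long old = atomic_load(flags);
	unsigned long new;

	/* Retry until we install "old with the work bits cleared"; if another
	 * thread changed *flags meanwhile, old is reloaded and we try again. */
	do {
		new = old & ~WORK_MASK;
	} while (!atomic_compare_exchange_weak(flags, &old, new));

	return old & WORK_MASK;	/* bits this caller is now responsible for */
}

int main(void)
{
	_Atomic unsigned long flags = (1UL << 0) | (1UL << 2) | (1UL << 8);

	printf("harvested 0x%lx, left 0x%lx\n",
	       harvest_work_bits(&flags), atomic_load(&flags));
	/* prints: harvested 0x5, left 0x100 */
	return 0;
}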
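wait_for_work() also toggles corking on the data socket: it uncorks before going to sleep, so whatever is already queued actually leaves the box, and corks again once a new batch is about to be sent. Assuming drbd_tcp_cork()/drbd_tcp_uncork() toggle the Linux TCP_CORK socket option (an inference from their names, not from code shown in this hunk), the batching idea looks roughly like this in user space; set_cork() and send_batch() are illustrative helpers, and socket setup plus error handling are omitted.

/* Sketch of cork-based batching: hold back partial segments while a burst
 * of small writes is in flight, release them when the sender goes idle. */
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <unistd.h>

static int set_cork(int fd, int on)		/* illustrative helper */
{
	return setsockopt(fd, IPPROTO_TCP, TCP_CORK, &on, sizeof(on));
}

static void send_batch(int fd, const struct iovec *pkt, int n)
{
	set_cork(fd, 1);			/* batch: coalesce small writes */
	for (int i = 0; i < n; i++)
		(void)write(fd, pkt[i].iov_base, pkt[i].iov_len);
	set_cork(fd, 0);			/* idle: push out the tail */
}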
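Finally, the resync-start code earlier in this hunk seeds a small ring of (bits left, timestamp) marks (rs_mark_left[]/rs_mark_time[]), all pointing at the moment the resync began; progress reporting can later derive a rate by comparing the newest mark against an older one. Below is a minimal sketch of that arithmetic, assuming the usual 4 KiB-per-bit bitmap granularity implied by the "rs_total << (BM_BLOCK_SHIFT-10)" conversion in the resync start message; struct sync_mark and sync_speed_kibps() are illustrative names, not DRBD API.

/* Sketch: estimate resync throughput from two (bits-left, timestamp) marks,
 * the way a progress display could use the rs_mark_left[]/rs_mark_time[]
 * ring seeded above.  Names and numbers are illustrative only. */
#include <stdio.h>

#define KIB_PER_BIT	4UL	/* assumes BM_BLOCK_SHIFT == 12, i.e. 4 KiB per bitmap bit */

struct sync_mark {
	unsigned long bits_left;	/* out-of-sync bits at that point in time */
	unsigned long t_ms;		/* timestamp, milliseconds */
};

static unsigned long sync_speed_kibps(const struct sync_mark *older,
				      const struct sync_mark *newer)
{
	unsigned long dt_ms    = newer->t_ms - older->t_ms;
	unsigned long done_kib = (older->bits_left - newer->bits_left) * KIB_PER_BIT;

	return dt_ms ? done_kib * 1000 / dt_ms : 0;
}

int main(void)
{
	struct sync_mark then = { .bits_left = 1000000, .t_ms = 0 };
	struct sync_mark now  = { .bits_left =  900000, .t_ms = 8000 };

	/* 100000 bits * 4 KiB over 8 s -> 50000 KiB/s */
	printf("%lu KiB/s\n", sync_speed_kibps(&then, &now));
	return 0;
}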