summaryrefslogtreecommitdiff
path: root/drivers/md
diff options
context:
space:
mode:
authorAndré Fabian Silva Delgado <emulatorman@parabola.nu>2015-08-05 17:04:01 -0300
committerAndré Fabian Silva Delgado <emulatorman@parabola.nu>2015-08-05 17:04:01 -0300
commit57f0f512b273f60d52568b8c6b77e17f5636edc0 (patch)
tree5e910f0e82173f4ef4f51111366a3f1299037a7b /drivers/md
Initial import
Diffstat (limited to 'drivers/md')
-rw-r--r--drivers/md/Kconfig478
-rw-r--r--drivers/md/Makefile63
-rw-r--r--drivers/md/bcache/Kconfig26
-rw-r--r--drivers/md/bcache/Makefile8
-rw-r--r--drivers/md/bcache/alloc.c696
-rw-r--r--drivers/md/bcache/bcache.h946
-rw-r--r--drivers/md/bcache/bset.c1331
-rw-r--r--drivers/md/bcache/bset.h566
-rw-r--r--drivers/md/bcache/btree.c2530
-rw-r--r--drivers/md/bcache/btree.h310
-rw-r--r--drivers/md/bcache/closure.c222
-rw-r--r--drivers/md/bcache/closure.h386
-rw-r--r--drivers/md/bcache/debug.c252
-rw-r--r--drivers/md/bcache/debug.h34
-rw-r--r--drivers/md/bcache/extents.c625
-rw-r--r--drivers/md/bcache/extents.h14
-rw-r--r--drivers/md/bcache/io.c243
-rw-r--r--drivers/md/bcache/journal.c821
-rw-r--r--drivers/md/bcache/journal.h179
-rw-r--r--drivers/md/bcache/movinggc.c256
-rw-r--r--drivers/md/bcache/request.c1149
-rw-r--r--drivers/md/bcache/request.h43
-rw-r--r--drivers/md/bcache/stats.c241
-rw-r--r--drivers/md/bcache/stats.h61
-rw-r--r--drivers/md/bcache/super.c2120
-rw-r--r--drivers/md/bcache/sysfs.c898
-rw-r--r--drivers/md/bcache/sysfs.h110
-rw-r--r--drivers/md/bcache/trace.c52
-rw-r--r--drivers/md/bcache/util.c381
-rw-r--r--drivers/md/bcache/util.h588
-rw-r--r--drivers/md/bcache/writeback.c513
-rw-r--r--drivers/md/bcache/writeback.h91
-rw-r--r--drivers/md/bitmap.c2442
-rw-r--r--drivers/md/bitmap.h269
-rw-r--r--drivers/md/dm-bio-prison.c396
-rw-r--r--drivers/md/dm-bio-prison.h125
-rw-r--r--drivers/md/dm-bio-record.h40
-rw-r--r--drivers/md/dm-bufio.c1925
-rw-r--r--drivers/md/dm-bufio.h132
-rw-r--r--drivers/md/dm-builtin.c48
-rw-r--r--drivers/md/dm-cache-block-types.h54
-rw-r--r--drivers/md/dm-cache-metadata.c1389
-rw-r--r--drivers/md/dm-cache-metadata.h138
-rw-r--r--drivers/md/dm-cache-policy-cleaner.c468
-rw-r--r--drivers/md/dm-cache-policy-internal.h132
-rw-r--r--drivers/md/dm-cache-policy-mq.c1491
-rw-r--r--drivers/md/dm-cache-policy.c173
-rw-r--r--drivers/md/dm-cache-policy.h262
-rw-r--r--drivers/md/dm-cache-target.c3407
-rw-r--r--drivers/md/dm-crypt.c2080
-rw-r--r--drivers/md/dm-delay.c373
-rw-r--r--drivers/md/dm-era-target.c1747
-rw-r--r--drivers/md/dm-exception-store.c290
-rw-r--r--drivers/md/dm-exception-store.h227
-rw-r--r--drivers/md/dm-flakey.c447
-rw-r--r--drivers/md/dm-io.c540
-rw-r--r--drivers/md/dm-ioctl.c1958
-rw-r--r--drivers/md/dm-kcopyd.c884
-rw-r--r--drivers/md/dm-linear.c182
-rw-r--r--drivers/md/dm-log-userspace-base.c936
-rw-r--r--drivers/md/dm-log-userspace-transfer.c287
-rw-r--r--drivers/md/dm-log-userspace-transfer.h18
-rw-r--r--drivers/md/dm-log-writes.c825
-rw-r--r--drivers/md/dm-log.c888
-rw-r--r--drivers/md/dm-mpath.c1793
-rw-r--r--drivers/md/dm-mpath.h22
-rw-r--r--drivers/md/dm-path-selector.c140
-rw-r--r--drivers/md/dm-path-selector.h97
-rw-r--r--drivers/md/dm-queue-length.c264
-rw-r--r--drivers/md/dm-raid.c1748
-rw-r--r--drivers/md/dm-raid1.c1458
-rw-r--r--drivers/md/dm-region-hash.c724
-rw-r--r--drivers/md/dm-round-robin.c219
-rw-r--r--drivers/md/dm-service-time.c343
-rw-r--r--drivers/md/dm-snap-persistent.c952
-rw-r--r--drivers/md/dm-snap-transient.c153
-rw-r--r--drivers/md/dm-snap.c2486
-rw-r--r--drivers/md/dm-stats.c983
-rw-r--r--drivers/md/dm-stats.h40
-rw-r--r--drivers/md/dm-stripe.c465
-rw-r--r--drivers/md/dm-switch.c591
-rw-r--r--drivers/md/dm-sysfs.c144
-rw-r--r--drivers/md/dm-table.c1707
-rw-r--r--drivers/md/dm-target.c173
-rw-r--r--drivers/md/dm-thin-metadata.c1807
-rw-r--r--drivers/md/dm-thin-metadata.h221
-rw-r--r--drivers/md/dm-thin.c4107
-rw-r--r--drivers/md/dm-uevent.c219
-rw-r--r--drivers/md/dm-uevent.h59
-rw-r--r--drivers/md/dm-verity.c1026
-rw-r--r--drivers/md/dm-zero.c84
-rw-r--r--drivers/md/dm.c3650
-rw-r--r--drivers/md/dm.h244
-rw-r--r--drivers/md/faulty.c371
-rw-r--r--drivers/md/linear.c360
-rw-r--r--drivers/md/linear.h15
-rw-r--r--drivers/md/md-cluster.c965
-rw-r--r--drivers/md/md-cluster.h29
-rw-r--r--drivers/md/md.c9037
-rw-r--r--drivers/md/md.h696
-rw-r--r--drivers/md/multipath.c544
-rw-r--r--drivers/md/multipath.h31
-rw-r--r--drivers/md/persistent-data/Kconfig18
-rw-r--r--drivers/md/persistent-data/Makefile12
-rw-r--r--drivers/md/persistent-data/dm-array.c821
-rw-r--r--drivers/md/persistent-data/dm-array.h166
-rw-r--r--drivers/md/persistent-data/dm-bitset.c171
-rw-r--r--drivers/md/persistent-data/dm-bitset.h166
-rw-r--r--drivers/md/persistent-data/dm-block-manager.c636
-rw-r--r--drivers/md/persistent-data/dm-block-manager.h133
-rw-r--r--drivers/md/persistent-data/dm-btree-internal.h141
-rw-r--r--drivers/md/persistent-data/dm-btree-remove.c592
-rw-r--r--drivers/md/persistent-data/dm-btree-spine.c251
-rw-r--r--drivers/md/persistent-data/dm-btree.c892
-rw-r--r--drivers/md/persistent-data/dm-btree.h162
-rw-r--r--drivers/md/persistent-data/dm-persistent-data-internal.h19
-rw-r--r--drivers/md/persistent-data/dm-space-map-common.c751
-rw-r--r--drivers/md/persistent-data/dm-space-map-common.h127
-rw-r--r--drivers/md/persistent-data/dm-space-map-disk.c305
-rw-r--r--drivers/md/persistent-data/dm-space-map-disk.h25
-rw-r--r--drivers/md/persistent-data/dm-space-map-metadata.c818
-rw-r--r--drivers/md/persistent-data/dm-space-map-metadata.h44
-rw-r--r--drivers/md/persistent-data/dm-space-map.h157
-rw-r--r--drivers/md/persistent-data/dm-transaction-manager.c455
-rw-r--r--drivers/md/persistent-data/dm-transaction-manager.h137
-rw-r--r--drivers/md/raid0.c749
-rw-r--r--drivers/md/raid0.h19
-rw-r--r--drivers/md/raid1.c3197
-rw-r--r--drivers/md/raid1.h173
-rw-r--r--drivers/md/raid10.c4731
-rw-r--r--drivers/md/raid10.h153
-rw-r--r--drivers/md/raid5.c7833
-rw-r--r--drivers/md/raid5.h607
133 files changed, 103234 insertions, 0 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
new file mode 100644
index 000000000..edcf4ab66
--- /dev/null
+++ b/drivers/md/Kconfig
@@ -0,0 +1,478 @@
+#
+# Block device driver configuration
+#
+
+menuconfig MD
+ bool "Multiple devices driver support (RAID and LVM)"
+ depends on BLOCK
+ select SRCU
+ help
+ Support multiple physical spindles through a single logical device.
+ Required for RAID and logical volume management.
+
+if MD
+
+config BLK_DEV_MD
+ tristate "RAID support"
+ ---help---
+ This driver lets you combine several hard disk partitions into one
+ logical block device. This can be used to simply append one
+ partition to another one or to combine several redundant hard disks
+ into a RAID1/4/5 device so as to provide protection against hard
+ disk failures. This is called "Software RAID" since the combining of
+ the partitions is done by the kernel. "Hardware RAID" means that the
+ combining is done by a dedicated controller; if you have such a
+ controller, you do not need to say Y here.
+
+ More information about Software RAID on Linux is contained in the
+ Software RAID mini-HOWTO, available from
+ <http://www.tldp.org/docs.html#howto>. There you will also learn
+ where to get the supporting user space utilities raidtools.
+
+ If unsure, say N.
+
+config MD_AUTODETECT
+ bool "Autodetect RAID arrays during kernel boot"
+ depends on BLK_DEV_MD=y
+ default y
+ ---help---
+ If you say Y here, then the kernel will try to autodetect raid
+ arrays as part of its boot process.
+
+ If you don't use raid and say Y, this autodetection can cause
+ a several-second delay in the boot time due to various
+ synchronisation steps that are part of this step.
+
+ If unsure, say Y.
+
+config MD_LINEAR
+ tristate "Linear (append) mode"
+ depends on BLK_DEV_MD
+ ---help---
+ If you say Y here, then your multiple devices driver will be able to
+ use the so-called linear mode, i.e. it will combine the hard disk
+ partitions by simply appending one to the other.
+
+ To compile this as a module, choose M here: the module
+ will be called linear.
+
+ If unsure, say Y.
+
+config MD_RAID0
+ tristate "RAID-0 (striping) mode"
+ depends on BLK_DEV_MD
+ ---help---
+ If you say Y here, then your multiple devices driver will be able to
+ use the so-called raid0 mode, i.e. it will combine the hard disk
+ partitions into one logical device in such a fashion as to fill them
+ up evenly, one chunk here and one chunk there. This will increase
+ the throughput rate if the partitions reside on distinct disks.
+
+ Information about Software RAID on Linux is contained in the
+ Software-RAID mini-HOWTO, available from
+ <http://www.tldp.org/docs.html#howto>. There you will also
+ learn where to get the supporting user space utilities raidtools.
+
+ To compile this as a module, choose M here: the module
+ will be called raid0.
+
+ If unsure, say Y.
+
+config MD_RAID1
+ tristate "RAID-1 (mirroring) mode"
+ depends on BLK_DEV_MD
+ ---help---
+ A RAID-1 set consists of several disk drives which are exact copies
+ of each other. In the event of a mirror failure, the RAID driver
+ will continue to use the operational mirrors in the set, providing
+ an error free MD (multiple device) to the higher levels of the
+ kernel. In a set with N drives, the available space is the capacity
+ of a single drive, and the set protects against a failure of (N - 1)
+ drives.
+
+ Information about Software RAID on Linux is contained in the
+ Software-RAID mini-HOWTO, available from
+ <http://www.tldp.org/docs.html#howto>. There you will also
+ learn where to get the supporting user space utilities raidtools.
+
+ If you want to use such a RAID-1 set, say Y. To compile this code
+ as a module, choose M here: the module will be called raid1.
+
+ If unsure, say Y.
+
+config MD_RAID10
+ tristate "RAID-10 (mirrored striping) mode"
+ depends on BLK_DEV_MD
+ ---help---
+ RAID-10 provides a combination of striping (RAID-0) and
+ mirroring (RAID-1) with easier configuration and more flexible
+ layout.
+ Unlike RAID-0, but like RAID-1, RAID-10 requires all devices to
+ be the same size (or at least, only as much as the smallest device
+ will be used).
+ RAID-10 provides a variety of layouts that provide different levels
+ of redundancy and performance.
+
+ RAID-10 requires mdadm-1.7.0 or later, available at:
+
+ ftp://ftp.kernel.org/pub/linux/utils/raid/mdadm/
+
+ If unsure, say Y.
+
+config MD_RAID456
+ tristate "RAID-4/RAID-5/RAID-6 mode"
+ depends on BLK_DEV_MD
+ select RAID6_PQ
+ select ASYNC_MEMCPY
+ select ASYNC_XOR
+ select ASYNC_PQ
+ select ASYNC_RAID6_RECOV
+ ---help---
+ A RAID-5 set of N drives with a capacity of C MB per drive provides
+ the capacity of C * (N - 1) MB, and protects against a failure
+ of a single drive. For a given sector (row) number, (N - 1) drives
+ contain data sectors, and one drive contains the parity protection.
+ For a RAID-4 set, the parity blocks are present on a single drive,
+ while a RAID-5 set distributes the parity across the drives in one
+ of the available parity distribution methods.
+
+ A RAID-6 set of N drives with a capacity of C MB per drive
+ provides the capacity of C * (N - 2) MB, and protects
+ against a failure of any two drives. For a given sector
+ (row) number, (N - 2) drives contain data sectors, and two
+ drives contains two independent redundancy syndromes. Like
+ RAID-5, RAID-6 distributes the syndromes across the drives
+ in one of the available parity distribution methods.
+
+ Information about Software RAID on Linux is contained in the
+ Software-RAID mini-HOWTO, available from
+ <http://www.tldp.org/docs.html#howto>. There you will also
+ learn where to get the supporting user space utilities raidtools.
+
+ If you want to use such a RAID-4/RAID-5/RAID-6 set, say Y. To
+ compile this code as a module, choose M here: the module
+ will be called raid456.
+
+ If unsure, say Y.
+
+config MD_MULTIPATH
+ tristate "Multipath I/O support"
+ depends on BLK_DEV_MD
+ help
+ MD_MULTIPATH provides a simple multi-path personality for use
+ the MD framework. It is not under active development. New
+ projects should consider using DM_MULTIPATH which has more
+ features and more testing.
+
+ If unsure, say N.
+
+config MD_FAULTY
+ tristate "Faulty test module for MD"
+ depends on BLK_DEV_MD
+ help
+ The "faulty" module allows for a block device that occasionally returns
+ read or write errors. It is useful for testing.
+
+ In unsure, say N.
+
+
+config MD_CLUSTER
+ tristate "Cluster Support for MD (EXPERIMENTAL)"
+ depends on BLK_DEV_MD
+ depends on DLM
+ default n
+ ---help---
+ Clustering support for MD devices. This enables locking and
+ synchronization across multiple systems on the cluster, so all
+ nodes in the cluster can access the MD devices simultaneously.
+
+ This brings the redundancy (and uptime) of RAID levels across the
+ nodes of the cluster.
+
+ If unsure, say N.
+
+source "drivers/md/bcache/Kconfig"
+
+config BLK_DEV_DM_BUILTIN
+ bool
+
+config BLK_DEV_DM
+ tristate "Device mapper support"
+ select BLK_DEV_DM_BUILTIN
+ ---help---
+ Device-mapper is a low level volume manager. It works by allowing
+ people to specify mappings for ranges of logical sectors. Various
+ mapping types are available, in addition people may write their own
+ modules containing custom mappings if they wish.
+
+ Higher level volume managers such as LVM2 use this driver.
+
+ To compile this as a module, choose M here: the module will be
+ called dm-mod.
+
+ If unsure, say N.
+
+config DM_MQ_DEFAULT
+ bool "request-based DM: use blk-mq I/O path by default"
+ depends on BLK_DEV_DM
+ ---help---
+ This option enables the blk-mq based I/O path for request-based
+ DM devices by default. With the option the dm_mod.use_blk_mq
+ module/boot option defaults to Y, without it to N, but it can
+ still be overriden either way.
+
+ If unsure say N.
+
+config DM_DEBUG
+ bool "Device mapper debugging support"
+ depends on BLK_DEV_DM
+ ---help---
+ Enable this for messages that may help debug device-mapper problems.
+
+ If unsure, say N.
+
+config DM_BUFIO
+ tristate
+ depends on BLK_DEV_DM
+ ---help---
+ This interface allows you to do buffered I/O on a device and acts
+ as a cache, holding recently-read blocks in memory and performing
+ delayed writes.
+
+config DM_BIO_PRISON
+ tristate
+ depends on BLK_DEV_DM
+ ---help---
+ Some bio locking schemes used by other device-mapper targets
+ including thin provisioning.
+
+source "drivers/md/persistent-data/Kconfig"
+
+config DM_CRYPT
+ tristate "Crypt target support"
+ depends on BLK_DEV_DM
+ select CRYPTO
+ select CRYPTO_CBC
+ ---help---
+ This device-mapper target allows you to create a device that
+ transparently encrypts the data on it. You'll need to activate
+ the ciphers you're going to use in the cryptoapi configuration.
+
+ For further information on dm-crypt and userspace tools see:
+ <http://code.google.com/p/cryptsetup/wiki/DMCrypt>
+
+ To compile this code as a module, choose M here: the module will
+ be called dm-crypt.
+
+ If unsure, say N.
+
+config DM_SNAPSHOT
+ tristate "Snapshot target"
+ depends on BLK_DEV_DM
+ select DM_BUFIO
+ ---help---
+ Allow volume managers to take writable snapshots of a device.
+
+config DM_THIN_PROVISIONING
+ tristate "Thin provisioning target"
+ depends on BLK_DEV_DM
+ select DM_PERSISTENT_DATA
+ select DM_BIO_PRISON
+ ---help---
+ Provides thin provisioning and snapshots that share a data store.
+
+config DM_CACHE
+ tristate "Cache target (EXPERIMENTAL)"
+ depends on BLK_DEV_DM
+ default n
+ select DM_PERSISTENT_DATA
+ select DM_BIO_PRISON
+ ---help---
+ dm-cache attempts to improve performance of a block device by
+ moving frequently used data to a smaller, higher performance
+ device. Different 'policy' plugins can be used to change the
+ algorithms used to select which blocks are promoted, demoted,
+ cleaned etc. It supports writeback and writethrough modes.
+
+config DM_CACHE_MQ
+ tristate "MQ Cache Policy (EXPERIMENTAL)"
+ depends on DM_CACHE
+ default y
+ ---help---
+ A cache policy that uses a multiqueue ordered by recent hit
+ count to select which blocks should be promoted and demoted.
+ This is meant to be a general purpose policy. It prioritises
+ reads over writes.
+
+config DM_CACHE_CLEANER
+ tristate "Cleaner Cache Policy (EXPERIMENTAL)"
+ depends on DM_CACHE
+ default y
+ ---help---
+ A simple cache policy that writes back all data to the
+ origin. Used when decommissioning a dm-cache.
+
+config DM_ERA
+ tristate "Era target (EXPERIMENTAL)"
+ depends on BLK_DEV_DM
+ default n
+ select DM_PERSISTENT_DATA
+ select DM_BIO_PRISON
+ ---help---
+ dm-era tracks which parts of a block device are written to
+ over time. Useful for maintaining cache coherency when using
+ vendor snapshots.
+
+config DM_MIRROR
+ tristate "Mirror target"
+ depends on BLK_DEV_DM
+ ---help---
+ Allow volume managers to mirror logical volumes, also
+ needed for live data migration tools such as 'pvmove'.
+
+config DM_LOG_USERSPACE
+ tristate "Mirror userspace logging"
+ depends on DM_MIRROR && NET
+ select CONNECTOR
+ ---help---
+ The userspace logging module provides a mechanism for
+ relaying the dm-dirty-log API to userspace. Log designs
+ which are more suited to userspace implementation (e.g.
+ shared storage logs) or experimental logs can be implemented
+ by leveraging this framework.
+
+config DM_RAID
+ tristate "RAID 1/4/5/6/10 target"
+ depends on BLK_DEV_DM
+ select MD_RAID1
+ select MD_RAID10
+ select MD_RAID456
+ select BLK_DEV_MD
+ ---help---
+ A dm target that supports RAID1, RAID10, RAID4, RAID5 and RAID6 mappings
+
+ A RAID-5 set of N drives with a capacity of C MB per drive provides
+ the capacity of C * (N - 1) MB, and protects against a failure
+ of a single drive. For a given sector (row) number, (N - 1) drives
+ contain data sectors, and one drive contains the parity protection.
+ For a RAID-4 set, the parity blocks are present on a single drive,
+ while a RAID-5 set distributes the parity across the drives in one
+ of the available parity distribution methods.
+
+ A RAID-6 set of N drives with a capacity of C MB per drive
+ provides the capacity of C * (N - 2) MB, and protects
+ against a failure of any two drives. For a given sector
+ (row) number, (N - 2) drives contain data sectors, and two
+ drives contains two independent redundancy syndromes. Like
+ RAID-5, RAID-6 distributes the syndromes across the drives
+ in one of the available parity distribution methods.
+
+config DM_ZERO
+ tristate "Zero target"
+ depends on BLK_DEV_DM
+ ---help---
+ A target that discards writes, and returns all zeroes for
+ reads. Useful in some recovery situations.
+
+config DM_MULTIPATH
+ tristate "Multipath target"
+ depends on BLK_DEV_DM
+ # nasty syntax but means make DM_MULTIPATH independent
+ # of SCSI_DH if the latter isn't defined but if
+ # it is, DM_MULTIPATH must depend on it. We get a build
+ # error if SCSI_DH=m and DM_MULTIPATH=y
+ depends on SCSI_DH || !SCSI_DH
+ ---help---
+ Allow volume managers to support multipath hardware.
+
+config DM_MULTIPATH_QL
+ tristate "I/O Path Selector based on the number of in-flight I/Os"
+ depends on DM_MULTIPATH
+ ---help---
+ This path selector is a dynamic load balancer which selects
+ the path with the least number of in-flight I/Os.
+
+ If unsure, say N.
+
+config DM_MULTIPATH_ST
+ tristate "I/O Path Selector based on the service time"
+ depends on DM_MULTIPATH
+ ---help---
+ This path selector is a dynamic load balancer which selects
+ the path expected to complete the incoming I/O in the shortest
+ time.
+
+ If unsure, say N.
+
+config DM_DELAY
+ tristate "I/O delaying target"
+ depends on BLK_DEV_DM
+ ---help---
+ A target that delays reads and/or writes and can send
+ them to different devices. Useful for testing.
+
+ If unsure, say N.
+
+config DM_UEVENT
+ bool "DM uevents"
+ depends on BLK_DEV_DM
+ ---help---
+ Generate udev events for DM events.
+
+config DM_FLAKEY
+ tristate "Flakey target"
+ depends on BLK_DEV_DM
+ ---help---
+ A target that intermittently fails I/O for debugging purposes.
+
+config DM_VERITY
+ tristate "Verity target support"
+ depends on BLK_DEV_DM
+ select CRYPTO
+ select CRYPTO_HASH
+ select DM_BUFIO
+ ---help---
+ This device-mapper target creates a read-only device that
+ transparently validates the data on one underlying device against
+ a pre-generated tree of cryptographic checksums stored on a second
+ device.
+
+ You'll need to activate the digests you're going to use in the
+ cryptoapi configuration.
+
+ To compile this code as a module, choose M here: the module will
+ be called dm-verity.
+
+ If unsure, say N.
+
+config DM_SWITCH
+ tristate "Switch target support (EXPERIMENTAL)"
+ depends on BLK_DEV_DM
+ ---help---
+ This device-mapper target creates a device that supports an arbitrary
+ mapping of fixed-size regions of I/O across a fixed set of paths.
+ The path used for any specific region can be switched dynamically
+ by sending the target a message.
+
+ To compile this code as a module, choose M here: the module will
+ be called dm-switch.
+
+ If unsure, say N.
+
+config DM_LOG_WRITES
+ tristate "Log writes target support"
+ depends on BLK_DEV_DM
+ ---help---
+ This device-mapper target takes two devices, one device to use
+ normally, one to log all write operations done to the first device.
+ This is for use by file system developers wishing to verify that
+ their fs is writing a consitent file system at all times by allowing
+ them to replay the log in a variety of ways and to check the
+ contents.
+
+ To compile this code as a module, choose M here: the module will
+ be called dm-log-writes.
+
+ If unsure, say N.
+
+endif # MD
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
new file mode 100644
index 000000000..dba4db598
--- /dev/null
+++ b/drivers/md/Makefile
@@ -0,0 +1,63 @@
+#
+# Makefile for the kernel software RAID and LVM drivers.
+#
+
+dm-mod-y += dm.o dm-table.o dm-target.o dm-linear.o dm-stripe.o \
+ dm-ioctl.o dm-io.o dm-kcopyd.o dm-sysfs.o dm-stats.o
+dm-multipath-y += dm-path-selector.o dm-mpath.o
+dm-snapshot-y += dm-snap.o dm-exception-store.o dm-snap-transient.o \
+ dm-snap-persistent.o
+dm-mirror-y += dm-raid1.o
+dm-log-userspace-y \
+ += dm-log-userspace-base.o dm-log-userspace-transfer.o
+dm-thin-pool-y += dm-thin.o dm-thin-metadata.o
+dm-cache-y += dm-cache-target.o dm-cache-metadata.o dm-cache-policy.o
+dm-cache-mq-y += dm-cache-policy-mq.o
+dm-cache-cleaner-y += dm-cache-policy-cleaner.o
+dm-era-y += dm-era-target.o
+md-mod-y += md.o bitmap.o
+raid456-y += raid5.o
+
+# Note: link order is important. All raid personalities
+# and must come before md.o, as they each initialise
+# themselves, and md.o may use the personalities when it
+# auto-initialised.
+
+obj-$(CONFIG_MD_LINEAR) += linear.o
+obj-$(CONFIG_MD_RAID0) += raid0.o
+obj-$(CONFIG_MD_RAID1) += raid1.o
+obj-$(CONFIG_MD_RAID10) += raid10.o
+obj-$(CONFIG_MD_RAID456) += raid456.o
+obj-$(CONFIG_MD_MULTIPATH) += multipath.o
+obj-$(CONFIG_MD_FAULTY) += faulty.o
+obj-$(CONFIG_MD_CLUSTER) += md-cluster.o
+obj-$(CONFIG_BCACHE) += bcache/
+obj-$(CONFIG_BLK_DEV_MD) += md-mod.o
+obj-$(CONFIG_BLK_DEV_DM) += dm-mod.o
+obj-$(CONFIG_BLK_DEV_DM_BUILTIN) += dm-builtin.o
+obj-$(CONFIG_DM_BUFIO) += dm-bufio.o
+obj-$(CONFIG_DM_BIO_PRISON) += dm-bio-prison.o
+obj-$(CONFIG_DM_CRYPT) += dm-crypt.o
+obj-$(CONFIG_DM_DELAY) += dm-delay.o
+obj-$(CONFIG_DM_FLAKEY) += dm-flakey.o
+obj-$(CONFIG_DM_MULTIPATH) += dm-multipath.o dm-round-robin.o
+obj-$(CONFIG_DM_MULTIPATH_QL) += dm-queue-length.o
+obj-$(CONFIG_DM_MULTIPATH_ST) += dm-service-time.o
+obj-$(CONFIG_DM_SWITCH) += dm-switch.o
+obj-$(CONFIG_DM_SNAPSHOT) += dm-snapshot.o
+obj-$(CONFIG_DM_PERSISTENT_DATA) += persistent-data/
+obj-$(CONFIG_DM_MIRROR) += dm-mirror.o dm-log.o dm-region-hash.o
+obj-$(CONFIG_DM_LOG_USERSPACE) += dm-log-userspace.o
+obj-$(CONFIG_DM_ZERO) += dm-zero.o
+obj-$(CONFIG_DM_RAID) += dm-raid.o
+obj-$(CONFIG_DM_THIN_PROVISIONING) += dm-thin-pool.o
+obj-$(CONFIG_DM_VERITY) += dm-verity.o
+obj-$(CONFIG_DM_CACHE) += dm-cache.o
+obj-$(CONFIG_DM_CACHE_MQ) += dm-cache-mq.o
+obj-$(CONFIG_DM_CACHE_CLEANER) += dm-cache-cleaner.o
+obj-$(CONFIG_DM_ERA) += dm-era.o
+obj-$(CONFIG_DM_LOG_WRITES) += dm-log-writes.o
+
+ifeq ($(CONFIG_DM_UEVENT),y)
+dm-mod-objs += dm-uevent.o
+endif
diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig
new file mode 100644
index 000000000..4d200883c
--- /dev/null
+++ b/drivers/md/bcache/Kconfig
@@ -0,0 +1,26 @@
+
+config BCACHE
+ tristate "Block device as cache"
+ ---help---
+ Allows a block device to be used as cache for other devices; uses
+ a btree for indexing and the layout is optimized for SSDs.
+
+ See Documentation/bcache.txt for details.
+
+config BCACHE_DEBUG
+ bool "Bcache debugging"
+ depends on BCACHE
+ ---help---
+ Don't select this option unless you're a developer
+
+ Enables extra debugging tools, allows expensive runtime checks to be
+ turned on.
+
+config BCACHE_CLOSURES_DEBUG
+ bool "Debug closures"
+ depends on BCACHE
+ select DEBUG_FS
+ ---help---
+ Keeps all active closures in a linked list and provides a debugfs
+ interface to list them, which makes it possible to see asynchronous
+ operations that get stuck.
diff --git a/drivers/md/bcache/Makefile b/drivers/md/bcache/Makefile
new file mode 100644
index 000000000..c488b846f
--- /dev/null
+++ b/drivers/md/bcache/Makefile
@@ -0,0 +1,8 @@
+
+obj-$(CONFIG_BCACHE) += bcache.o
+
+bcache-y := alloc.o bset.o btree.o closure.o debug.o extents.o\
+ io.o journal.o movinggc.o request.o stats.o super.o sysfs.o trace.o\
+ util.o writeback.o
+
+CFLAGS_request.o += -Iblock
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
new file mode 100644
index 000000000..8eeab72b9
--- /dev/null
+++ b/drivers/md/bcache/alloc.c
@@ -0,0 +1,696 @@
+/*
+ * Primary bucket allocation code
+ *
+ * Copyright 2012 Google, Inc.
+ *
+ * Allocation in bcache is done in terms of buckets:
+ *
+ * Each bucket has associated an 8 bit gen; this gen corresponds to the gen in
+ * btree pointers - they must match for the pointer to be considered valid.
+ *
+ * Thus (assuming a bucket has no dirty data or metadata in it) we can reuse a
+ * bucket simply by incrementing its gen.
+ *
+ * The gens (along with the priorities; it's really the gens are important but
+ * the code is named as if it's the priorities) are written in an arbitrary list
+ * of buckets on disk, with a pointer to them in the journal header.
+ *
+ * When we invalidate a bucket, we have to write its new gen to disk and wait
+ * for that write to complete before we use it - otherwise after a crash we
+ * could have pointers that appeared to be good but pointed to data that had
+ * been overwritten.
+ *
+ * Since the gens and priorities are all stored contiguously on disk, we can
+ * batch this up: We fill up the free_inc list with freshly invalidated buckets,
+ * call prio_write(), and when prio_write() finishes we pull buckets off the
+ * free_inc list and optionally discard them.
+ *
+ * free_inc isn't the only freelist - if it was, we'd often to sleep while
+ * priorities and gens were being written before we could allocate. c->free is a
+ * smaller freelist, and buckets on that list are always ready to be used.
+ *
+ * If we've got discards enabled, that happens when a bucket moves from the
+ * free_inc list to the free list.
+ *
+ * There is another freelist, because sometimes we have buckets that we know
+ * have nothing pointing into them - these we can reuse without waiting for
+ * priorities to be rewritten. These come from freed btree nodes and buckets
+ * that garbage collection discovered no longer had valid keys pointing into
+ * them (because they were overwritten). That's the unused list - buckets on the
+ * unused list move to the free list, optionally being discarded in the process.
+ *
+ * It's also important to ensure that gens don't wrap around - with respect to
+ * either the oldest gen in the btree or the gen on disk. This is quite
+ * difficult to do in practice, but we explicitly guard against it anyways - if
+ * a bucket is in danger of wrapping around we simply skip invalidating it that
+ * time around, and we garbage collect or rewrite the priorities sooner than we
+ * would have otherwise.
+ *
+ * bch_bucket_alloc() allocates a single bucket from a specific cache.
+ *
+ * bch_bucket_alloc_set() allocates one or more buckets from different caches
+ * out of a cache set.
+ *
+ * free_some_buckets() drives all the processes described above. It's called
+ * from bch_bucket_alloc() and a few other places that need to make sure free
+ * buckets are ready.
+ *
+ * invalidate_buckets_(lru|fifo)() find buckets that are available to be
+ * invalidated, and then invalidate them and stick them on the free_inc list -
+ * in either lru or fifo order.
+ */
+
+#include "bcache.h"
+#include "btree.h"
+
+#include <linux/blkdev.h>
+#include <linux/freezer.h>
+#include <linux/kthread.h>
+#include <linux/random.h>
+#include <trace/events/bcache.h>
+
+/* Bucket heap / gen */
+
+uint8_t bch_inc_gen(struct cache *ca, struct bucket *b)
+{
+ uint8_t ret = ++b->gen;
+
+ ca->set->need_gc = max(ca->set->need_gc, bucket_gc_gen(b));
+ WARN_ON_ONCE(ca->set->need_gc > BUCKET_GC_GEN_MAX);
+
+ return ret;
+}
+
+void bch_rescale_priorities(struct cache_set *c, int sectors)
+{
+ struct cache *ca;
+ struct bucket *b;
+ unsigned next = c->nbuckets * c->sb.bucket_size / 1024;
+ unsigned i;
+ int r;
+
+ atomic_sub(sectors, &c->rescale);
+
+ do {
+ r = atomic_read(&c->rescale);
+
+ if (r >= 0)
+ return;
+ } while (atomic_cmpxchg(&c->rescale, r, r + next) != r);
+
+ mutex_lock(&c->bucket_lock);
+
+ c->min_prio = USHRT_MAX;
+
+ for_each_cache(ca, c, i)
+ for_each_bucket(b, ca)
+ if (b->prio &&
+ b->prio != BTREE_PRIO &&
+ !atomic_read(&b->pin)) {
+ b->prio--;
+ c->min_prio = min(c->min_prio, b->prio);
+ }
+
+ mutex_unlock(&c->bucket_lock);
+}
+
+/*
+ * Background allocation thread: scans for buckets to be invalidated,
+ * invalidates them, rewrites prios/gens (marking them as invalidated on disk),
+ * then optionally issues discard commands to the newly free buckets, then puts
+ * them on the various freelists.
+ */
+
+static inline bool can_inc_bucket_gen(struct bucket *b)
+{
+ return bucket_gc_gen(b) < BUCKET_GC_GEN_MAX;
+}
+
+bool bch_can_invalidate_bucket(struct cache *ca, struct bucket *b)
+{
+ BUG_ON(!ca->set->gc_mark_valid);
+
+ return (!GC_MARK(b) ||
+ GC_MARK(b) == GC_MARK_RECLAIMABLE) &&
+ !atomic_read(&b->pin) &&
+ can_inc_bucket_gen(b);
+}
+
+void __bch_invalidate_one_bucket(struct cache *ca, struct bucket *b)
+{
+ lockdep_assert_held(&ca->set->bucket_lock);
+ BUG_ON(GC_MARK(b) && GC_MARK(b) != GC_MARK_RECLAIMABLE);
+
+ if (GC_SECTORS_USED(b))
+ trace_bcache_invalidate(ca, b - ca->buckets);
+
+ bch_inc_gen(ca, b);
+ b->prio = INITIAL_PRIO;
+ atomic_inc(&b->pin);
+}
+
+static void bch_invalidate_one_bucket(struct cache *ca, struct bucket *b)
+{
+ __bch_invalidate_one_bucket(ca, b);
+
+ fifo_push(&ca->free_inc, b - ca->buckets);
+}
+
+/*
+ * Determines what order we're going to reuse buckets, smallest bucket_prio()
+ * first: we also take into account the number of sectors of live data in that
+ * bucket, and in order for that multiply to make sense we have to scale bucket
+ *
+ * Thus, we scale the bucket priorities so that the bucket with the smallest
+ * prio is worth 1/8th of what INITIAL_PRIO is worth.
+ */
+
+#define bucket_prio(b) \
+({ \
+ unsigned min_prio = (INITIAL_PRIO - ca->set->min_prio) / 8; \
+ \
+ (b->prio - ca->set->min_prio + min_prio) * GC_SECTORS_USED(b); \
+})
+
+#define bucket_max_cmp(l, r) (bucket_prio(l) < bucket_prio(r))
+#define bucket_min_cmp(l, r) (bucket_prio(l) > bucket_prio(r))
+
+static void invalidate_buckets_lru(struct cache *ca)
+{
+ struct bucket *b;
+ ssize_t i;
+
+ ca->heap.used = 0;
+
+ for_each_bucket(b, ca) {
+ if (!bch_can_invalidate_bucket(ca, b))
+ continue;
+
+ if (!heap_full(&ca->heap))
+ heap_add(&ca->heap, b, bucket_max_cmp);
+ else if (bucket_max_cmp(b, heap_peek(&ca->heap))) {
+ ca->heap.data[0] = b;
+ heap_sift(&ca->heap, 0, bucket_max_cmp);
+ }
+ }
+
+ for (i = ca->heap.used / 2 - 1; i >= 0; --i)
+ heap_sift(&ca->heap, i, bucket_min_cmp);
+
+ while (!fifo_full(&ca->free_inc)) {
+ if (!heap_pop(&ca->heap, b, bucket_min_cmp)) {
+ /*
+ * We don't want to be calling invalidate_buckets()
+ * multiple times when it can't do anything
+ */
+ ca->invalidate_needs_gc = 1;
+ wake_up_gc(ca->set);
+ return;
+ }
+
+ bch_invalidate_one_bucket(ca, b);
+ }
+}
+
+static void invalidate_buckets_fifo(struct cache *ca)
+{
+ struct bucket *b;
+ size_t checked = 0;
+
+ while (!fifo_full(&ca->free_inc)) {
+ if (ca->fifo_last_bucket < ca->sb.first_bucket ||
+ ca->fifo_last_bucket >= ca->sb.nbuckets)
+ ca->fifo_last_bucket = ca->sb.first_bucket;
+
+ b = ca->buckets + ca->fifo_last_bucket++;
+
+ if (bch_can_invalidate_bucket(ca, b))
+ bch_invalidate_one_bucket(ca, b);
+
+ if (++checked >= ca->sb.nbuckets) {
+ ca->invalidate_needs_gc = 1;
+ wake_up_gc(ca->set);
+ return;
+ }
+ }
+}
+
+static void invalidate_buckets_random(struct cache *ca)
+{
+ struct bucket *b;
+ size_t checked = 0;
+
+ while (!fifo_full(&ca->free_inc)) {
+ size_t n;
+ get_random_bytes(&n, sizeof(n));
+
+ n %= (size_t) (ca->sb.nbuckets - ca->sb.first_bucket);
+ n += ca->sb.first_bucket;
+
+ b = ca->buckets + n;
+
+ if (bch_can_invalidate_bucket(ca, b))
+ bch_invalidate_one_bucket(ca, b);
+
+ if (++checked >= ca->sb.nbuckets / 2) {
+ ca->invalidate_needs_gc = 1;
+ wake_up_gc(ca->set);
+ return;
+ }
+ }
+}
+
+static void invalidate_buckets(struct cache *ca)
+{
+ BUG_ON(ca->invalidate_needs_gc);
+
+ switch (CACHE_REPLACEMENT(&ca->sb)) {
+ case CACHE_REPLACEMENT_LRU:
+ invalidate_buckets_lru(ca);
+ break;
+ case CACHE_REPLACEMENT_FIFO:
+ invalidate_buckets_fifo(ca);
+ break;
+ case CACHE_REPLACEMENT_RANDOM:
+ invalidate_buckets_random(ca);
+ break;
+ }
+}
+
+#define allocator_wait(ca, cond) \
+do { \
+ while (1) { \
+ set_current_state(TASK_INTERRUPTIBLE); \
+ if (cond) \
+ break; \
+ \
+ mutex_unlock(&(ca)->set->bucket_lock); \
+ if (kthread_should_stop()) \
+ return 0; \
+ \
+ try_to_freeze(); \
+ schedule(); \
+ mutex_lock(&(ca)->set->bucket_lock); \
+ } \
+ __set_current_state(TASK_RUNNING); \
+} while (0)
+
+static int bch_allocator_push(struct cache *ca, long bucket)
+{
+ unsigned i;
+
+ /* Prios/gens are actually the most important reserve */
+ if (fifo_push(&ca->free[RESERVE_PRIO], bucket))
+ return true;
+
+ for (i = 0; i < RESERVE_NR; i++)
+ if (fifo_push(&ca->free[i], bucket))
+ return true;
+
+ return false;
+}
+
+static int bch_allocator_thread(void *arg)
+{
+ struct cache *ca = arg;
+
+ mutex_lock(&ca->set->bucket_lock);
+
+ while (1) {
+ /*
+ * First, we pull buckets off of the unused and free_inc lists,
+ * possibly issue discards to them, then we add the bucket to
+ * the free list:
+ */
+ while (!fifo_empty(&ca->free_inc)) {
+ long bucket;
+
+ fifo_pop(&ca->free_inc, bucket);
+
+ if (ca->discard) {
+ mutex_unlock(&ca->set->bucket_lock);
+ blkdev_issue_discard(ca->bdev,
+ bucket_to_sector(ca->set, bucket),
+ ca->sb.bucket_size, GFP_KERNEL, 0);
+ mutex_lock(&ca->set->bucket_lock);
+ }
+
+ allocator_wait(ca, bch_allocator_push(ca, bucket));
+ wake_up(&ca->set->btree_cache_wait);
+ wake_up(&ca->set->bucket_wait);
+ }
+
+ /*
+ * We've run out of free buckets, we need to find some buckets
+ * we can invalidate. First, invalidate them in memory and add
+ * them to the free_inc list:
+ */
+
+retry_invalidate:
+ allocator_wait(ca, ca->set->gc_mark_valid &&
+ !ca->invalidate_needs_gc);
+ invalidate_buckets(ca);
+
+ /*
+ * Now, we write their new gens to disk so we can start writing
+ * new stuff to them:
+ */
+ allocator_wait(ca, !atomic_read(&ca->set->prio_blocked));
+ if (CACHE_SYNC(&ca->set->sb)) {
+ /*
+ * This could deadlock if an allocation with a btree
+ * node locked ever blocked - having the btree node
+ * locked would block garbage collection, but here we're
+ * waiting on garbage collection before we invalidate
+ * and free anything.
+ *
+ * But this should be safe since the btree code always
+ * uses btree_check_reserve() before allocating now, and
+ * if it fails it blocks without btree nodes locked.
+ */
+ if (!fifo_full(&ca->free_inc))
+ goto retry_invalidate;
+
+ bch_prio_write(ca);
+ }
+ }
+}
+
+/* Allocation */
+
+long bch_bucket_alloc(struct cache *ca, unsigned reserve, bool wait)
+{
+ DEFINE_WAIT(w);
+ struct bucket *b;
+ long r;
+
+ /* fastpath */
+ if (fifo_pop(&ca->free[RESERVE_NONE], r) ||
+ fifo_pop(&ca->free[reserve], r))
+ goto out;
+
+ if (!wait) {
+ trace_bcache_alloc_fail(ca, reserve);
+ return -1;
+ }
+
+ do {
+ prepare_to_wait(&ca->set->bucket_wait, &w,
+ TASK_UNINTERRUPTIBLE);
+
+ mutex_unlock(&ca->set->bucket_lock);
+ schedule();
+ mutex_lock(&ca->set->bucket_lock);
+ } while (!fifo_pop(&ca->free[RESERVE_NONE], r) &&
+ !fifo_pop(&ca->free[reserve], r));
+
+ finish_wait(&ca->set->bucket_wait, &w);
+out:
+ wake_up_process(ca->alloc_thread);
+
+ trace_bcache_alloc(ca, reserve);
+
+ if (expensive_debug_checks(ca->set)) {
+ size_t iter;
+ long i;
+ unsigned j;
+
+ for (iter = 0; iter < prio_buckets(ca) * 2; iter++)
+ BUG_ON(ca->prio_buckets[iter] == (uint64_t) r);
+
+ for (j = 0; j < RESERVE_NR; j++)
+ fifo_for_each(i, &ca->free[j], iter)
+ BUG_ON(i == r);
+ fifo_for_each(i, &ca->free_inc, iter)
+ BUG_ON(i == r);
+ }
+
+ b = ca->buckets + r;
+
+ BUG_ON(atomic_read(&b->pin) != 1);
+
+ SET_GC_SECTORS_USED(b, ca->sb.bucket_size);
+
+ if (reserve <= RESERVE_PRIO) {
+ SET_GC_MARK(b, GC_MARK_METADATA);
+ SET_GC_MOVE(b, 0);
+ b->prio = BTREE_PRIO;
+ } else {
+ SET_GC_MARK(b, GC_MARK_RECLAIMABLE);
+ SET_GC_MOVE(b, 0);
+ b->prio = INITIAL_PRIO;
+ }
+
+ return r;
+}
+
+void __bch_bucket_free(struct cache *ca, struct bucket *b)
+{
+ SET_GC_MARK(b, 0);
+ SET_GC_SECTORS_USED(b, 0);
+}
+
+void bch_bucket_free(struct cache_set *c, struct bkey *k)
+{
+ unsigned i;
+
+ for (i = 0; i < KEY_PTRS(k); i++)
+ __bch_bucket_free(PTR_CACHE(c, k, i),
+ PTR_BUCKET(c, k, i));
+}
+
+int __bch_bucket_alloc_set(struct cache_set *c, unsigned reserve,
+ struct bkey *k, int n, bool wait)
+{
+ int i;
+
+ lockdep_assert_held(&c->bucket_lock);
+ BUG_ON(!n || n > c->caches_loaded || n > 8);
+
+ bkey_init(k);
+
+ /* sort by free space/prio of oldest data in caches */
+
+ for (i = 0; i < n; i++) {
+ struct cache *ca = c->cache_by_alloc[i];
+ long b = bch_bucket_alloc(ca, reserve, wait);
+
+ if (b == -1)
+ goto err;
+
+ k->ptr[i] = PTR(ca->buckets[b].gen,
+ bucket_to_sector(c, b),
+ ca->sb.nr_this_dev);
+
+ SET_KEY_PTRS(k, i + 1);
+ }
+
+ return 0;
+err:
+ bch_bucket_free(c, k);
+ bkey_put(c, k);
+ return -1;
+}
+
+int bch_bucket_alloc_set(struct cache_set *c, unsigned reserve,
+ struct bkey *k, int n, bool wait)
+{
+ int ret;
+ mutex_lock(&c->bucket_lock);
+ ret = __bch_bucket_alloc_set(c, reserve, k, n, wait);
+ mutex_unlock(&c->bucket_lock);
+ return ret;
+}
+
+/* Sector allocator */
+
+struct open_bucket {
+ struct list_head list;
+ unsigned last_write_point;
+ unsigned sectors_free;
+ BKEY_PADDED(key);
+};
+
+/*
+ * We keep multiple buckets open for writes, and try to segregate different
+ * write streams for better cache utilization: first we look for a bucket where
+ * the last write to it was sequential with the current write, and failing that
+ * we look for a bucket that was last used by the same task.
+ *
+ * The ideas is if you've got multiple tasks pulling data into the cache at the
+ * same time, you'll get better cache utilization if you try to segregate their
+ * data and preserve locality.
+ *
+ * For example, say you've starting Firefox at the same time you're copying a
+ * bunch of files. Firefox will likely end up being fairly hot and stay in the
+ * cache awhile, but the data you copied might not be; if you wrote all that
+ * data to the same buckets it'd get invalidated at the same time.
+ *
+ * Both of those tasks will be doing fairly random IO so we can't rely on
+ * detecting sequential IO to segregate their data, but going off of the task
+ * should be a sane heuristic.
+ */
+static struct open_bucket *pick_data_bucket(struct cache_set *c,
+ const struct bkey *search,
+ unsigned write_point,
+ struct bkey *alloc)
+{
+ struct open_bucket *ret, *ret_task = NULL;
+
+ list_for_each_entry_reverse(ret, &c->data_buckets, list)
+ if (!bkey_cmp(&ret->key, search))
+ goto found;
+ else if (ret->last_write_point == write_point)
+ ret_task = ret;
+
+ ret = ret_task ?: list_first_entry(&c->data_buckets,
+ struct open_bucket, list);
+found:
+ if (!ret->sectors_free && KEY_PTRS(alloc)) {
+ ret->sectors_free = c->sb.bucket_size;
+ bkey_copy(&ret->key, alloc);
+ bkey_init(alloc);
+ }
+
+ if (!ret->sectors_free)
+ ret = NULL;
+
+ return ret;
+}
+
+/*
+ * Allocates some space in the cache to write to, and k to point to the newly
+ * allocated space, and updates KEY_SIZE(k) and KEY_OFFSET(k) (to point to the
+ * end of the newly allocated space).
+ *
+ * May allocate fewer sectors than @sectors, KEY_SIZE(k) indicates how many
+ * sectors were actually allocated.
+ *
+ * If s->writeback is true, will not fail.
+ */
+bool bch_alloc_sectors(struct cache_set *c, struct bkey *k, unsigned sectors,
+ unsigned write_point, unsigned write_prio, bool wait)
+{
+ struct open_bucket *b;
+ BKEY_PADDED(key) alloc;
+ unsigned i;
+
+ /*
+ * We might have to allocate a new bucket, which we can't do with a
+ * spinlock held. So if we have to allocate, we drop the lock, allocate
+ * and then retry. KEY_PTRS() indicates whether alloc points to
+ * allocated bucket(s).
+ */
+
+ bkey_init(&alloc.key);
+ spin_lock(&c->data_bucket_lock);
+
+ while (!(b = pick_data_bucket(c, k, write_point, &alloc.key))) {
+ unsigned watermark = write_prio
+ ? RESERVE_MOVINGGC
+ : RESERVE_NONE;
+
+ spin_unlock(&c->data_bucket_lock);
+
+ if (bch_bucket_alloc_set(c, watermark, &alloc.key, 1, wait))
+ return false;
+
+ spin_lock(&c->data_bucket_lock);
+ }
+
+ /*
+ * If we had to allocate, we might race and not need to allocate the
+ * second time we call find_data_bucket(). If we allocated a bucket but
+ * didn't use it, drop the refcount bch_bucket_alloc_set() took:
+ */
+ if (KEY_PTRS(&alloc.key))
+ bkey_put(c, &alloc.key);
+
+ for (i = 0; i < KEY_PTRS(&b->key); i++)
+ EBUG_ON(ptr_stale(c, &b->key, i));
+
+ /* Set up the pointer to the space we're allocating: */
+
+ for (i = 0; i < KEY_PTRS(&b->key); i++)
+ k->ptr[i] = b->key.ptr[i];
+
+ sectors = min(sectors, b->sectors_free);
+
+ SET_KEY_OFFSET(k, KEY_OFFSET(k) + sectors);
+ SET_KEY_SIZE(k, sectors);
+ SET_KEY_PTRS(k, KEY_PTRS(&b->key));
+
+ /*
+ * Move b to the end of the lru, and keep track of what this bucket was
+ * last used for:
+ */
+ list_move_tail(&b->list, &c->data_buckets);
+ bkey_copy_key(&b->key, k);
+ b->last_write_point = write_point;
+
+ b->sectors_free -= sectors;
+
+ for (i = 0; i < KEY_PTRS(&b->key); i++) {
+ SET_PTR_OFFSET(&b->key, i, PTR_OFFSET(&b->key, i) + sectors);
+
+ atomic_long_add(sectors,
+ &PTR_CACHE(c, &b->key, i)->sectors_written);
+ }
+
+ if (b->sectors_free < c->sb.block_size)
+ b->sectors_free = 0;
+
+ /*
+ * k takes refcounts on the buckets it points to until it's inserted
+ * into the btree, but if we're done with this bucket we just transfer
+ * get_data_bucket()'s refcount.
+ */
+ if (b->sectors_free)
+ for (i = 0; i < KEY_PTRS(&b->key); i++)
+ atomic_inc(&PTR_BUCKET(c, &b->key, i)->pin);
+
+ spin_unlock(&c->data_bucket_lock);
+ return true;
+}
+
+/* Init */
+
+void bch_open_buckets_free(struct cache_set *c)
+{
+ struct open_bucket *b;
+
+ while (!list_empty(&c->data_buckets)) {
+ b = list_first_entry(&c->data_buckets,
+ struct open_bucket, list);
+ list_del(&b->list);
+ kfree(b);
+ }
+}
+
+int bch_open_buckets_alloc(struct cache_set *c)
+{
+ int i;
+
+ spin_lock_init(&c->data_bucket_lock);
+
+ for (i = 0; i < 6; i++) {
+ struct open_bucket *b = kzalloc(sizeof(*b), GFP_KERNEL);
+ if (!b)
+ return -ENOMEM;
+
+ list_add(&b->list, &c->data_buckets);
+ }
+
+ return 0;
+}
+
+int bch_cache_allocator_start(struct cache *ca)
+{
+ struct task_struct *k = kthread_run(bch_allocator_thread,
+ ca, "bcache_allocator");
+ if (IS_ERR(k))
+ return PTR_ERR(k);
+
+ ca->alloc_thread = k;
+ return 0;
+}
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
new file mode 100644
index 000000000..04f7bc28e
--- /dev/null
+++ b/drivers/md/bcache/bcache.h
@@ -0,0 +1,946 @@
+#ifndef _BCACHE_H
+#define _BCACHE_H
+
+/*
+ * SOME HIGH LEVEL CODE DOCUMENTATION:
+ *
+ * Bcache mostly works with cache sets, cache devices, and backing devices.
+ *
+ * Support for multiple cache devices hasn't quite been finished off yet, but
+ * it's about 95% plumbed through. A cache set and its cache devices is sort of
+ * like a md raid array and its component devices. Most of the code doesn't care
+ * about individual cache devices, the main abstraction is the cache set.
+ *
+ * Multiple cache devices is intended to give us the ability to mirror dirty
+ * cached data and metadata, without mirroring clean cached data.
+ *
+ * Backing devices are different, in that they have a lifetime independent of a
+ * cache set. When you register a newly formatted backing device it'll come up
+ * in passthrough mode, and then you can attach and detach a backing device from
+ * a cache set at runtime - while it's mounted and in use. Detaching implicitly
+ * invalidates any cached data for that backing device.
+ *
+ * A cache set can have multiple (many) backing devices attached to it.
+ *
+ * There's also flash only volumes - this is the reason for the distinction
+ * between struct cached_dev and struct bcache_device. A flash only volume
+ * works much like a bcache device that has a backing device, except the
+ * "cached" data is always dirty. The end result is that we get thin
+ * provisioning with very little additional code.
+ *
+ * Flash only volumes work but they're not production ready because the moving
+ * garbage collector needs more work. More on that later.
+ *
+ * BUCKETS/ALLOCATION:
+ *
+ * Bcache is primarily designed for caching, which means that in normal
+ * operation all of our available space will be allocated. Thus, we need an
+ * efficient way of deleting things from the cache so we can write new things to
+ * it.
+ *
+ * To do this, we first divide the cache device up into buckets. A bucket is the
+ * unit of allocation; they're typically around 1 mb - anywhere from 128k to 2M+
+ * works efficiently.
+ *
+ * Each bucket has a 16 bit priority, and an 8 bit generation associated with
+ * it. The gens and priorities for all the buckets are stored contiguously and
+ * packed on disk (in a linked list of buckets - aside from the superblock, all
+ * of bcache's metadata is stored in buckets).
+ *
+ * The priority is used to implement an LRU. We reset a bucket's priority when
+ * we allocate it or on cache it, and every so often we decrement the priority
+ * of each bucket. It could be used to implement something more sophisticated,
+ * if anyone ever gets around to it.
+ *
+ * The generation is used for invalidating buckets. Each pointer also has an 8
+ * bit generation embedded in it; for a pointer to be considered valid, its gen
+ * must match the gen of the bucket it points into. Thus, to reuse a bucket all
+ * we have to do is increment its gen (and write its new gen to disk; we batch
+ * this up).
+ *
+ * Bcache is entirely COW - we never write twice to a bucket, even buckets that
+ * contain metadata (including btree nodes).
+ *
+ * THE BTREE:
+ *
+ * Bcache is in large part design around the btree.
+ *
+ * At a high level, the btree is just an index of key -> ptr tuples.
+ *
+ * Keys represent extents, and thus have a size field. Keys also have a variable
+ * number of pointers attached to them (potentially zero, which is handy for
+ * invalidating the cache).
+ *
+ * The key itself is an inode:offset pair. The inode number corresponds to a
+ * backing device or a flash only volume. The offset is the ending offset of the
+ * extent within the inode - not the starting offset; this makes lookups
+ * slightly more convenient.
+ *
+ * Pointers contain the cache device id, the offset on that device, and an 8 bit
+ * generation number. More on the gen later.
+ *
+ * Index lookups are not fully abstracted - cache lookups in particular are
+ * still somewhat mixed in with the btree code, but things are headed in that
+ * direction.
+ *
+ * Updates are fairly well abstracted, though. There are two different ways of
+ * updating the btree; insert and replace.
+ *
+ * BTREE_INSERT will just take a list of keys and insert them into the btree -
+ * overwriting (possibly only partially) any extents they overlap with. This is
+ * used to update the index after a write.
+ *
+ * BTREE_REPLACE is really cmpxchg(); it inserts a key into the btree iff it is
+ * overwriting a key that matches another given key. This is used for inserting
+ * data into the cache after a cache miss, and for background writeback, and for
+ * the moving garbage collector.
+ *
+ * There is no "delete" operation; deleting things from the index is
+ * accomplished by either by invalidating pointers (by incrementing a bucket's
+ * gen) or by inserting a key with 0 pointers - which will overwrite anything
+ * previously present at that location in the index.
+ *
+ * This means that there are always stale/invalid keys in the btree. They're
+ * filtered out by the code that iterates through a btree node, and removed when
+ * a btree node is rewritten.
+ *
+ * BTREE NODES:
+ *
+ * Our unit of allocation is a bucket, and we we can't arbitrarily allocate and
+ * free smaller than a bucket - so, that's how big our btree nodes are.
+ *
+ * (If buckets are really big we'll only use part of the bucket for a btree node
+ * - no less than 1/4th - but a bucket still contains no more than a single
+ * btree node. I'd actually like to change this, but for now we rely on the
+ * bucket's gen for deleting btree nodes when we rewrite/split a node.)
+ *
+ * Anyways, btree nodes are big - big enough to be inefficient with a textbook
+ * btree implementation.
+ *
+ * The way this is solved is that btree nodes are internally log structured; we
+ * can append new keys to an existing btree node without rewriting it. This
+ * means each set of keys we write is sorted, but the node is not.
+ *
+ * We maintain this log structure in memory - keeping 1Mb of keys sorted would
+ * be expensive, and we have to distinguish between the keys we have written and
+ * the keys we haven't. So to do a lookup in a btree node, we have to search
+ * each sorted set. But we do merge written sets together lazily, so the cost of
+ * these extra searches is quite low (normally most of the keys in a btree node
+ * will be in one big set, and then there'll be one or two sets that are much
+ * smaller).
+ *
+ * This log structure makes bcache's btree more of a hybrid between a
+ * conventional btree and a compacting data structure, with some of the
+ * advantages of both.
+ *
+ * GARBAGE COLLECTION:
+ *
+ * We can't just invalidate any bucket - it might contain dirty data or
+ * metadata. If it once contained dirty data, other writes might overwrite it
+ * later, leaving no valid pointers into that bucket in the index.
+ *
+ * Thus, the primary purpose of garbage collection is to find buckets to reuse.
+ * It also counts how much valid data it each bucket currently contains, so that
+ * allocation can reuse buckets sooner when they've been mostly overwritten.
+ *
+ * It also does some things that are really internal to the btree
+ * implementation. If a btree node contains pointers that are stale by more than
+ * some threshold, it rewrites the btree node to avoid the bucket's generation
+ * wrapping around. It also merges adjacent btree nodes if they're empty enough.
+ *
+ * THE JOURNAL:
+ *
+ * Bcache's journal is not necessary for consistency; we always strictly
+ * order metadata writes so that the btree and everything else is consistent on
+ * disk in the event of an unclean shutdown, and in fact bcache had writeback
+ * caching (with recovery from unclean shutdown) before journalling was
+ * implemented.
+ *
+ * Rather, the journal is purely a performance optimization; we can't complete a
+ * write until we've updated the index on disk, otherwise the cache would be
+ * inconsistent in the event of an unclean shutdown. This means that without the
+ * journal, on random write workloads we constantly have to update all the leaf
+ * nodes in the btree, and those writes will be mostly empty (appending at most
+ * a few keys each) - highly inefficient in terms of amount of metadata writes,
+ * and it puts more strain on the various btree resorting/compacting code.
+ *
+ * The journal is just a log of keys we've inserted; on startup we just reinsert
+ * all the keys in the open journal entries. That means that when we're updating
+ * a node in the btree, we can wait until a 4k block of keys fills up before
+ * writing them out.
+ *
+ * For simplicity, we only journal updates to leaf nodes; updates to parent
+ * nodes are rare enough (since our leaf nodes are huge) that it wasn't worth
+ * the complexity to deal with journalling them (in particular, journal replay)
+ * - updates to non leaf nodes just happen synchronously (see btree_split()).
+ */
+
+#define pr_fmt(fmt) "bcache: %s() " fmt "\n", __func__
+
+#include <linux/bcache.h>
+#include <linux/bio.h>
+#include <linux/kobject.h>
+#include <linux/list.h>
+#include <linux/mutex.h>
+#include <linux/rbtree.h>
+#include <linux/rwsem.h>
+#include <linux/types.h>
+#include <linux/workqueue.h>
+
+#include "bset.h"
+#include "util.h"
+#include "closure.h"
+
+struct bucket {
+ atomic_t pin;
+ uint16_t prio;
+ uint8_t gen;
+ uint8_t last_gc; /* Most out of date gen in the btree */
+ uint16_t gc_mark; /* Bitfield used by GC. See below for field */
+};
+
+/*
+ * I'd use bitfields for these, but I don't trust the compiler not to screw me
+ * as multiple threads touch struct bucket without locking
+ */
+
+BITMASK(GC_MARK, struct bucket, gc_mark, 0, 2);
+#define GC_MARK_RECLAIMABLE 1
+#define GC_MARK_DIRTY 2
+#define GC_MARK_METADATA 3
+#define GC_SECTORS_USED_SIZE 13
+#define MAX_GC_SECTORS_USED (~(~0ULL << GC_SECTORS_USED_SIZE))
+BITMASK(GC_SECTORS_USED, struct bucket, gc_mark, 2, GC_SECTORS_USED_SIZE);
+BITMASK(GC_MOVE, struct bucket, gc_mark, 15, 1);
+
+#include "journal.h"
+#include "stats.h"
+struct search;
+struct btree;
+struct keybuf;
+
+struct keybuf_key {
+ struct rb_node node;
+ BKEY_PADDED(key);
+ void *private;
+};
+
+struct keybuf {
+ struct bkey last_scanned;
+ spinlock_t lock;
+
+ /*
+ * Beginning and end of range in rb tree - so that we can skip taking
+ * lock and checking the rb tree when we need to check for overlapping
+ * keys.
+ */
+ struct bkey start;
+ struct bkey end;
+
+ struct rb_root keys;
+
+#define KEYBUF_NR 500
+ DECLARE_ARRAY_ALLOCATOR(struct keybuf_key, freelist, KEYBUF_NR);
+};
+
+struct bio_split_pool {
+ struct bio_set *bio_split;
+ mempool_t *bio_split_hook;
+};
+
+struct bio_split_hook {
+ struct closure cl;
+ struct bio_split_pool *p;
+ struct bio *bio;
+ bio_end_io_t *bi_end_io;
+ void *bi_private;
+};
+
+struct bcache_device {
+ struct closure cl;
+
+ struct kobject kobj;
+
+ struct cache_set *c;
+ unsigned id;
+#define BCACHEDEVNAME_SIZE 12
+ char name[BCACHEDEVNAME_SIZE];
+
+ struct gendisk *disk;
+
+ unsigned long flags;
+#define BCACHE_DEV_CLOSING 0
+#define BCACHE_DEV_DETACHING 1
+#define BCACHE_DEV_UNLINK_DONE 2
+
+ unsigned nr_stripes;
+ unsigned stripe_size;
+ atomic_t *stripe_sectors_dirty;
+ unsigned long *full_dirty_stripes;
+
+ unsigned long sectors_dirty_last;
+ long sectors_dirty_derivative;
+
+ struct bio_set *bio_split;
+
+ unsigned data_csum:1;
+
+ int (*cache_miss)(struct btree *, struct search *,
+ struct bio *, unsigned);
+ int (*ioctl) (struct bcache_device *, fmode_t, unsigned, unsigned long);
+
+ struct bio_split_pool bio_split_hook;
+};
+
+struct io {
+ /* Used to track sequential IO so it can be skipped */
+ struct hlist_node hash;
+ struct list_head lru;
+
+ unsigned long jiffies;
+ unsigned sequential;
+ sector_t last;
+};
+
+struct cached_dev {
+ struct list_head list;
+ struct bcache_device disk;
+ struct block_device *bdev;
+
+ struct cache_sb sb;
+ struct bio sb_bio;
+ struct bio_vec sb_bv[1];
+ struct closure sb_write;
+ struct semaphore sb_write_mutex;
+
+ /* Refcount on the cache set. Always nonzero when we're caching. */
+ atomic_t count;
+ struct work_struct detach;
+
+ /*
+ * Device might not be running if it's dirty and the cache set hasn't
+ * showed up yet.
+ */
+ atomic_t running;
+
+ /*
+ * Writes take a shared lock from start to finish; scanning for dirty
+ * data to refill the rb tree requires an exclusive lock.
+ */
+ struct rw_semaphore writeback_lock;
+
+ /*
+ * Nonzero, and writeback has a refcount (d->count), iff there is dirty
+ * data in the cache. Protected by writeback_lock; must have an
+ * shared lock to set and exclusive lock to clear.
+ */
+ atomic_t has_dirty;
+
+ struct bch_ratelimit writeback_rate;
+ struct delayed_work writeback_rate_update;
+
+ /*
+ * Internal to the writeback code, so read_dirty() can keep track of
+ * where it's at.
+ */
+ sector_t last_read;
+
+ /* Limit number of writeback bios in flight */
+ struct semaphore in_flight;
+ struct task_struct *writeback_thread;
+
+ struct keybuf writeback_keys;
+
+ /* For tracking sequential IO */
+#define RECENT_IO_BITS 7
+#define RECENT_IO (1 << RECENT_IO_BITS)
+ struct io io[RECENT_IO];
+ struct hlist_head io_hash[RECENT_IO + 1];
+ struct list_head io_lru;
+ spinlock_t io_lock;
+
+ struct cache_accounting accounting;
+
+ /* The rest of this all shows up in sysfs */
+ unsigned sequential_cutoff;
+ unsigned readahead;
+
+ unsigned verify:1;
+ unsigned bypass_torture_test:1;
+
+ unsigned partial_stripes_expensive:1;
+ unsigned writeback_metadata:1;
+ unsigned writeback_running:1;
+ unsigned char writeback_percent;
+ unsigned writeback_delay;
+
+ uint64_t writeback_rate_target;
+ int64_t writeback_rate_proportional;
+ int64_t writeback_rate_derivative;
+ int64_t writeback_rate_change;
+
+ unsigned writeback_rate_update_seconds;
+ unsigned writeback_rate_d_term;
+ unsigned writeback_rate_p_term_inverse;
+};
+
+enum alloc_reserve {
+ RESERVE_BTREE,
+ RESERVE_PRIO,
+ RESERVE_MOVINGGC,
+ RESERVE_NONE,
+ RESERVE_NR,
+};
+
+struct cache {
+ struct cache_set *set;
+ struct cache_sb sb;
+ struct bio sb_bio;
+ struct bio_vec sb_bv[1];
+
+ struct kobject kobj;
+ struct block_device *bdev;
+
+ struct task_struct *alloc_thread;
+
+ struct closure prio;
+ struct prio_set *disk_buckets;
+
+ /*
+ * When allocating new buckets, prio_write() gets first dibs - since we
+ * may not be allocate at all without writing priorities and gens.
+ * prio_buckets[] contains the last buckets we wrote priorities to (so
+ * gc can mark them as metadata), prio_next[] contains the buckets
+ * allocated for the next prio write.
+ */
+ uint64_t *prio_buckets;
+ uint64_t *prio_last_buckets;
+
+ /*
+ * free: Buckets that are ready to be used
+ *
+ * free_inc: Incoming buckets - these are buckets that currently have
+ * cached data in them, and we can't reuse them until after we write
+ * their new gen to disk. After prio_write() finishes writing the new
+ * gens/prios, they'll be moved to the free list (and possibly discarded
+ * in the process)
+ */
+ DECLARE_FIFO(long, free)[RESERVE_NR];
+ DECLARE_FIFO(long, free_inc);
+
+ size_t fifo_last_bucket;
+
+ /* Allocation stuff: */
+ struct bucket *buckets;
+
+ DECLARE_HEAP(struct bucket *, heap);
+
+ /*
+ * If nonzero, we know we aren't going to find any buckets to invalidate
+ * until a gc finishes - otherwise we could pointlessly burn a ton of
+ * cpu
+ */
+ unsigned invalidate_needs_gc:1;
+
+ bool discard; /* Get rid of? */
+
+ struct journal_device journal;
+
+ /* The rest of this all shows up in sysfs */
+#define IO_ERROR_SHIFT 20
+ atomic_t io_errors;
+ atomic_t io_count;
+
+ atomic_long_t meta_sectors_written;
+ atomic_long_t btree_sectors_written;
+ atomic_long_t sectors_written;
+
+ struct bio_split_pool bio_split_hook;
+};
+
+struct gc_stat {
+ size_t nodes;
+ size_t key_bytes;
+
+ size_t nkeys;
+ uint64_t data; /* sectors */
+ unsigned in_use; /* percent */
+};
+
+/*
+ * Flag bits, for how the cache set is shutting down, and what phase it's at:
+ *
+ * CACHE_SET_UNREGISTERING means we're not just shutting down, we're detaching
+ * all the backing devices first (their cached data gets invalidated, and they
+ * won't automatically reattach).
+ *
+ * CACHE_SET_STOPPING always gets set first when we're closing down a cache set;
+ * we'll continue to run normally for awhile with CACHE_SET_STOPPING set (i.e.
+ * flushing dirty data).
+ *
+ * CACHE_SET_RUNNING means all cache devices have been registered and journal
+ * replay is complete.
+ */
+#define CACHE_SET_UNREGISTERING 0
+#define CACHE_SET_STOPPING 1
+#define CACHE_SET_RUNNING 2
+
+struct cache_set {
+ struct closure cl;
+
+ struct list_head list;
+ struct kobject kobj;
+ struct kobject internal;
+ struct dentry *debug;
+ struct cache_accounting accounting;
+
+ unsigned long flags;
+
+ struct cache_sb sb;
+
+ struct cache *cache[MAX_CACHES_PER_SET];
+ struct cache *cache_by_alloc[MAX_CACHES_PER_SET];
+ int caches_loaded;
+
+ struct bcache_device **devices;
+ struct list_head cached_devs;
+ uint64_t cached_dev_sectors;
+ struct closure caching;
+
+ struct closure sb_write;
+ struct semaphore sb_write_mutex;
+
+ mempool_t *search;
+ mempool_t *bio_meta;
+ struct bio_set *bio_split;
+
+ /* For the btree cache */
+ struct shrinker shrink;
+
+ /* For the btree cache and anything allocation related */
+ struct mutex bucket_lock;
+
+ /* log2(bucket_size), in sectors */
+ unsigned short bucket_bits;
+
+ /* log2(block_size), in sectors */
+ unsigned short block_bits;
+
+ /*
+ * Default number of pages for a new btree node - may be less than a
+ * full bucket
+ */
+ unsigned btree_pages;
+
+ /*
+ * Lists of struct btrees; lru is the list for structs that have memory
+ * allocated for actual btree node, freed is for structs that do not.
+ *
+ * We never free a struct btree, except on shutdown - we just put it on
+ * the btree_cache_freed list and reuse it later. This simplifies the
+ * code, and it doesn't cost us much memory as the memory usage is
+ * dominated by buffers that hold the actual btree node data and those
+ * can be freed - and the number of struct btrees allocated is
+ * effectively bounded.
+ *
+ * btree_cache_freeable effectively is a small cache - we use it because
+ * high order page allocations can be rather expensive, and it's quite
+ * common to delete and allocate btree nodes in quick succession. It
+ * should never grow past ~2-3 nodes in practice.
+ */
+ struct list_head btree_cache;
+ struct list_head btree_cache_freeable;
+ struct list_head btree_cache_freed;
+
+ /* Number of elements in btree_cache + btree_cache_freeable lists */
+ unsigned btree_cache_used;
+
+ /*
+ * If we need to allocate memory for a new btree node and that
+ * allocation fails, we can cannibalize another node in the btree cache
+ * to satisfy the allocation - lock to guarantee only one thread does
+ * this at a time:
+ */
+ wait_queue_head_t btree_cache_wait;
+ struct task_struct *btree_cache_alloc_lock;
+
+ /*
+ * When we free a btree node, we increment the gen of the bucket the
+ * node is in - but we can't rewrite the prios and gens until we
+ * finished whatever it is we were doing, otherwise after a crash the
+ * btree node would be freed but for say a split, we might not have the
+ * pointers to the new nodes inserted into the btree yet.
+ *
+ * This is a refcount that blocks prio_write() until the new keys are
+ * written.
+ */
+ atomic_t prio_blocked;
+ wait_queue_head_t bucket_wait;
+
+ /*
+ * For any bio we don't skip we subtract the number of sectors from
+ * rescale; when it hits 0 we rescale all the bucket priorities.
+ */
+ atomic_t rescale;
+ /*
+ * When we invalidate buckets, we use both the priority and the amount
+ * of good data to determine which buckets to reuse first - to weight
+ * those together consistently we keep track of the smallest nonzero
+ * priority of any bucket.
+ */
+ uint16_t min_prio;
+
+ /*
+ * max(gen - last_gc) for all buckets. When it gets too big we have to gc
+ * to keep gens from wrapping around.
+ */
+ uint8_t need_gc;
+ struct gc_stat gc_stats;
+ size_t nbuckets;
+
+ struct task_struct *gc_thread;
+ /* Where in the btree gc currently is */
+ struct bkey gc_done;
+
+ /*
+ * The allocation code needs gc_mark in struct bucket to be correct, but
+ * it's not while a gc is in progress. Protected by bucket_lock.
+ */
+ int gc_mark_valid;
+
+ /* Counts how many sectors bio_insert has added to the cache */
+ atomic_t sectors_to_gc;
+
+ wait_queue_head_t moving_gc_wait;
+ struct keybuf moving_gc_keys;
+ /* Number of moving GC bios in flight */
+ struct semaphore moving_in_flight;
+
+ struct workqueue_struct *moving_gc_wq;
+
+ struct btree *root;
+
+#ifdef CONFIG_BCACHE_DEBUG
+ struct btree *verify_data;
+ struct bset *verify_ondisk;
+ struct mutex verify_lock;
+#endif
+
+ unsigned nr_uuids;
+ struct uuid_entry *uuids;
+ BKEY_PADDED(uuid_bucket);
+ struct closure uuid_write;
+ struct semaphore uuid_write_mutex;
+
+ /*
+ * A btree node on disk could have too many bsets for an iterator to fit
+ * on the stack - have to dynamically allocate them
+ */
+ mempool_t *fill_iter;
+
+ struct bset_sort_state sort;
+
+ /* List of buckets we're currently writing data to */
+ struct list_head data_buckets;
+ spinlock_t data_bucket_lock;
+
+ struct journal journal;
+
+#define CONGESTED_MAX 1024
+ unsigned congested_last_us;
+ atomic_t congested;
+
+ /* The rest of this all shows up in sysfs */
+ unsigned congested_read_threshold_us;
+ unsigned congested_write_threshold_us;
+
+ struct time_stats btree_gc_time;
+ struct time_stats btree_split_time;
+ struct time_stats btree_read_time;
+
+ atomic_long_t cache_read_races;
+ atomic_long_t writeback_keys_done;
+ atomic_long_t writeback_keys_failed;
+
+ enum {
+ ON_ERROR_UNREGISTER,
+ ON_ERROR_PANIC,
+ } on_error;
+ unsigned error_limit;
+ unsigned error_decay;
+
+ unsigned short journal_delay_ms;
+ bool expensive_debug_checks;
+ unsigned verify:1;
+ unsigned key_merging_disabled:1;
+ unsigned gc_always_rewrite:1;
+ unsigned shrinker_disabled:1;
+ unsigned copy_gc_enabled:1;
+
+#define BUCKET_HASH_BITS 12
+ struct hlist_head bucket_hash[1 << BUCKET_HASH_BITS];
+};
+
+struct bbio {
+ unsigned submit_time_us;
+ union {
+ struct bkey key;
+ uint64_t _pad[3];
+ /*
+ * We only need pad = 3 here because we only ever carry around a
+ * single pointer - i.e. the pointer we're doing io to/from.
+ */
+ };
+ struct bio bio;
+};
+
+#define BTREE_PRIO USHRT_MAX
+#define INITIAL_PRIO 32768U
+
+#define btree_bytes(c) ((c)->btree_pages * PAGE_SIZE)
+#define btree_blocks(b) \
+ ((unsigned) (KEY_SIZE(&b->key) >> (b)->c->block_bits))
+
+#define btree_default_blocks(c) \
+ ((unsigned) ((PAGE_SECTORS * (c)->btree_pages) >> (c)->block_bits))
+
+#define bucket_pages(c) ((c)->sb.bucket_size / PAGE_SECTORS)
+#define bucket_bytes(c) ((c)->sb.bucket_size << 9)
+#define block_bytes(c) ((c)->sb.block_size << 9)
+
+#define prios_per_bucket(c) \
+ ((bucket_bytes(c) - sizeof(struct prio_set)) / \
+ sizeof(struct bucket_disk))
+#define prio_buckets(c) \
+ DIV_ROUND_UP((size_t) (c)->sb.nbuckets, prios_per_bucket(c))
+
+static inline size_t sector_to_bucket(struct cache_set *c, sector_t s)
+{
+ return s >> c->bucket_bits;
+}
+
+static inline sector_t bucket_to_sector(struct cache_set *c, size_t b)
+{
+ return ((sector_t) b) << c->bucket_bits;
+}
+
+static inline sector_t bucket_remainder(struct cache_set *c, sector_t s)
+{
+ return s & (c->sb.bucket_size - 1);
+}
+
+static inline struct cache *PTR_CACHE(struct cache_set *c,
+ const struct bkey *k,
+ unsigned ptr)
+{
+ return c->cache[PTR_DEV(k, ptr)];
+}
+
+static inline size_t PTR_BUCKET_NR(struct cache_set *c,
+ const struct bkey *k,
+ unsigned ptr)
+{
+ return sector_to_bucket(c, PTR_OFFSET(k, ptr));
+}
+
+static inline struct bucket *PTR_BUCKET(struct cache_set *c,
+ const struct bkey *k,
+ unsigned ptr)
+{
+ return PTR_CACHE(c, k, ptr)->buckets + PTR_BUCKET_NR(c, k, ptr);
+}
+
+static inline uint8_t gen_after(uint8_t a, uint8_t b)
+{
+ uint8_t r = a - b;
+ return r > 128U ? 0 : r;
+}
+
+static inline uint8_t ptr_stale(struct cache_set *c, const struct bkey *k,
+ unsigned i)
+{
+ return gen_after(PTR_BUCKET(c, k, i)->gen, PTR_GEN(k, i));
+}
+
+static inline bool ptr_available(struct cache_set *c, const struct bkey *k,
+ unsigned i)
+{
+ return (PTR_DEV(k, i) < MAX_CACHES_PER_SET) && PTR_CACHE(c, k, i);
+}
+
+/* Btree key macros */
+
+/*
+ * This is used for various on disk data structures - cache_sb, prio_set, bset,
+ * jset: The checksum is _always_ the first 8 bytes of these structs
+ */
+#define csum_set(i) \
+ bch_crc64(((void *) (i)) + sizeof(uint64_t), \
+ ((void *) bset_bkey_last(i)) - \
+ (((void *) (i)) + sizeof(uint64_t)))
+
+/* Error handling macros */
+
+#define btree_bug(b, ...) \
+do { \
+ if (bch_cache_set_error((b)->c, __VA_ARGS__)) \
+ dump_stack(); \
+} while (0)
+
+#define cache_bug(c, ...) \
+do { \
+ if (bch_cache_set_error(c, __VA_ARGS__)) \
+ dump_stack(); \
+} while (0)
+
+#define btree_bug_on(cond, b, ...) \
+do { \
+ if (cond) \
+ btree_bug(b, __VA_ARGS__); \
+} while (0)
+
+#define cache_bug_on(cond, c, ...) \
+do { \
+ if (cond) \
+ cache_bug(c, __VA_ARGS__); \
+} while (0)
+
+#define cache_set_err_on(cond, c, ...) \
+do { \
+ if (cond) \
+ bch_cache_set_error(c, __VA_ARGS__); \
+} while (0)
+
+/* Looping macros */
+
+#define for_each_cache(ca, cs, iter) \
+ for (iter = 0; ca = cs->cache[iter], iter < (cs)->sb.nr_in_set; iter++)
+
+#define for_each_bucket(b, ca) \
+ for (b = (ca)->buckets + (ca)->sb.first_bucket; \
+ b < (ca)->buckets + (ca)->sb.nbuckets; b++)
+
+static inline void cached_dev_put(struct cached_dev *dc)
+{
+ if (atomic_dec_and_test(&dc->count))
+ schedule_work(&dc->detach);
+}
+
+static inline bool cached_dev_get(struct cached_dev *dc)
+{
+ if (!atomic_inc_not_zero(&dc->count))
+ return false;
+
+ /* Paired with the mb in cached_dev_attach */
+ smp_mb__after_atomic();
+ return true;
+}
+
+/*
+ * bucket_gc_gen() returns the difference between the bucket's current gen and
+ * the oldest gen of any pointer into that bucket in the btree (last_gc).
+ */
+
+static inline uint8_t bucket_gc_gen(struct bucket *b)
+{
+ return b->gen - b->last_gc;
+}
+
+#define BUCKET_GC_GEN_MAX 96U
+
+#define kobj_attribute_write(n, fn) \
+ static struct kobj_attribute ksysfs_##n = __ATTR(n, S_IWUSR, NULL, fn)
+
+#define kobj_attribute_rw(n, show, store) \
+ static struct kobj_attribute ksysfs_##n = \
+ __ATTR(n, S_IWUSR|S_IRUSR, show, store)
+
+static inline void wake_up_allocators(struct cache_set *c)
+{
+ struct cache *ca;
+ unsigned i;
+
+ for_each_cache(ca, c, i)
+ wake_up_process(ca->alloc_thread);
+}
+
+/* Forward declarations */
+
+void bch_count_io_errors(struct cache *, int, const char *);
+void bch_bbio_count_io_errors(struct cache_set *, struct bio *,
+ int, const char *);
+void bch_bbio_endio(struct cache_set *, struct bio *, int, const char *);
+void bch_bbio_free(struct bio *, struct cache_set *);
+struct bio *bch_bbio_alloc(struct cache_set *);
+
+void bch_generic_make_request(struct bio *, struct bio_split_pool *);
+void __bch_submit_bbio(struct bio *, struct cache_set *);
+void bch_submit_bbio(struct bio *, struct cache_set *, struct bkey *, unsigned);
+
+uint8_t bch_inc_gen(struct cache *, struct bucket *);
+void bch_rescale_priorities(struct cache_set *, int);
+
+bool bch_can_invalidate_bucket(struct cache *, struct bucket *);
+void __bch_invalidate_one_bucket(struct cache *, struct bucket *);
+
+void __bch_bucket_free(struct cache *, struct bucket *);
+void bch_bucket_free(struct cache_set *, struct bkey *);
+
+long bch_bucket_alloc(struct cache *, unsigned, bool);
+int __bch_bucket_alloc_set(struct cache_set *, unsigned,
+ struct bkey *, int, bool);
+int bch_bucket_alloc_set(struct cache_set *, unsigned,
+ struct bkey *, int, bool);
+bool bch_alloc_sectors(struct cache_set *, struct bkey *, unsigned,
+ unsigned, unsigned, bool);
+
+__printf(2, 3)
+bool bch_cache_set_error(struct cache_set *, const char *, ...);
+
+void bch_prio_write(struct cache *);
+void bch_write_bdev_super(struct cached_dev *, struct closure *);
+
+extern struct workqueue_struct *bcache_wq;
+extern const char * const bch_cache_modes[];
+extern struct mutex bch_register_lock;
+extern struct list_head bch_cache_sets;
+
+extern struct kobj_type bch_cached_dev_ktype;
+extern struct kobj_type bch_flash_dev_ktype;
+extern struct kobj_type bch_cache_set_ktype;
+extern struct kobj_type bch_cache_set_internal_ktype;
+extern struct kobj_type bch_cache_ktype;
+
+void bch_cached_dev_release(struct kobject *);
+void bch_flash_dev_release(struct kobject *);
+void bch_cache_set_release(struct kobject *);
+void bch_cache_release(struct kobject *);
+
+int bch_uuid_write(struct cache_set *);
+void bcache_write_super(struct cache_set *);
+
+int bch_flash_dev_create(struct cache_set *c, uint64_t size);
+
+int bch_cached_dev_attach(struct cached_dev *, struct cache_set *);
+void bch_cached_dev_detach(struct cached_dev *);
+void bch_cached_dev_run(struct cached_dev *);
+void bcache_device_stop(struct bcache_device *);
+
+void bch_cache_set_unregister(struct cache_set *);
+void bch_cache_set_stop(struct cache_set *);
+
+struct cache_set *bch_cache_set_alloc(struct cache_sb *);
+void bch_btree_cache_free(struct cache_set *);
+int bch_btree_cache_alloc(struct cache_set *);
+void bch_moving_init_cache_set(struct cache_set *);
+int bch_open_buckets_alloc(struct cache_set *);
+void bch_open_buckets_free(struct cache_set *);
+
+int bch_cache_allocator_start(struct cache *ca);
+
+void bch_debug_exit(void);
+int bch_debug_init(struct kobject *);
+void bch_request_exit(void);
+int bch_request_init(void);
+
+#endif /* _BCACHE_H */
diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c
new file mode 100644
index 000000000..646fe8526
--- /dev/null
+++ b/drivers/md/bcache/bset.c
@@ -0,0 +1,1331 @@
+/*
+ * Code for working with individual keys, and sorted sets of keys with in a
+ * btree node
+ *
+ * Copyright 2012 Google, Inc.
+ */
+
+#define pr_fmt(fmt) "bcache: %s() " fmt "\n", __func__
+
+#include "util.h"
+#include "bset.h"
+
+#include <linux/console.h>
+#include <linux/random.h>
+#include <linux/prefetch.h>
+
+#ifdef CONFIG_BCACHE_DEBUG
+
+void bch_dump_bset(struct btree_keys *b, struct bset *i, unsigned set)
+{
+ struct bkey *k, *next;
+
+ for (k = i->start; k < bset_bkey_last(i); k = next) {
+ next = bkey_next(k);
+
+ printk(KERN_ERR "block %u key %u/%u: ", set,
+ (unsigned) ((u64 *) k - i->d), i->keys);
+
+ if (b->ops->key_dump)
+ b->ops->key_dump(b, k);
+ else
+ printk("%llu:%llu\n", KEY_INODE(k), KEY_OFFSET(k));
+
+ if (next < bset_bkey_last(i) &&
+ bkey_cmp(k, b->ops->is_extents ?
+ &START_KEY(next) : next) > 0)
+ printk(KERN_ERR "Key skipped backwards\n");
+ }
+}
+
+void bch_dump_bucket(struct btree_keys *b)
+{
+ unsigned i;
+
+ console_lock();
+ for (i = 0; i <= b->nsets; i++)
+ bch_dump_bset(b, b->set[i].data,
+ bset_sector_offset(b, b->set[i].data));
+ console_unlock();
+}
+
+int __bch_count_data(struct btree_keys *b)
+{
+ unsigned ret = 0;
+ struct btree_iter iter;
+ struct bkey *k;
+
+ if (b->ops->is_extents)
+ for_each_key(b, k, &iter)
+ ret += KEY_SIZE(k);
+ return ret;
+}
+
+void __bch_check_keys(struct btree_keys *b, const char *fmt, ...)
+{
+ va_list args;
+ struct bkey *k, *p = NULL;
+ struct btree_iter iter;
+ const char *err;
+
+ for_each_key(b, k, &iter) {
+ if (b->ops->is_extents) {
+ err = "Keys out of order";
+ if (p && bkey_cmp(&START_KEY(p), &START_KEY(k)) > 0)
+ goto bug;
+
+ if (bch_ptr_invalid(b, k))
+ continue;
+
+ err = "Overlapping keys";
+ if (p && bkey_cmp(p, &START_KEY(k)) > 0)
+ goto bug;
+ } else {
+ if (bch_ptr_bad(b, k))
+ continue;
+
+ err = "Duplicate keys";
+ if (p && !bkey_cmp(p, k))
+ goto bug;
+ }
+ p = k;
+ }
+#if 0
+ err = "Key larger than btree node key";
+ if (p && bkey_cmp(p, &b->key) > 0)
+ goto bug;
+#endif
+ return;
+bug:
+ bch_dump_bucket(b);
+
+ va_start(args, fmt);
+ vprintk(fmt, args);
+ va_end(args);
+
+ panic("bch_check_keys error: %s:\n", err);
+}
+
+static void bch_btree_iter_next_check(struct btree_iter *iter)
+{
+ struct bkey *k = iter->data->k, *next = bkey_next(k);
+
+ if (next < iter->data->end &&
+ bkey_cmp(k, iter->b->ops->is_extents ?
+ &START_KEY(next) : next) > 0) {
+ bch_dump_bucket(iter->b);
+ panic("Key skipped backwards\n");
+ }
+}
+
+#else
+
+static inline void bch_btree_iter_next_check(struct btree_iter *iter) {}
+
+#endif
+
+/* Keylists */
+
+int __bch_keylist_realloc(struct keylist *l, unsigned u64s)
+{
+ size_t oldsize = bch_keylist_nkeys(l);
+ size_t newsize = oldsize + u64s;
+ uint64_t *old_keys = l->keys_p == l->inline_keys ? NULL : l->keys_p;
+ uint64_t *new_keys;
+
+ newsize = roundup_pow_of_two(newsize);
+
+ if (newsize <= KEYLIST_INLINE ||
+ roundup_pow_of_two(oldsize) == newsize)
+ return 0;
+
+ new_keys = krealloc(old_keys, sizeof(uint64_t) * newsize, GFP_NOIO);
+
+ if (!new_keys)
+ return -ENOMEM;
+
+ if (!old_keys)
+ memcpy(new_keys, l->inline_keys, sizeof(uint64_t) * oldsize);
+
+ l->keys_p = new_keys;
+ l->top_p = new_keys + oldsize;
+
+ return 0;
+}
+
+struct bkey *bch_keylist_pop(struct keylist *l)
+{
+ struct bkey *k = l->keys;
+
+ if (k == l->top)
+ return NULL;
+
+ while (bkey_next(k) != l->top)
+ k = bkey_next(k);
+
+ return l->top = k;
+}
+
+void bch_keylist_pop_front(struct keylist *l)
+{
+ l->top_p -= bkey_u64s(l->keys);
+
+ memmove(l->keys,
+ bkey_next(l->keys),
+ bch_keylist_bytes(l));
+}
+
+/* Key/pointer manipulation */
+
+void bch_bkey_copy_single_ptr(struct bkey *dest, const struct bkey *src,
+ unsigned i)
+{
+ BUG_ON(i > KEY_PTRS(src));
+
+ /* Only copy the header, key, and one pointer. */
+ memcpy(dest, src, 2 * sizeof(uint64_t));
+ dest->ptr[0] = src->ptr[i];
+ SET_KEY_PTRS(dest, 1);
+ /* We didn't copy the checksum so clear that bit. */
+ SET_KEY_CSUM(dest, 0);
+}
+
+bool __bch_cut_front(const struct bkey *where, struct bkey *k)
+{
+ unsigned i, len = 0;
+
+ if (bkey_cmp(where, &START_KEY(k)) <= 0)
+ return false;
+
+ if (bkey_cmp(where, k) < 0)
+ len = KEY_OFFSET(k) - KEY_OFFSET(where);
+ else
+ bkey_copy_key(k, where);
+
+ for (i = 0; i < KEY_PTRS(k); i++)
+ SET_PTR_OFFSET(k, i, PTR_OFFSET(k, i) + KEY_SIZE(k) - len);
+
+ BUG_ON(len > KEY_SIZE(k));
+ SET_KEY_SIZE(k, len);
+ return true;
+}
+
+bool __bch_cut_back(const struct bkey *where, struct bkey *k)
+{
+ unsigned len = 0;
+
+ if (bkey_cmp(where, k) >= 0)
+ return false;
+
+ BUG_ON(KEY_INODE(where) != KEY_INODE(k));
+
+ if (bkey_cmp(where, &START_KEY(k)) > 0)
+ len = KEY_OFFSET(where) - KEY_START(k);
+
+ bkey_copy_key(k, where);
+
+ BUG_ON(len > KEY_SIZE(k));
+ SET_KEY_SIZE(k, len);
+ return true;
+}
+
+/* Auxiliary search trees */
+
+/* 32 bits total: */
+#define BKEY_MID_BITS 3
+#define BKEY_EXPONENT_BITS 7
+#define BKEY_MANTISSA_BITS (32 - BKEY_MID_BITS - BKEY_EXPONENT_BITS)
+#define BKEY_MANTISSA_MASK ((1 << BKEY_MANTISSA_BITS) - 1)
+
+struct bkey_float {
+ unsigned exponent:BKEY_EXPONENT_BITS;
+ unsigned m:BKEY_MID_BITS;
+ unsigned mantissa:BKEY_MANTISSA_BITS;
+} __packed;
+
+/*
+ * BSET_CACHELINE was originally intended to match the hardware cacheline size -
+ * it used to be 64, but I realized the lookup code would touch slightly less
+ * memory if it was 128.
+ *
+ * It definites the number of bytes (in struct bset) per struct bkey_float in
+ * the auxiliar search tree - when we're done searching the bset_float tree we
+ * have this many bytes left that we do a linear search over.
+ *
+ * Since (after level 5) every level of the bset_tree is on a new cacheline,
+ * we're touching one fewer cacheline in the bset tree in exchange for one more
+ * cacheline in the linear search - but the linear search might stop before it
+ * gets to the second cacheline.
+ */
+
+#define BSET_CACHELINE 128
+
+/* Space required for the btree node keys */
+static inline size_t btree_keys_bytes(struct btree_keys *b)
+{
+ return PAGE_SIZE << b->page_order;
+}
+
+static inline size_t btree_keys_cachelines(struct btree_keys *b)
+{
+ return btree_keys_bytes(b) / BSET_CACHELINE;
+}
+
+/* Space required for the auxiliary search trees */
+static inline size_t bset_tree_bytes(struct btree_keys *b)
+{
+ return btree_keys_cachelines(b) * sizeof(struct bkey_float);
+}
+
+/* Space required for the prev pointers */
+static inline size_t bset_prev_bytes(struct btree_keys *b)
+{
+ return btree_keys_cachelines(b) * sizeof(uint8_t);
+}
+
+/* Memory allocation */
+
+void bch_btree_keys_free(struct btree_keys *b)
+{
+ struct bset_tree *t = b->set;
+
+ if (bset_prev_bytes(b) < PAGE_SIZE)
+ kfree(t->prev);
+ else
+ free_pages((unsigned long) t->prev,
+ get_order(bset_prev_bytes(b)));
+
+ if (bset_tree_bytes(b) < PAGE_SIZE)
+ kfree(t->tree);
+ else
+ free_pages((unsigned long) t->tree,
+ get_order(bset_tree_bytes(b)));
+
+ free_pages((unsigned long) t->data, b->page_order);
+
+ t->prev = NULL;
+ t->tree = NULL;
+ t->data = NULL;
+}
+EXPORT_SYMBOL(bch_btree_keys_free);
+
+int bch_btree_keys_alloc(struct btree_keys *b, unsigned page_order, gfp_t gfp)
+{
+ struct bset_tree *t = b->set;
+
+ BUG_ON(t->data);
+
+ b->page_order = page_order;
+
+ t->data = (void *) __get_free_pages(gfp, b->page_order);
+ if (!t->data)
+ goto err;
+
+ t->tree = bset_tree_bytes(b) < PAGE_SIZE
+ ? kmalloc(bset_tree_bytes(b), gfp)
+ : (void *) __get_free_pages(gfp, get_order(bset_tree_bytes(b)));
+ if (!t->tree)
+ goto err;
+
+ t->prev = bset_prev_bytes(b) < PAGE_SIZE
+ ? kmalloc(bset_prev_bytes(b), gfp)
+ : (void *) __get_free_pages(gfp, get_order(bset_prev_bytes(b)));
+ if (!t->prev)
+ goto err;
+
+ return 0;
+err:
+ bch_btree_keys_free(b);
+ return -ENOMEM;
+}
+EXPORT_SYMBOL(bch_btree_keys_alloc);
+
+void bch_btree_keys_init(struct btree_keys *b, const struct btree_keys_ops *ops,
+ bool *expensive_debug_checks)
+{
+ unsigned i;
+
+ b->ops = ops;
+ b->expensive_debug_checks = expensive_debug_checks;
+ b->nsets = 0;
+ b->last_set_unwritten = 0;
+
+ /* XXX: shouldn't be needed */
+ for (i = 0; i < MAX_BSETS; i++)
+ b->set[i].size = 0;
+ /*
+ * Second loop starts at 1 because b->keys[0]->data is the memory we
+ * allocated
+ */
+ for (i = 1; i < MAX_BSETS; i++)
+ b->set[i].data = NULL;
+}
+EXPORT_SYMBOL(bch_btree_keys_init);
+
+/* Binary tree stuff for auxiliary search trees */
+
+static unsigned inorder_next(unsigned j, unsigned size)
+{
+ if (j * 2 + 1 < size) {
+ j = j * 2 + 1;
+
+ while (j * 2 < size)
+ j *= 2;
+ } else
+ j >>= ffz(j) + 1;
+
+ return j;
+}
+
+static unsigned inorder_prev(unsigned j, unsigned size)
+{
+ if (j * 2 < size) {
+ j = j * 2;
+
+ while (j * 2 + 1 < size)
+ j = j * 2 + 1;
+ } else
+ j >>= ffs(j);
+
+ return j;
+}
+
+/* I have no idea why this code works... and I'm the one who wrote it
+ *
+ * However, I do know what it does:
+ * Given a binary tree constructed in an array (i.e. how you normally implement
+ * a heap), it converts a node in the tree - referenced by array index - to the
+ * index it would have if you did an inorder traversal.
+ *
+ * Also tested for every j, size up to size somewhere around 6 million.
+ *
+ * The binary tree starts at array index 1, not 0
+ * extra is a function of size:
+ * extra = (size - rounddown_pow_of_two(size - 1)) << 1;
+ */
+static unsigned __to_inorder(unsigned j, unsigned size, unsigned extra)
+{
+ unsigned b = fls(j);
+ unsigned shift = fls(size - 1) - b;
+
+ j ^= 1U << (b - 1);
+ j <<= 1;
+ j |= 1;
+ j <<= shift;
+
+ if (j > extra)
+ j -= (j - extra) >> 1;
+
+ return j;
+}
+
+static unsigned to_inorder(unsigned j, struct bset_tree *t)
+{
+ return __to_inorder(j, t->size, t->extra);
+}
+
+static unsigned __inorder_to_tree(unsigned j, unsigned size, unsigned extra)
+{
+ unsigned shift;
+
+ if (j > extra)
+ j += j - extra;
+
+ shift = ffs(j);
+
+ j >>= shift;
+ j |= roundup_pow_of_two(size) >> shift;
+
+ return j;
+}
+
+static unsigned inorder_to_tree(unsigned j, struct bset_tree *t)
+{
+ return __inorder_to_tree(j, t->size, t->extra);
+}
+
+#if 0
+void inorder_test(void)
+{
+ unsigned long done = 0;
+ ktime_t start = ktime_get();
+
+ for (unsigned size = 2;
+ size < 65536000;
+ size++) {
+ unsigned extra = (size - rounddown_pow_of_two(size - 1)) << 1;
+ unsigned i = 1, j = rounddown_pow_of_two(size - 1);
+
+ if (!(size % 4096))
+ printk(KERN_NOTICE "loop %u, %llu per us\n", size,
+ done / ktime_us_delta(ktime_get(), start));
+
+ while (1) {
+ if (__inorder_to_tree(i, size, extra) != j)
+ panic("size %10u j %10u i %10u", size, j, i);
+
+ if (__to_inorder(j, size, extra) != i)
+ panic("size %10u j %10u i %10u", size, j, i);
+
+ if (j == rounddown_pow_of_two(size) - 1)
+ break;
+
+ BUG_ON(inorder_prev(inorder_next(j, size), size) != j);
+
+ j = inorder_next(j, size);
+ i++;
+ }
+
+ done += size - 1;
+ }
+}
+#endif
+
+/*
+ * Cacheline/offset <-> bkey pointer arithmetic:
+ *
+ * t->tree is a binary search tree in an array; each node corresponds to a key
+ * in one cacheline in t->set (BSET_CACHELINE bytes).
+ *
+ * This means we don't have to store the full index of the key that a node in
+ * the binary tree points to; to_inorder() gives us the cacheline, and then
+ * bkey_float->m gives us the offset within that cacheline, in units of 8 bytes.
+ *
+ * cacheline_to_bkey() and friends abstract out all the pointer arithmetic to
+ * make this work.
+ *
+ * To construct the bfloat for an arbitrary key we need to know what the key
+ * immediately preceding it is: we have to check if the two keys differ in the
+ * bits we're going to store in bkey_float->mantissa. t->prev[j] stores the size
+ * of the previous key so we can walk backwards to it from t->tree[j]'s key.
+ */
+
+static struct bkey *cacheline_to_bkey(struct bset_tree *t, unsigned cacheline,
+ unsigned offset)
+{
+ return ((void *) t->data) + cacheline * BSET_CACHELINE + offset * 8;
+}
+
+static unsigned bkey_to_cacheline(struct bset_tree *t, struct bkey *k)
+{
+ return ((void *) k - (void *) t->data) / BSET_CACHELINE;
+}
+
+static unsigned bkey_to_cacheline_offset(struct bset_tree *t,
+ unsigned cacheline,
+ struct bkey *k)
+{
+ return (u64 *) k - (u64 *) cacheline_to_bkey(t, cacheline, 0);
+}
+
+static struct bkey *tree_to_bkey(struct bset_tree *t, unsigned j)
+{
+ return cacheline_to_bkey(t, to_inorder(j, t), t->tree[j].m);
+}
+
+static struct bkey *tree_to_prev_bkey(struct bset_tree *t, unsigned j)
+{
+ return (void *) (((uint64_t *) tree_to_bkey(t, j)) - t->prev[j]);
+}
+
+/*
+ * For the write set - the one we're currently inserting keys into - we don't
+ * maintain a full search tree, we just keep a simple lookup table in t->prev.
+ */
+static struct bkey *table_to_bkey(struct bset_tree *t, unsigned cacheline)
+{
+ return cacheline_to_bkey(t, cacheline, t->prev[cacheline]);
+}
+
+static inline uint64_t shrd128(uint64_t high, uint64_t low, uint8_t shift)
+{
+ low >>= shift;
+ low |= (high << 1) << (63U - shift);
+ return low;
+}
+
+static inline unsigned bfloat_mantissa(const struct bkey *k,
+ struct bkey_float *f)
+{
+ const uint64_t *p = &k->low - (f->exponent >> 6);
+ return shrd128(p[-1], p[0], f->exponent & 63) & BKEY_MANTISSA_MASK;
+}
+
+static void make_bfloat(struct bset_tree *t, unsigned j)
+{
+ struct bkey_float *f = &t->tree[j];
+ struct bkey *m = tree_to_bkey(t, j);
+ struct bkey *p = tree_to_prev_bkey(t, j);
+
+ struct bkey *l = is_power_of_2(j)
+ ? t->data->start
+ : tree_to_prev_bkey(t, j >> ffs(j));
+
+ struct bkey *r = is_power_of_2(j + 1)
+ ? bset_bkey_idx(t->data, t->data->keys - bkey_u64s(&t->end))
+ : tree_to_bkey(t, j >> (ffz(j) + 1));
+
+ BUG_ON(m < l || m > r);
+ BUG_ON(bkey_next(p) != m);
+
+ if (KEY_INODE(l) != KEY_INODE(r))
+ f->exponent = fls64(KEY_INODE(r) ^ KEY_INODE(l)) + 64;
+ else
+ f->exponent = fls64(r->low ^ l->low);
+
+ f->exponent = max_t(int, f->exponent - BKEY_MANTISSA_BITS, 0);
+
+ /*
+ * Setting f->exponent = 127 flags this node as failed, and causes the
+ * lookup code to fall back to comparing against the original key.
+ */
+
+ if (bfloat_mantissa(m, f) != bfloat_mantissa(p, f))
+ f->mantissa = bfloat_mantissa(m, f) - 1;
+ else
+ f->exponent = 127;
+}
+
+static void bset_alloc_tree(struct btree_keys *b, struct bset_tree *t)
+{
+ if (t != b->set) {
+ unsigned j = roundup(t[-1].size,
+ 64 / sizeof(struct bkey_float));
+
+ t->tree = t[-1].tree + j;
+ t->prev = t[-1].prev + j;
+ }
+
+ while (t < b->set + MAX_BSETS)
+ t++->size = 0;
+}
+
+static void bch_bset_build_unwritten_tree(struct btree_keys *b)
+{
+ struct bset_tree *t = bset_tree_last(b);
+
+ BUG_ON(b->last_set_unwritten);
+ b->last_set_unwritten = 1;
+
+ bset_alloc_tree(b, t);
+
+ if (t->tree != b->set->tree + btree_keys_cachelines(b)) {
+ t->prev[0] = bkey_to_cacheline_offset(t, 0, t->data->start);
+ t->size = 1;
+ }
+}
+
+void bch_bset_init_next(struct btree_keys *b, struct bset *i, uint64_t magic)
+{
+ if (i != b->set->data) {
+ b->set[++b->nsets].data = i;
+ i->seq = b->set->data->seq;
+ } else
+ get_random_bytes(&i->seq, sizeof(uint64_t));
+
+ i->magic = magic;
+ i->version = 0;
+ i->keys = 0;
+
+ bch_bset_build_unwritten_tree(b);
+}
+EXPORT_SYMBOL(bch_bset_init_next);
+
+void bch_bset_build_written_tree(struct btree_keys *b)
+{
+ struct bset_tree *t = bset_tree_last(b);
+ struct bkey *prev = NULL, *k = t->data->start;
+ unsigned j, cacheline = 1;
+
+ b->last_set_unwritten = 0;
+
+ bset_alloc_tree(b, t);
+
+ t->size = min_t(unsigned,
+ bkey_to_cacheline(t, bset_bkey_last(t->data)),
+ b->set->tree + btree_keys_cachelines(b) - t->tree);
+
+ if (t->size < 2) {
+ t->size = 0;
+ return;
+ }
+
+ t->extra = (t->size - rounddown_pow_of_two(t->size - 1)) << 1;
+
+ /* First we figure out where the first key in each cacheline is */
+ for (j = inorder_next(0, t->size);
+ j;
+ j = inorder_next(j, t->size)) {
+ while (bkey_to_cacheline(t, k) < cacheline)
+ prev = k, k = bkey_next(k);
+
+ t->prev[j] = bkey_u64s(prev);
+ t->tree[j].m = bkey_to_cacheline_offset(t, cacheline++, k);
+ }
+
+ while (bkey_next(k) != bset_bkey_last(t->data))
+ k = bkey_next(k);
+
+ t->end = *k;
+
+ /* Then we build the tree */
+ for (j = inorder_next(0, t->size);
+ j;
+ j = inorder_next(j, t->size))
+ make_bfloat(t, j);
+}
+EXPORT_SYMBOL(bch_bset_build_written_tree);
+
+/* Insert */
+
+void bch_bset_fix_invalidated_key(struct btree_keys *b, struct bkey *k)
+{
+ struct bset_tree *t;
+ unsigned inorder, j = 1;
+
+ for (t = b->set; t <= bset_tree_last(b); t++)
+ if (k < bset_bkey_last(t->data))
+ goto found_set;
+
+ BUG();
+found_set:
+ if (!t->size || !bset_written(b, t))
+ return;
+
+ inorder = bkey_to_cacheline(t, k);
+
+ if (k == t->data->start)
+ goto fix_left;
+
+ if (bkey_next(k) == bset_bkey_last(t->data)) {
+ t->end = *k;
+ goto fix_right;
+ }
+
+ j = inorder_to_tree(inorder, t);
+
+ if (j &&
+ j < t->size &&
+ k == tree_to_bkey(t, j))
+fix_left: do {
+ make_bfloat(t, j);
+ j = j * 2;
+ } while (j < t->size);
+
+ j = inorder_to_tree(inorder + 1, t);
+
+ if (j &&
+ j < t->size &&
+ k == tree_to_prev_bkey(t, j))
+fix_right: do {
+ make_bfloat(t, j);
+ j = j * 2 + 1;
+ } while (j < t->size);
+}
+EXPORT_SYMBOL(bch_bset_fix_invalidated_key);
+
+static void bch_bset_fix_lookup_table(struct btree_keys *b,
+ struct bset_tree *t,
+ struct bkey *k)
+{
+ unsigned shift = bkey_u64s(k);
+ unsigned j = bkey_to_cacheline(t, k);
+
+ /* We're getting called from btree_split() or btree_gc, just bail out */
+ if (!t->size)
+ return;
+
+ /* k is the key we just inserted; we need to find the entry in the
+ * lookup table for the first key that is strictly greater than k:
+ * it's either k's cacheline or the next one
+ */
+ while (j < t->size &&
+ table_to_bkey(t, j) <= k)
+ j++;
+
+ /* Adjust all the lookup table entries, and find a new key for any that
+ * have gotten too big
+ */
+ for (; j < t->size; j++) {
+ t->prev[j] += shift;
+
+ if (t->prev[j] > 7) {
+ k = table_to_bkey(t, j - 1);
+
+ while (k < cacheline_to_bkey(t, j, 0))
+ k = bkey_next(k);
+
+ t->prev[j] = bkey_to_cacheline_offset(t, j, k);
+ }
+ }
+
+ if (t->size == b->set->tree + btree_keys_cachelines(b) - t->tree)
+ return;
+
+ /* Possibly add a new entry to the end of the lookup table */
+
+ for (k = table_to_bkey(t, t->size - 1);
+ k != bset_bkey_last(t->data);
+ k = bkey_next(k))
+ if (t->size == bkey_to_cacheline(t, k)) {
+ t->prev[t->size] = bkey_to_cacheline_offset(t, t->size, k);
+ t->size++;
+ }
+}
+
+/*
+ * Tries to merge l and r: l should be lower than r
+ * Returns true if we were able to merge. If we did merge, l will be the merged
+ * key, r will be untouched.
+ */
+bool bch_bkey_try_merge(struct btree_keys *b, struct bkey *l, struct bkey *r)
+{
+ if (!b->ops->key_merge)
+ return false;
+
+ /*
+ * Generic header checks
+ * Assumes left and right are in order
+ * Left and right must be exactly aligned
+ */
+ if (!bch_bkey_equal_header(l, r) ||
+ bkey_cmp(l, &START_KEY(r)))
+ return false;
+
+ return b->ops->key_merge(b, l, r);
+}
+EXPORT_SYMBOL(bch_bkey_try_merge);
+
+void bch_bset_insert(struct btree_keys *b, struct bkey *where,
+ struct bkey *insert)
+{
+ struct bset_tree *t = bset_tree_last(b);
+
+ BUG_ON(!b->last_set_unwritten);
+ BUG_ON(bset_byte_offset(b, t->data) +
+ __set_bytes(t->data, t->data->keys + bkey_u64s(insert)) >
+ PAGE_SIZE << b->page_order);
+
+ memmove((uint64_t *) where + bkey_u64s(insert),
+ where,
+ (void *) bset_bkey_last(t->data) - (void *) where);
+
+ t->data->keys += bkey_u64s(insert);
+ bkey_copy(where, insert);
+ bch_bset_fix_lookup_table(b, t, where);
+}
+EXPORT_SYMBOL(bch_bset_insert);
+
+unsigned bch_btree_insert_key(struct btree_keys *b, struct bkey *k,
+ struct bkey *replace_key)
+{
+ unsigned status = BTREE_INSERT_STATUS_NO_INSERT;
+ struct bset *i = bset_tree_last(b)->data;
+ struct bkey *m, *prev = NULL;
+ struct btree_iter iter;
+
+ BUG_ON(b->ops->is_extents && !KEY_SIZE(k));
+
+ m = bch_btree_iter_init(b, &iter, b->ops->is_extents
+ ? PRECEDING_KEY(&START_KEY(k))
+ : PRECEDING_KEY(k));
+
+ if (b->ops->insert_fixup(b, k, &iter, replace_key))
+ return status;
+
+ status = BTREE_INSERT_STATUS_INSERT;
+
+ while (m != bset_bkey_last(i) &&
+ bkey_cmp(k, b->ops->is_extents ? &START_KEY(m) : m) > 0)
+ prev = m, m = bkey_next(m);
+
+ /* prev is in the tree, if we merge we're done */
+ status = BTREE_INSERT_STATUS_BACK_MERGE;
+ if (prev &&
+ bch_bkey_try_merge(b, prev, k))
+ goto merged;
+#if 0
+ status = BTREE_INSERT_STATUS_OVERWROTE;
+ if (m != bset_bkey_last(i) &&
+ KEY_PTRS(m) == KEY_PTRS(k) && !KEY_SIZE(m))
+ goto copy;
+#endif
+ status = BTREE_INSERT_STATUS_FRONT_MERGE;
+ if (m != bset_bkey_last(i) &&
+ bch_bkey_try_merge(b, k, m))
+ goto copy;
+
+ bch_bset_insert(b, m, k);
+copy: bkey_copy(m, k);
+merged:
+ return status;
+}
+EXPORT_SYMBOL(bch_btree_insert_key);
+
+/* Lookup */
+
+struct bset_search_iter {
+ struct bkey *l, *r;
+};
+
+static struct bset_search_iter bset_search_write_set(struct bset_tree *t,
+ const struct bkey *search)
+{
+ unsigned li = 0, ri = t->size;
+
+ while (li + 1 != ri) {
+ unsigned m = (li + ri) >> 1;
+
+ if (bkey_cmp(table_to_bkey(t, m), search) > 0)
+ ri = m;
+ else
+ li = m;
+ }
+
+ return (struct bset_search_iter) {
+ table_to_bkey(t, li),
+ ri < t->size ? table_to_bkey(t, ri) : bset_bkey_last(t->data)
+ };
+}
+
+static struct bset_search_iter bset_search_tree(struct bset_tree *t,
+ const struct bkey *search)
+{
+ struct bkey *l, *r;
+ struct bkey_float *f;
+ unsigned inorder, j, n = 1;
+
+ do {
+ unsigned p = n << 4;
+ p &= ((int) (p - t->size)) >> 31;
+
+ prefetch(&t->tree[p]);
+
+ j = n;
+ f = &t->tree[j];
+
+ /*
+ * n = (f->mantissa > bfloat_mantissa())
+ * ? j * 2
+ * : j * 2 + 1;
+ *
+ * We need to subtract 1 from f->mantissa for the sign bit trick
+ * to work - that's done in make_bfloat()
+ */
+ if (likely(f->exponent != 127))
+ n = j * 2 + (((unsigned)
+ (f->mantissa -
+ bfloat_mantissa(search, f))) >> 31);
+ else
+ n = (bkey_cmp(tree_to_bkey(t, j), search) > 0)
+ ? j * 2
+ : j * 2 + 1;
+ } while (n < t->size);
+
+ inorder = to_inorder(j, t);
+
+ /*
+ * n would have been the node we recursed to - the low bit tells us if
+ * we recursed left or recursed right.
+ */
+ if (n & 1) {
+ l = cacheline_to_bkey(t, inorder, f->m);
+
+ if (++inorder != t->size) {
+ f = &t->tree[inorder_next(j, t->size)];
+ r = cacheline_to_bkey(t, inorder, f->m);
+ } else
+ r = bset_bkey_last(t->data);
+ } else {
+ r = cacheline_to_bkey(t, inorder, f->m);
+
+ if (--inorder) {
+ f = &t->tree[inorder_prev(j, t->size)];
+ l = cacheline_to_bkey(t, inorder, f->m);
+ } else
+ l = t->data->start;
+ }
+
+ return (struct bset_search_iter) {l, r};
+}
+
+struct bkey *__bch_bset_search(struct btree_keys *b, struct bset_tree *t,
+ const struct bkey *search)
+{
+ struct bset_search_iter i;
+
+ /*
+ * First, we search for a cacheline, then lastly we do a linear search
+ * within that cacheline.
+ *
+ * To search for the cacheline, there's three different possibilities:
+ * * The set is too small to have a search tree, so we just do a linear
+ * search over the whole set.
+ * * The set is the one we're currently inserting into; keeping a full
+ * auxiliary search tree up to date would be too expensive, so we
+ * use a much simpler lookup table to do a binary search -
+ * bset_search_write_set().
+ * * Or we use the auxiliary search tree we constructed earlier -
+ * bset_search_tree()
+ */
+
+ if (unlikely(!t->size)) {
+ i.l = t->data->start;
+ i.r = bset_bkey_last(t->data);
+ } else if (bset_written(b, t)) {
+ /*
+ * Each node in the auxiliary search tree covers a certain range
+ * of bits, and keys above and below the set it covers might
+ * differ outside those bits - so we have to special case the
+ * start and end - handle that here:
+ */
+
+ if (unlikely(bkey_cmp(search, &t->end) >= 0))
+ return bset_bkey_last(t->data);
+
+ if (unlikely(bkey_cmp(search, t->data->start) < 0))
+ return t->data->start;
+
+ i = bset_search_tree(t, search);
+ } else {
+ BUG_ON(!b->nsets &&
+ t->size < bkey_to_cacheline(t, bset_bkey_last(t->data)));
+
+ i = bset_search_write_set(t, search);
+ }
+
+ if (btree_keys_expensive_checks(b)) {
+ BUG_ON(bset_written(b, t) &&
+ i.l != t->data->start &&
+ bkey_cmp(tree_to_prev_bkey(t,
+ inorder_to_tree(bkey_to_cacheline(t, i.l), t)),
+ search) > 0);
+
+ BUG_ON(i.r != bset_bkey_last(t->data) &&
+ bkey_cmp(i.r, search) <= 0);
+ }
+
+ while (likely(i.l != i.r) &&
+ bkey_cmp(i.l, search) <= 0)
+ i.l = bkey_next(i.l);
+
+ return i.l;
+}
+EXPORT_SYMBOL(__bch_bset_search);
+
+/* Btree iterator */
+
+typedef bool (btree_iter_cmp_fn)(struct btree_iter_set,
+ struct btree_iter_set);
+
+static inline bool btree_iter_cmp(struct btree_iter_set l,
+ struct btree_iter_set r)
+{
+ return bkey_cmp(l.k, r.k) > 0;
+}
+
+static inline bool btree_iter_end(struct btree_iter *iter)
+{
+ return !iter->used;
+}
+
+void bch_btree_iter_push(struct btree_iter *iter, struct bkey *k,
+ struct bkey *end)
+{
+ if (k != end)
+ BUG_ON(!heap_add(iter,
+ ((struct btree_iter_set) { k, end }),
+ btree_iter_cmp));
+}
+
+static struct bkey *__bch_btree_iter_init(struct btree_keys *b,
+ struct btree_iter *iter,
+ struct bkey *search,
+ struct bset_tree *start)
+{
+ struct bkey *ret = NULL;
+ iter->size = ARRAY_SIZE(iter->data);
+ iter->used = 0;
+
+#ifdef CONFIG_BCACHE_DEBUG
+ iter->b = b;
+#endif
+
+ for (; start <= bset_tree_last(b); start++) {
+ ret = bch_bset_search(b, start, search);
+ bch_btree_iter_push(iter, ret, bset_bkey_last(start->data));
+ }
+
+ return ret;
+}
+
+struct bkey *bch_btree_iter_init(struct btree_keys *b,
+ struct btree_iter *iter,
+ struct bkey *search)
+{
+ return __bch_btree_iter_init(b, iter, search, b->set);
+}
+EXPORT_SYMBOL(bch_btree_iter_init);
+
+static inline struct bkey *__bch_btree_iter_next(struct btree_iter *iter,
+ btree_iter_cmp_fn *cmp)
+{
+ struct btree_iter_set unused;
+ struct bkey *ret = NULL;
+
+ if (!btree_iter_end(iter)) {
+ bch_btree_iter_next_check(iter);
+
+ ret = iter->data->k;
+ iter->data->k = bkey_next(iter->data->k);
+
+ if (iter->data->k > iter->data->end) {
+ WARN_ONCE(1, "bset was corrupt!\n");
+ iter->data->k = iter->data->end;
+ }
+
+ if (iter->data->k == iter->data->end)
+ heap_pop(iter, unused, cmp);
+ else
+ heap_sift(iter, 0, cmp);
+ }
+
+ return ret;
+}
+
+struct bkey *bch_btree_iter_next(struct btree_iter *iter)
+{
+ return __bch_btree_iter_next(iter, btree_iter_cmp);
+
+}
+EXPORT_SYMBOL(bch_btree_iter_next);
+
+struct bkey *bch_btree_iter_next_filter(struct btree_iter *iter,
+ struct btree_keys *b, ptr_filter_fn fn)
+{
+ struct bkey *ret;
+
+ do {
+ ret = bch_btree_iter_next(iter);
+ } while (ret && fn(b, ret));
+
+ return ret;
+}
+
+/* Mergesort */
+
+void bch_bset_sort_state_free(struct bset_sort_state *state)
+{
+ if (state->pool)
+ mempool_destroy(state->pool);
+}
+
+int bch_bset_sort_state_init(struct bset_sort_state *state, unsigned page_order)
+{
+ spin_lock_init(&state->time.lock);
+
+ state->page_order = page_order;
+ state->crit_factor = int_sqrt(1 << page_order);
+
+ state->pool = mempool_create_page_pool(1, page_order);
+ if (!state->pool)
+ return -ENOMEM;
+
+ return 0;
+}
+EXPORT_SYMBOL(bch_bset_sort_state_init);
+
+static void btree_mergesort(struct btree_keys *b, struct bset *out,
+ struct btree_iter *iter,
+ bool fixup, bool remove_stale)
+{
+ int i;
+ struct bkey *k, *last = NULL;
+ BKEY_PADDED(k) tmp;
+ bool (*bad)(struct btree_keys *, const struct bkey *) = remove_stale
+ ? bch_ptr_bad
+ : bch_ptr_invalid;
+
+ /* Heapify the iterator, using our comparison function */
+ for (i = iter->used / 2 - 1; i >= 0; --i)
+ heap_sift(iter, i, b->ops->sort_cmp);
+
+ while (!btree_iter_end(iter)) {
+ if (b->ops->sort_fixup && fixup)
+ k = b->ops->sort_fixup(iter, &tmp.k);
+ else
+ k = NULL;
+
+ if (!k)
+ k = __bch_btree_iter_next(iter, b->ops->sort_cmp);
+
+ if (bad(b, k))
+ continue;
+
+ if (!last) {
+ last = out->start;
+ bkey_copy(last, k);
+ } else if (!bch_bkey_try_merge(b, last, k)) {
+ last = bkey_next(last);
+ bkey_copy(last, k);
+ }
+ }
+
+ out->keys = last ? (uint64_t *) bkey_next(last) - out->d : 0;
+
+ pr_debug("sorted %i keys", out->keys);
+}
+
+static void __btree_sort(struct btree_keys *b, struct btree_iter *iter,
+ unsigned start, unsigned order, bool fixup,
+ struct bset_sort_state *state)
+{
+ uint64_t start_time;
+ bool used_mempool = false;
+ struct bset *out = (void *) __get_free_pages(__GFP_NOWARN|GFP_NOWAIT,
+ order);
+ if (!out) {
+ struct page *outp;
+
+ BUG_ON(order > state->page_order);
+
+ outp = mempool_alloc(state->pool, GFP_NOIO);
+ out = page_address(outp);
+ used_mempool = true;
+ order = state->page_order;
+ }
+
+ start_time = local_clock();
+
+ btree_mergesort(b, out, iter, fixup, false);
+ b->nsets = start;
+
+ if (!start && order == b->page_order) {
+ /*
+ * Our temporary buffer is the same size as the btree node's
+ * buffer, we can just swap buffers instead of doing a big
+ * memcpy()
+ */
+
+ out->magic = b->set->data->magic;
+ out->seq = b->set->data->seq;
+ out->version = b->set->data->version;
+ swap(out, b->set->data);
+ } else {
+ b->set[start].data->keys = out->keys;
+ memcpy(b->set[start].data->start, out->start,
+ (void *) bset_bkey_last(out) - (void *) out->start);
+ }
+
+ if (used_mempool)
+ mempool_free(virt_to_page(out), state->pool);
+ else
+ free_pages((unsigned long) out, order);
+
+ bch_bset_build_written_tree(b);
+
+ if (!start)
+ bch_time_stats_update(&state->time, start_time);
+}
+
+void bch_btree_sort_partial(struct btree_keys *b, unsigned start,
+ struct bset_sort_state *state)
+{
+ size_t order = b->page_order, keys = 0;
+ struct btree_iter iter;
+ int oldsize = bch_count_data(b);
+
+ __bch_btree_iter_init(b, &iter, NULL, &b->set[start]);
+
+ if (start) {
+ unsigned i;
+
+ for (i = start; i <= b->nsets; i++)
+ keys += b->set[i].data->keys;
+
+ order = get_order(__set_bytes(b->set->data, keys));
+ }
+
+ __btree_sort(b, &iter, start, order, false, state);
+
+ EBUG_ON(oldsize >= 0 && bch_count_data(b) != oldsize);
+}
+EXPORT_SYMBOL(bch_btree_sort_partial);
+
+void bch_btree_sort_and_fix_extents(struct btree_keys *b,
+ struct btree_iter *iter,
+ struct bset_sort_state *state)
+{
+ __btree_sort(b, iter, 0, b->page_order, true, state);
+}
+
+void bch_btree_sort_into(struct btree_keys *b, struct btree_keys *new,
+ struct bset_sort_state *state)
+{
+ uint64_t start_time = local_clock();
+
+ struct btree_iter iter;
+ bch_btree_iter_init(b, &iter, NULL);
+
+ btree_mergesort(b, new->set->data, &iter, false, true);
+
+ bch_time_stats_update(&state->time, start_time);
+
+ new->set->size = 0; // XXX: why?
+}
+
+#define SORT_CRIT (4096 / sizeof(uint64_t))
+
+void bch_btree_sort_lazy(struct btree_keys *b, struct bset_sort_state *state)
+{
+ unsigned crit = SORT_CRIT;
+ int i;
+
+ /* Don't sort if nothing to do */
+ if (!b->nsets)
+ goto out;
+
+ for (i = b->nsets - 1; i >= 0; --i) {
+ crit *= state->crit_factor;
+
+ if (b->set[i].data->keys < crit) {
+ bch_btree_sort_partial(b, i, state);
+ return;
+ }
+ }
+
+ /* Sort if we'd overflow */
+ if (b->nsets + 1 == MAX_BSETS) {
+ bch_btree_sort(b, state);
+ return;
+ }
+
+out:
+ bch_bset_build_written_tree(b);
+}
+EXPORT_SYMBOL(bch_btree_sort_lazy);
+
+void bch_btree_keys_stats(struct btree_keys *b, struct bset_stats *stats)
+{
+ unsigned i;
+
+ for (i = 0; i <= b->nsets; i++) {
+ struct bset_tree *t = &b->set[i];
+ size_t bytes = t->data->keys * sizeof(uint64_t);
+ size_t j;
+
+ if (bset_written(b, t)) {
+ stats->sets_written++;
+ stats->bytes_written += bytes;
+
+ stats->floats += t->size - 1;
+
+ for (j = 1; j < t->size; j++)
+ if (t->tree[j].exponent == 127)
+ stats->failed++;
+ } else {
+ stats->sets_unwritten++;
+ stats->bytes_unwritten += bytes;
+ }
+ }
+}
diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h
new file mode 100644
index 000000000..ae964624e
--- /dev/null
+++ b/drivers/md/bcache/bset.h
@@ -0,0 +1,566 @@
+#ifndef _BCACHE_BSET_H
+#define _BCACHE_BSET_H
+
+#include <linux/bcache.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+
+#include "util.h" /* for time_stats */
+
+/*
+ * BKEYS:
+ *
+ * A bkey contains a key, a size field, a variable number of pointers, and some
+ * ancillary flag bits.
+ *
+ * We use two different functions for validating bkeys, bch_ptr_invalid and
+ * bch_ptr_bad().
+ *
+ * bch_ptr_invalid() primarily filters out keys and pointers that would be
+ * invalid due to some sort of bug, whereas bch_ptr_bad() filters out keys and
+ * pointer that occur in normal practice but don't point to real data.
+ *
+ * The one exception to the rule that ptr_invalid() filters out invalid keys is
+ * that it also filters out keys of size 0 - these are keys that have been
+ * completely overwritten. It'd be safe to delete these in memory while leaving
+ * them on disk, just unnecessary work - so we filter them out when resorting
+ * instead.
+ *
+ * We can't filter out stale keys when we're resorting, because garbage
+ * collection needs to find them to ensure bucket gens don't wrap around -
+ * unless we're rewriting the btree node those stale keys still exist on disk.
+ *
+ * We also implement functions here for removing some number of sectors from the
+ * front or the back of a bkey - this is mainly used for fixing overlapping
+ * extents, by removing the overlapping sectors from the older key.
+ *
+ * BSETS:
+ *
+ * A bset is an array of bkeys laid out contiguously in memory in sorted order,
+ * along with a header. A btree node is made up of a number of these, written at
+ * different times.
+ *
+ * There could be many of them on disk, but we never allow there to be more than
+ * 4 in memory - we lazily resort as needed.
+ *
+ * We implement code here for creating and maintaining auxiliary search trees
+ * (described below) for searching an individial bset, and on top of that we
+ * implement a btree iterator.
+ *
+ * BTREE ITERATOR:
+ *
+ * Most of the code in bcache doesn't care about an individual bset - it needs
+ * to search entire btree nodes and iterate over them in sorted order.
+ *
+ * The btree iterator code serves both functions; it iterates through the keys
+ * in a btree node in sorted order, starting from either keys after a specific
+ * point (if you pass it a search key) or the start of the btree node.
+ *
+ * AUXILIARY SEARCH TREES:
+ *
+ * Since keys are variable length, we can't use a binary search on a bset - we
+ * wouldn't be able to find the start of the next key. But binary searches are
+ * slow anyways, due to terrible cache behaviour; bcache originally used binary
+ * searches and that code topped out at under 50k lookups/second.
+ *
+ * So we need to construct some sort of lookup table. Since we only insert keys
+ * into the last (unwritten) set, most of the keys within a given btree node are
+ * usually in sets that are mostly constant. We use two different types of
+ * lookup tables to take advantage of this.
+ *
+ * Both lookup tables share in common that they don't index every key in the
+ * set; they index one key every BSET_CACHELINE bytes, and then a linear search
+ * is used for the rest.
+ *
+ * For sets that have been written to disk and are no longer being inserted
+ * into, we construct a binary search tree in an array - traversing a binary
+ * search tree in an array gives excellent locality of reference and is very
+ * fast, since both children of any node are adjacent to each other in memory
+ * (and their grandchildren, and great grandchildren...) - this means
+ * prefetching can be used to great effect.
+ *
+ * It's quite useful performance wise to keep these nodes small - not just
+ * because they're more likely to be in L2, but also because we can prefetch
+ * more nodes on a single cacheline and thus prefetch more iterations in advance
+ * when traversing this tree.
+ *
+ * Nodes in the auxiliary search tree must contain both a key to compare against
+ * (we don't want to fetch the key from the set, that would defeat the purpose),
+ * and a pointer to the key. We use a few tricks to compress both of these.
+ *
+ * To compress the pointer, we take advantage of the fact that one node in the
+ * search tree corresponds to precisely BSET_CACHELINE bytes in the set. We have
+ * a function (to_inorder()) that takes the index of a node in a binary tree and
+ * returns what its index would be in an inorder traversal, so we only have to
+ * store the low bits of the offset.
+ *
+ * The key is 84 bits (KEY_DEV + key->key, the offset on the device). To
+ * compress that, we take advantage of the fact that when we're traversing the
+ * search tree at every iteration we know that both our search key and the key
+ * we're looking for lie within some range - bounded by our previous
+ * comparisons. (We special case the start of a search so that this is true even
+ * at the root of the tree).
+ *
+ * So we know the key we're looking for is between a and b, and a and b don't
+ * differ higher than bit 50, we don't need to check anything higher than bit
+ * 50.
+ *
+ * We don't usually need the rest of the bits, either; we only need enough bits
+ * to partition the key range we're currently checking. Consider key n - the
+ * key our auxiliary search tree node corresponds to, and key p, the key
+ * immediately preceding n. The lowest bit we need to store in the auxiliary
+ * search tree is the highest bit that differs between n and p.
+ *
+ * Note that this could be bit 0 - we might sometimes need all 80 bits to do the
+ * comparison. But we'd really like our nodes in the auxiliary search tree to be
+ * of fixed size.
+ *
+ * The solution is to make them fixed size, and when we're constructing a node
+ * check if p and n differed in the bits we needed them to. If they don't we
+ * flag that node, and when doing lookups we fallback to comparing against the
+ * real key. As long as this doesn't happen to often (and it seems to reliably
+ * happen a bit less than 1% of the time), we win - even on failures, that key
+ * is then more likely to be in cache than if we were doing binary searches all
+ * the way, since we're touching so much less memory.
+ *
+ * The keys in the auxiliary search tree are stored in (software) floating
+ * point, with an exponent and a mantissa. The exponent needs to be big enough
+ * to address all the bits in the original key, but the number of bits in the
+ * mantissa is somewhat arbitrary; more bits just gets us fewer failures.
+ *
+ * We need 7 bits for the exponent and 3 bits for the key's offset (since keys
+ * are 8 byte aligned); using 22 bits for the mantissa means a node is 4 bytes.
+ * We need one node per 128 bytes in the btree node, which means the auxiliary
+ * search trees take up 3% as much memory as the btree itself.
+ *
+ * Constructing these auxiliary search trees is moderately expensive, and we
+ * don't want to be constantly rebuilding the search tree for the last set
+ * whenever we insert another key into it. For the unwritten set, we use a much
+ * simpler lookup table - it's just a flat array, so index i in the lookup table
+ * corresponds to the i range of BSET_CACHELINE bytes in the set. Indexing
+ * within each byte range works the same as with the auxiliary search trees.
+ *
+ * These are much easier to keep up to date when we insert a key - we do it
+ * somewhat lazily; when we shift a key up we usually just increment the pointer
+ * to it, only when it would overflow do we go to the trouble of finding the
+ * first key in that range of bytes again.
+ */
+
+struct btree_keys;
+struct btree_iter;
+struct btree_iter_set;
+struct bkey_float;
+
+#define MAX_BSETS 4U
+
+struct bset_tree {
+ /*
+ * We construct a binary tree in an array as if the array
+ * started at 1, so that things line up on the same cachelines
+ * better: see comments in bset.c at cacheline_to_bkey() for
+ * details
+ */
+
+ /* size of the binary tree and prev array */
+ unsigned size;
+
+ /* function of size - precalculated for to_inorder() */
+ unsigned extra;
+
+ /* copy of the last key in the set */
+ struct bkey end;
+ struct bkey_float *tree;
+
+ /*
+ * The nodes in the bset tree point to specific keys - this
+ * array holds the sizes of the previous key.
+ *
+ * Conceptually it's a member of struct bkey_float, but we want
+ * to keep bkey_float to 4 bytes and prev isn't used in the fast
+ * path.
+ */
+ uint8_t *prev;
+
+ /* The actual btree node, with pointers to each sorted set */
+ struct bset *data;
+};
+
+struct btree_keys_ops {
+ bool (*sort_cmp)(struct btree_iter_set,
+ struct btree_iter_set);
+ struct bkey *(*sort_fixup)(struct btree_iter *, struct bkey *);
+ bool (*insert_fixup)(struct btree_keys *, struct bkey *,
+ struct btree_iter *, struct bkey *);
+ bool (*key_invalid)(struct btree_keys *,
+ const struct bkey *);
+ bool (*key_bad)(struct btree_keys *, const struct bkey *);
+ bool (*key_merge)(struct btree_keys *,
+ struct bkey *, struct bkey *);
+ void (*key_to_text)(char *, size_t, const struct bkey *);
+ void (*key_dump)(struct btree_keys *, const struct bkey *);
+
+ /*
+ * Only used for deciding whether to use START_KEY(k) or just the key
+ * itself in a couple places
+ */
+ bool is_extents;
+};
+
+struct btree_keys {
+ const struct btree_keys_ops *ops;
+ uint8_t page_order;
+ uint8_t nsets;
+ unsigned last_set_unwritten:1;
+ bool *expensive_debug_checks;
+
+ /*
+ * Sets of sorted keys - the real btree node - plus a binary search tree
+ *
+ * set[0] is special; set[0]->tree, set[0]->prev and set[0]->data point
+ * to the memory we have allocated for this btree node. Additionally,
+ * set[0]->data points to the entire btree node as it exists on disk.
+ */
+ struct bset_tree set[MAX_BSETS];
+};
+
+static inline struct bset_tree *bset_tree_last(struct btree_keys *b)
+{
+ return b->set + b->nsets;
+}
+
+static inline bool bset_written(struct btree_keys *b, struct bset_tree *t)
+{
+ return t <= b->set + b->nsets - b->last_set_unwritten;
+}
+
+static inline bool bkey_written(struct btree_keys *b, struct bkey *k)
+{
+ return !b->last_set_unwritten || k < b->set[b->nsets].data->start;
+}
+
+static inline unsigned bset_byte_offset(struct btree_keys *b, struct bset *i)
+{
+ return ((size_t) i) - ((size_t) b->set->data);
+}
+
+static inline unsigned bset_sector_offset(struct btree_keys *b, struct bset *i)
+{
+ return bset_byte_offset(b, i) >> 9;
+}
+
+#define __set_bytes(i, k) (sizeof(*(i)) + (k) * sizeof(uint64_t))
+#define set_bytes(i) __set_bytes(i, i->keys)
+
+#define __set_blocks(i, k, block_bytes) \
+ DIV_ROUND_UP(__set_bytes(i, k), block_bytes)
+#define set_blocks(i, block_bytes) \
+ __set_blocks(i, (i)->keys, block_bytes)
+
+static inline size_t bch_btree_keys_u64s_remaining(struct btree_keys *b)
+{
+ struct bset_tree *t = bset_tree_last(b);
+
+ BUG_ON((PAGE_SIZE << b->page_order) <
+ (bset_byte_offset(b, t->data) + set_bytes(t->data)));
+
+ if (!b->last_set_unwritten)
+ return 0;
+
+ return ((PAGE_SIZE << b->page_order) -
+ (bset_byte_offset(b, t->data) + set_bytes(t->data))) /
+ sizeof(u64);
+}
+
+static inline struct bset *bset_next_set(struct btree_keys *b,
+ unsigned block_bytes)
+{
+ struct bset *i = bset_tree_last(b)->data;
+
+ return ((void *) i) + roundup(set_bytes(i), block_bytes);
+}
+
+void bch_btree_keys_free(struct btree_keys *);
+int bch_btree_keys_alloc(struct btree_keys *, unsigned, gfp_t);
+void bch_btree_keys_init(struct btree_keys *, const struct btree_keys_ops *,
+ bool *);
+
+void bch_bset_init_next(struct btree_keys *, struct bset *, uint64_t);
+void bch_bset_build_written_tree(struct btree_keys *);
+void bch_bset_fix_invalidated_key(struct btree_keys *, struct bkey *);
+bool bch_bkey_try_merge(struct btree_keys *, struct bkey *, struct bkey *);
+void bch_bset_insert(struct btree_keys *, struct bkey *, struct bkey *);
+unsigned bch_btree_insert_key(struct btree_keys *, struct bkey *,
+ struct bkey *);
+
+enum {
+ BTREE_INSERT_STATUS_NO_INSERT = 0,
+ BTREE_INSERT_STATUS_INSERT,
+ BTREE_INSERT_STATUS_BACK_MERGE,
+ BTREE_INSERT_STATUS_OVERWROTE,
+ BTREE_INSERT_STATUS_FRONT_MERGE,
+};
+
+/* Btree key iteration */
+
+struct btree_iter {
+ size_t size, used;
+#ifdef CONFIG_BCACHE_DEBUG
+ struct btree_keys *b;
+#endif
+ struct btree_iter_set {
+ struct bkey *k, *end;
+ } data[MAX_BSETS];
+};
+
+typedef bool (*ptr_filter_fn)(struct btree_keys *, const struct bkey *);
+
+struct bkey *bch_btree_iter_next(struct btree_iter *);
+struct bkey *bch_btree_iter_next_filter(struct btree_iter *,
+ struct btree_keys *, ptr_filter_fn);
+
+void bch_btree_iter_push(struct btree_iter *, struct bkey *, struct bkey *);
+struct bkey *bch_btree_iter_init(struct btree_keys *, struct btree_iter *,
+ struct bkey *);
+
+struct bkey *__bch_bset_search(struct btree_keys *, struct bset_tree *,
+ const struct bkey *);
+
+/*
+ * Returns the first key that is strictly greater than search
+ */
+static inline struct bkey *bch_bset_search(struct btree_keys *b,
+ struct bset_tree *t,
+ const struct bkey *search)
+{
+ return search ? __bch_bset_search(b, t, search) : t->data->start;
+}
+
+#define for_each_key_filter(b, k, iter, filter) \
+ for (bch_btree_iter_init((b), (iter), NULL); \
+ ((k) = bch_btree_iter_next_filter((iter), (b), filter));)
+
+#define for_each_key(b, k, iter) \
+ for (bch_btree_iter_init((b), (iter), NULL); \
+ ((k) = bch_btree_iter_next(iter));)
+
+/* Sorting */
+
+struct bset_sort_state {
+ mempool_t *pool;
+
+ unsigned page_order;
+ unsigned crit_factor;
+
+ struct time_stats time;
+};
+
+void bch_bset_sort_state_free(struct bset_sort_state *);
+int bch_bset_sort_state_init(struct bset_sort_state *, unsigned);
+void bch_btree_sort_lazy(struct btree_keys *, struct bset_sort_state *);
+void bch_btree_sort_into(struct btree_keys *, struct btree_keys *,
+ struct bset_sort_state *);
+void bch_btree_sort_and_fix_extents(struct btree_keys *, struct btree_iter *,
+ struct bset_sort_state *);
+void bch_btree_sort_partial(struct btree_keys *, unsigned,
+ struct bset_sort_state *);
+
+static inline void bch_btree_sort(struct btree_keys *b,
+ struct bset_sort_state *state)
+{
+ bch_btree_sort_partial(b, 0, state);
+}
+
+struct bset_stats {
+ size_t sets_written, sets_unwritten;
+ size_t bytes_written, bytes_unwritten;
+ size_t floats, failed;
+};
+
+void bch_btree_keys_stats(struct btree_keys *, struct bset_stats *);
+
+/* Bkey utility code */
+
+#define bset_bkey_last(i) bkey_idx((struct bkey *) (i)->d, (i)->keys)
+
+static inline struct bkey *bset_bkey_idx(struct bset *i, unsigned idx)
+{
+ return bkey_idx(i->start, idx);
+}
+
+static inline void bkey_init(struct bkey *k)
+{
+ *k = ZERO_KEY;
+}
+
+static __always_inline int64_t bkey_cmp(const struct bkey *l,
+ const struct bkey *r)
+{
+ return unlikely(KEY_INODE(l) != KEY_INODE(r))
+ ? (int64_t) KEY_INODE(l) - (int64_t) KEY_INODE(r)
+ : (int64_t) KEY_OFFSET(l) - (int64_t) KEY_OFFSET(r);
+}
+
+void bch_bkey_copy_single_ptr(struct bkey *, const struct bkey *,
+ unsigned);
+bool __bch_cut_front(const struct bkey *, struct bkey *);
+bool __bch_cut_back(const struct bkey *, struct bkey *);
+
+static inline bool bch_cut_front(const struct bkey *where, struct bkey *k)
+{
+ BUG_ON(bkey_cmp(where, k) > 0);
+ return __bch_cut_front(where, k);
+}
+
+static inline bool bch_cut_back(const struct bkey *where, struct bkey *k)
+{
+ BUG_ON(bkey_cmp(where, &START_KEY(k)) < 0);
+ return __bch_cut_back(where, k);
+}
+
+#define PRECEDING_KEY(_k) \
+({ \
+ struct bkey *_ret = NULL; \
+ \
+ if (KEY_INODE(_k) || KEY_OFFSET(_k)) { \
+ _ret = &KEY(KEY_INODE(_k), KEY_OFFSET(_k), 0); \
+ \
+ if (!_ret->low) \
+ _ret->high--; \
+ _ret->low--; \
+ } \
+ \
+ _ret; \
+})
+
+static inline bool bch_ptr_invalid(struct btree_keys *b, const struct bkey *k)
+{
+ return b->ops->key_invalid(b, k);
+}
+
+static inline bool bch_ptr_bad(struct btree_keys *b, const struct bkey *k)
+{
+ return b->ops->key_bad(b, k);
+}
+
+static inline void bch_bkey_to_text(struct btree_keys *b, char *buf,
+ size_t size, const struct bkey *k)
+{
+ return b->ops->key_to_text(buf, size, k);
+}
+
+static inline bool bch_bkey_equal_header(const struct bkey *l,
+ const struct bkey *r)
+{
+ return (KEY_DIRTY(l) == KEY_DIRTY(r) &&
+ KEY_PTRS(l) == KEY_PTRS(r) &&
+ KEY_CSUM(l) == KEY_CSUM(r));
+}
+
+/* Keylists */
+
+struct keylist {
+ union {
+ struct bkey *keys;
+ uint64_t *keys_p;
+ };
+ union {
+ struct bkey *top;
+ uint64_t *top_p;
+ };
+
+ /* Enough room for btree_split's keys without realloc */
+#define KEYLIST_INLINE 16
+ uint64_t inline_keys[KEYLIST_INLINE];
+};
+
+static inline void bch_keylist_init(struct keylist *l)
+{
+ l->top_p = l->keys_p = l->inline_keys;
+}
+
+static inline void bch_keylist_init_single(struct keylist *l, struct bkey *k)
+{
+ l->keys = k;
+ l->top = bkey_next(k);
+}
+
+static inline void bch_keylist_push(struct keylist *l)
+{
+ l->top = bkey_next(l->top);
+}
+
+static inline void bch_keylist_add(struct keylist *l, struct bkey *k)
+{
+ bkey_copy(l->top, k);
+ bch_keylist_push(l);
+}
+
+static inline bool bch_keylist_empty(struct keylist *l)
+{
+ return l->top == l->keys;
+}
+
+static inline void bch_keylist_reset(struct keylist *l)
+{
+ l->top = l->keys;
+}
+
+static inline void bch_keylist_free(struct keylist *l)
+{
+ if (l->keys_p != l->inline_keys)
+ kfree(l->keys_p);
+}
+
+static inline size_t bch_keylist_nkeys(struct keylist *l)
+{
+ return l->top_p - l->keys_p;
+}
+
+static inline size_t bch_keylist_bytes(struct keylist *l)
+{
+ return bch_keylist_nkeys(l) * sizeof(uint64_t);
+}
+
+struct bkey *bch_keylist_pop(struct keylist *);
+void bch_keylist_pop_front(struct keylist *);
+int __bch_keylist_realloc(struct keylist *, unsigned);
+
+/* Debug stuff */
+
+#ifdef CONFIG_BCACHE_DEBUG
+
+int __bch_count_data(struct btree_keys *);
+void __bch_check_keys(struct btree_keys *, const char *, ...);
+void bch_dump_bset(struct btree_keys *, struct bset *, unsigned);
+void bch_dump_bucket(struct btree_keys *);
+
+#else
+
+static inline int __bch_count_data(struct btree_keys *b) { return -1; }
+static inline void __bch_check_keys(struct btree_keys *b, const char *fmt, ...) {}
+static inline void bch_dump_bucket(struct btree_keys *b) {}
+void bch_dump_bset(struct btree_keys *, struct bset *, unsigned);
+
+#endif
+
+static inline bool btree_keys_expensive_checks(struct btree_keys *b)
+{
+#ifdef CONFIG_BCACHE_DEBUG
+ return *b->expensive_debug_checks;
+#else
+ return false;
+#endif
+}
+
+static inline int bch_count_data(struct btree_keys *b)
+{
+ return btree_keys_expensive_checks(b) ? __bch_count_data(b) : -1;
+}
+
+#define bch_check_keys(b, ...) \
+do { \
+ if (btree_keys_expensive_checks(b)) \
+ __bch_check_keys(b, __VA_ARGS__); \
+} while (0)
+
+#endif
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
new file mode 100644
index 000000000..00cde40db
--- /dev/null
+++ b/drivers/md/bcache/btree.c
@@ -0,0 +1,2530 @@
+/*
+ * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com>
+ *
+ * Uses a block device as cache for other block devices; optimized for SSDs.
+ * All allocation is done in buckets, which should match the erase block size
+ * of the device.
+ *
+ * Buckets containing cached data are kept on a heap sorted by priority;
+ * bucket priority is increased on cache hit, and periodically all the buckets
+ * on the heap have their priority scaled down. This currently is just used as
+ * an LRU but in the future should allow for more intelligent heuristics.
+ *
+ * Buckets have an 8 bit counter; freeing is accomplished by incrementing the
+ * counter. Garbage collection is used to remove stale pointers.
+ *
+ * Indexing is done via a btree; nodes are not necessarily fully sorted, rather
+ * as keys are inserted we only sort the pages that have not yet been written.
+ * When garbage collection is run, we resort the entire node.
+ *
+ * All configuration is done via sysfs; see Documentation/bcache.txt.
+ */
+
+#include "bcache.h"
+#include "btree.h"
+#include "debug.h"
+#include "extents.h"
+
+#include <linux/slab.h>
+#include <linux/bitops.h>
+#include <linux/freezer.h>
+#include <linux/hash.h>
+#include <linux/kthread.h>
+#include <linux/prefetch.h>
+#include <linux/random.h>
+#include <linux/rcupdate.h>
+#include <trace/events/bcache.h>
+
+/*
+ * Todo:
+ * register_bcache: Return errors out to userspace correctly
+ *
+ * Writeback: don't undirty key until after a cache flush
+ *
+ * Create an iterator for key pointers
+ *
+ * On btree write error, mark bucket such that it won't be freed from the cache
+ *
+ * Journalling:
+ * Check for bad keys in replay
+ * Propagate barriers
+ * Refcount journal entries in journal_replay
+ *
+ * Garbage collection:
+ * Finish incremental gc
+ * Gc should free old UUIDs, data for invalid UUIDs
+ *
+ * Provide a way to list backing device UUIDs we have data cached for, and
+ * probably how long it's been since we've seen them, and a way to invalidate
+ * dirty data for devices that will never be attached again
+ *
+ * Keep 1 min/5 min/15 min statistics of how busy a block device has been, so
+ * that based on that and how much dirty data we have we can keep writeback
+ * from being starved
+ *
+ * Add a tracepoint or somesuch to watch for writeback starvation
+ *
+ * When btree depth > 1 and splitting an interior node, we have to make sure
+ * alloc_bucket() cannot fail. This should be true but is not completely
+ * obvious.
+ *
+ * Plugging?
+ *
+ * If data write is less than hard sector size of ssd, round up offset in open
+ * bucket to the next whole sector
+ *
+ * Superblock needs to be fleshed out for multiple cache devices
+ *
+ * Add a sysfs tunable for the number of writeback IOs in flight
+ *
+ * Add a sysfs tunable for the number of open data buckets
+ *
+ * IO tracking: Can we track when one process is doing io on behalf of another?
+ * IO tracking: Don't use just an average, weigh more recent stuff higher
+ *
+ * Test module load/unload
+ */
+
+#define MAX_NEED_GC 64
+#define MAX_SAVE_PRIO 72
+
+#define PTR_DIRTY_BIT (((uint64_t) 1 << 36))
+
+#define PTR_HASH(c, k) \
+ (((k)->ptr[0] >> c->bucket_bits) | PTR_GEN(k, 0))
+
+#define insert_lock(s, b) ((b)->level <= (s)->lock)
+
+/*
+ * These macros are for recursing down the btree - they handle the details of
+ * locking and looking up nodes in the cache for you. They're best treated as
+ * mere syntax when reading code that uses them.
+ *
+ * op->lock determines whether we take a read or a write lock at a given depth.
+ * If you've got a read lock and find that you need a write lock (i.e. you're
+ * going to have to split), set op->lock and return -EINTR; btree_root() will
+ * call you again and you'll have the correct lock.
+ */
+
+/**
+ * btree - recurse down the btree on a specified key
+ * @fn: function to call, which will be passed the child node
+ * @key: key to recurse on
+ * @b: parent btree node
+ * @op: pointer to struct btree_op
+ */
+#define btree(fn, key, b, op, ...) \
+({ \
+ int _r, l = (b)->level - 1; \
+ bool _w = l <= (op)->lock; \
+ struct btree *_child = bch_btree_node_get((b)->c, op, key, l, \
+ _w, b); \
+ if (!IS_ERR(_child)) { \
+ _r = bch_btree_ ## fn(_child, op, ##__VA_ARGS__); \
+ rw_unlock(_w, _child); \
+ } else \
+ _r = PTR_ERR(_child); \
+ _r; \
+})
+
+/**
+ * btree_root - call a function on the root of the btree
+ * @fn: function to call, which will be passed the child node
+ * @c: cache set
+ * @op: pointer to struct btree_op
+ */
+#define btree_root(fn, c, op, ...) \
+({ \
+ int _r = -EINTR; \
+ do { \
+ struct btree *_b = (c)->root; \
+ bool _w = insert_lock(op, _b); \
+ rw_lock(_w, _b, _b->level); \
+ if (_b == (c)->root && \
+ _w == insert_lock(op, _b)) { \
+ _r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__); \
+ } \
+ rw_unlock(_w, _b); \
+ bch_cannibalize_unlock(c); \
+ if (_r == -EINTR) \
+ schedule(); \
+ } while (_r == -EINTR); \
+ \
+ finish_wait(&(c)->btree_cache_wait, &(op)->wait); \
+ _r; \
+})
+
+static inline struct bset *write_block(struct btree *b)
+{
+ return ((void *) btree_bset_first(b)) + b->written * block_bytes(b->c);
+}
+
+static void bch_btree_init_next(struct btree *b)
+{
+ /* If not a leaf node, always sort */
+ if (b->level && b->keys.nsets)
+ bch_btree_sort(&b->keys, &b->c->sort);
+ else
+ bch_btree_sort_lazy(&b->keys, &b->c->sort);
+
+ if (b->written < btree_blocks(b))
+ bch_bset_init_next(&b->keys, write_block(b),
+ bset_magic(&b->c->sb));
+
+}
+
+/* Btree key manipulation */
+
+void bkey_put(struct cache_set *c, struct bkey *k)
+{
+ unsigned i;
+
+ for (i = 0; i < KEY_PTRS(k); i++)
+ if (ptr_available(c, k, i))
+ atomic_dec_bug(&PTR_BUCKET(c, k, i)->pin);
+}
+
+/* Btree IO */
+
+static uint64_t btree_csum_set(struct btree *b, struct bset *i)
+{
+ uint64_t crc = b->key.ptr[0];
+ void *data = (void *) i + 8, *end = bset_bkey_last(i);
+
+ crc = bch_crc64_update(crc, data, end - data);
+ return crc ^ 0xffffffffffffffffULL;
+}
+
+void bch_btree_node_read_done(struct btree *b)
+{
+ const char *err = "bad btree header";
+ struct bset *i = btree_bset_first(b);
+ struct btree_iter *iter;
+
+ iter = mempool_alloc(b->c->fill_iter, GFP_NOIO);
+ iter->size = b->c->sb.bucket_size / b->c->sb.block_size;
+ iter->used = 0;
+
+#ifdef CONFIG_BCACHE_DEBUG
+ iter->b = &b->keys;
+#endif
+
+ if (!i->seq)
+ goto err;
+
+ for (;
+ b->written < btree_blocks(b) && i->seq == b->keys.set[0].data->seq;
+ i = write_block(b)) {
+ err = "unsupported bset version";
+ if (i->version > BCACHE_BSET_VERSION)
+ goto err;
+
+ err = "bad btree header";
+ if (b->written + set_blocks(i, block_bytes(b->c)) >
+ btree_blocks(b))
+ goto err;
+
+ err = "bad magic";
+ if (i->magic != bset_magic(&b->c->sb))
+ goto err;
+
+ err = "bad checksum";
+ switch (i->version) {
+ case 0:
+ if (i->csum != csum_set(i))
+ goto err;
+ break;
+ case BCACHE_BSET_VERSION:
+ if (i->csum != btree_csum_set(b, i))
+ goto err;
+ break;
+ }
+
+ err = "empty set";
+ if (i != b->keys.set[0].data && !i->keys)
+ goto err;
+
+ bch_btree_iter_push(iter, i->start, bset_bkey_last(i));
+
+ b->written += set_blocks(i, block_bytes(b->c));
+ }
+
+ err = "corrupted btree";
+ for (i = write_block(b);
+ bset_sector_offset(&b->keys, i) < KEY_SIZE(&b->key);
+ i = ((void *) i) + block_bytes(b->c))
+ if (i->seq == b->keys.set[0].data->seq)
+ goto err;
+
+ bch_btree_sort_and_fix_extents(&b->keys, iter, &b->c->sort);
+
+ i = b->keys.set[0].data;
+ err = "short btree key";
+ if (b->keys.set[0].size &&
+ bkey_cmp(&b->key, &b->keys.set[0].end) < 0)
+ goto err;
+
+ if (b->written < btree_blocks(b))
+ bch_bset_init_next(&b->keys, write_block(b),
+ bset_magic(&b->c->sb));
+out:
+ mempool_free(iter, b->c->fill_iter);
+ return;
+err:
+ set_btree_node_io_error(b);
+ bch_cache_set_error(b->c, "%s at bucket %zu, block %u, %u keys",
+ err, PTR_BUCKET_NR(b->c, &b->key, 0),
+ bset_block_offset(b, i), i->keys);
+ goto out;
+}
+
+static void btree_node_read_endio(struct bio *bio, int error)
+{
+ struct closure *cl = bio->bi_private;
+ closure_put(cl);
+}
+
+static void bch_btree_node_read(struct btree *b)
+{
+ uint64_t start_time = local_clock();
+ struct closure cl;
+ struct bio *bio;
+
+ trace_bcache_btree_read(b);
+
+ closure_init_stack(&cl);
+
+ bio = bch_bbio_alloc(b->c);
+ bio->bi_rw = REQ_META|READ_SYNC;
+ bio->bi_iter.bi_size = KEY_SIZE(&b->key) << 9;
+ bio->bi_end_io = btree_node_read_endio;
+ bio->bi_private = &cl;
+
+ bch_bio_map(bio, b->keys.set[0].data);
+
+ bch_submit_bbio(bio, b->c, &b->key, 0);
+ closure_sync(&cl);
+
+ if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
+ set_btree_node_io_error(b);
+
+ bch_bbio_free(bio, b->c);
+
+ if (btree_node_io_error(b))
+ goto err;
+
+ bch_btree_node_read_done(b);
+ bch_time_stats_update(&b->c->btree_read_time, start_time);
+
+ return;
+err:
+ bch_cache_set_error(b->c, "io error reading bucket %zu",
+ PTR_BUCKET_NR(b->c, &b->key, 0));
+}
+
+static void btree_complete_write(struct btree *b, struct btree_write *w)
+{
+ if (w->prio_blocked &&
+ !atomic_sub_return(w->prio_blocked, &b->c->prio_blocked))
+ wake_up_allocators(b->c);
+
+ if (w->journal) {
+ atomic_dec_bug(w->journal);
+ __closure_wake_up(&b->c->journal.wait);
+ }
+
+ w->prio_blocked = 0;
+ w->journal = NULL;
+}
+
+static void btree_node_write_unlock(struct closure *cl)
+{
+ struct btree *b = container_of(cl, struct btree, io);
+
+ up(&b->io_mutex);
+}
+
+static void __btree_node_write_done(struct closure *cl)
+{
+ struct btree *b = container_of(cl, struct btree, io);
+ struct btree_write *w = btree_prev_write(b);
+
+ bch_bbio_free(b->bio, b->c);
+ b->bio = NULL;
+ btree_complete_write(b, w);
+
+ if (btree_node_dirty(b))
+ schedule_delayed_work(&b->work, 30 * HZ);
+
+ closure_return_with_destructor(cl, btree_node_write_unlock);
+}
+
+static void btree_node_write_done(struct closure *cl)
+{
+ struct btree *b = container_of(cl, struct btree, io);
+ struct bio_vec *bv;
+ int n;
+
+ bio_for_each_segment_all(bv, b->bio, n)
+ __free_page(bv->bv_page);
+
+ __btree_node_write_done(cl);
+}
+
+static void btree_node_write_endio(struct bio *bio, int error)
+{
+ struct closure *cl = bio->bi_private;
+ struct btree *b = container_of(cl, struct btree, io);
+
+ if (error)
+ set_btree_node_io_error(b);
+
+ bch_bbio_count_io_errors(b->c, bio, error, "writing btree");
+ closure_put(cl);
+}
+
+static void do_btree_node_write(struct btree *b)
+{
+ struct closure *cl = &b->io;
+ struct bset *i = btree_bset_last(b);
+ BKEY_PADDED(key) k;
+
+ i->version = BCACHE_BSET_VERSION;
+ i->csum = btree_csum_set(b, i);
+
+ BUG_ON(b->bio);
+ b->bio = bch_bbio_alloc(b->c);
+
+ b->bio->bi_end_io = btree_node_write_endio;
+ b->bio->bi_private = cl;
+ b->bio->bi_rw = REQ_META|WRITE_SYNC|REQ_FUA;
+ b->bio->bi_iter.bi_size = roundup(set_bytes(i), block_bytes(b->c));
+ bch_bio_map(b->bio, i);
+
+ /*
+ * If we're appending to a leaf node, we don't technically need FUA -
+ * this write just needs to be persisted before the next journal write,
+ * which will be marked FLUSH|FUA.
+ *
+ * Similarly if we're writing a new btree root - the pointer is going to
+ * be in the next journal entry.
+ *
+ * But if we're writing a new btree node (that isn't a root) or
+ * appending to a non leaf btree node, we need either FUA or a flush
+ * when we write the parent with the new pointer. FUA is cheaper than a
+ * flush, and writes appending to leaf nodes aren't blocking anything so
+ * just make all btree node writes FUA to keep things sane.
+ */
+
+ bkey_copy(&k.key, &b->key);
+ SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) +
+ bset_sector_offset(&b->keys, i));
+
+ if (!bio_alloc_pages(b->bio, __GFP_NOWARN|GFP_NOWAIT)) {
+ int j;
+ struct bio_vec *bv;
+ void *base = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1));
+
+ bio_for_each_segment_all(bv, b->bio, j)
+ memcpy(page_address(bv->bv_page),
+ base + j * PAGE_SIZE, PAGE_SIZE);
+
+ bch_submit_bbio(b->bio, b->c, &k.key, 0);
+
+ continue_at(cl, btree_node_write_done, NULL);
+ } else {
+ b->bio->bi_vcnt = 0;
+ bch_bio_map(b->bio, i);
+
+ bch_submit_bbio(b->bio, b->c, &k.key, 0);
+
+ closure_sync(cl);
+ continue_at_nobarrier(cl, __btree_node_write_done, NULL);
+ }
+}
+
+void __bch_btree_node_write(struct btree *b, struct closure *parent)
+{
+ struct bset *i = btree_bset_last(b);
+
+ lockdep_assert_held(&b->write_lock);
+
+ trace_bcache_btree_write(b);
+
+ BUG_ON(current->bio_list);
+ BUG_ON(b->written >= btree_blocks(b));
+ BUG_ON(b->written && !i->keys);
+ BUG_ON(btree_bset_first(b)->seq != i->seq);
+ bch_check_keys(&b->keys, "writing");
+
+ cancel_delayed_work(&b->work);
+
+ /* If caller isn't waiting for write, parent refcount is cache set */
+ down(&b->io_mutex);
+ closure_init(&b->io, parent ?: &b->c->cl);
+
+ clear_bit(BTREE_NODE_dirty, &b->flags);
+ change_bit(BTREE_NODE_write_idx, &b->flags);
+
+ do_btree_node_write(b);
+
+ atomic_long_add(set_blocks(i, block_bytes(b->c)) * b->c->sb.block_size,
+ &PTR_CACHE(b->c, &b->key, 0)->btree_sectors_written);
+
+ b->written += set_blocks(i, block_bytes(b->c));
+}
+
+void bch_btree_node_write(struct btree *b, struct closure *parent)
+{
+ unsigned nsets = b->keys.nsets;
+
+ lockdep_assert_held(&b->lock);
+
+ __bch_btree_node_write(b, parent);
+
+ /*
+ * do verify if there was more than one set initially (i.e. we did a
+ * sort) and we sorted down to a single set:
+ */
+ if (nsets && !b->keys.nsets)
+ bch_btree_verify(b);
+
+ bch_btree_init_next(b);
+}
+
+static void bch_btree_node_write_sync(struct btree *b)
+{
+ struct closure cl;
+
+ closure_init_stack(&cl);
+
+ mutex_lock(&b->write_lock);
+ bch_btree_node_write(b, &cl);
+ mutex_unlock(&b->write_lock);
+
+ closure_sync(&cl);
+}
+
+static void btree_node_write_work(struct work_struct *w)
+{
+ struct btree *b = container_of(to_delayed_work(w), struct btree, work);
+
+ mutex_lock(&b->write_lock);
+ if (btree_node_dirty(b))
+ __bch_btree_node_write(b, NULL);
+ mutex_unlock(&b->write_lock);
+}
+
+static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref)
+{
+ struct bset *i = btree_bset_last(b);
+ struct btree_write *w = btree_current_write(b);
+
+ lockdep_assert_held(&b->write_lock);
+
+ BUG_ON(!b->written);
+ BUG_ON(!i->keys);
+
+ if (!btree_node_dirty(b))
+ schedule_delayed_work(&b->work, 30 * HZ);
+
+ set_btree_node_dirty(b);
+
+ if (journal_ref) {
+ if (w->journal &&
+ journal_pin_cmp(b->c, w->journal, journal_ref)) {
+ atomic_dec_bug(w->journal);
+ w->journal = NULL;
+ }
+
+ if (!w->journal) {
+ w->journal = journal_ref;
+ atomic_inc(w->journal);
+ }
+ }
+
+ /* Force write if set is too big */
+ if (set_bytes(i) > PAGE_SIZE - 48 &&
+ !current->bio_list)
+ bch_btree_node_write(b, NULL);
+}
+
+/*
+ * Btree in memory cache - allocation/freeing
+ * mca -> memory cache
+ */
+
+#define mca_reserve(c) (((c->root && c->root->level) \
+ ? c->root->level : 1) * 8 + 16)
+#define mca_can_free(c) \
+ max_t(int, 0, c->btree_cache_used - mca_reserve(c))
+
+static void mca_data_free(struct btree *b)
+{
+ BUG_ON(b->io_mutex.count != 1);
+
+ bch_btree_keys_free(&b->keys);
+
+ b->c->btree_cache_used--;
+ list_move(&b->list, &b->c->btree_cache_freed);
+}
+
+static void mca_bucket_free(struct btree *b)
+{
+ BUG_ON(btree_node_dirty(b));
+
+ b->key.ptr[0] = 0;
+ hlist_del_init_rcu(&b->hash);
+ list_move(&b->list, &b->c->btree_cache_freeable);
+}
+
+static unsigned btree_order(struct bkey *k)
+{
+ return ilog2(KEY_SIZE(k) / PAGE_SECTORS ?: 1);
+}
+
+static void mca_data_alloc(struct btree *b, struct bkey *k, gfp_t gfp)
+{
+ if (!bch_btree_keys_alloc(&b->keys,
+ max_t(unsigned,
+ ilog2(b->c->btree_pages),
+ btree_order(k)),
+ gfp)) {
+ b->c->btree_cache_used++;
+ list_move(&b->list, &b->c->btree_cache);
+ } else {
+ list_move(&b->list, &b->c->btree_cache_freed);
+ }
+}
+
+static struct btree *mca_bucket_alloc(struct cache_set *c,
+ struct bkey *k, gfp_t gfp)
+{
+ struct btree *b = kzalloc(sizeof(struct btree), gfp);
+ if (!b)
+ return NULL;
+
+ init_rwsem(&b->lock);
+ lockdep_set_novalidate_class(&b->lock);
+ mutex_init(&b->write_lock);
+ lockdep_set_novalidate_class(&b->write_lock);
+ INIT_LIST_HEAD(&b->list);
+ INIT_DELAYED_WORK(&b->work, btree_node_write_work);
+ b->c = c;
+ sema_init(&b->io_mutex, 1);
+
+ mca_data_alloc(b, k, gfp);
+ return b;
+}
+
+static int mca_reap(struct btree *b, unsigned min_order, bool flush)
+{
+ struct closure cl;
+
+ closure_init_stack(&cl);
+ lockdep_assert_held(&b->c->bucket_lock);
+
+ if (!down_write_trylock(&b->lock))
+ return -ENOMEM;
+
+ BUG_ON(btree_node_dirty(b) && !b->keys.set[0].data);
+
+ if (b->keys.page_order < min_order)
+ goto out_unlock;
+
+ if (!flush) {
+ if (btree_node_dirty(b))
+ goto out_unlock;
+
+ if (down_trylock(&b->io_mutex))
+ goto out_unlock;
+ up(&b->io_mutex);
+ }
+
+ mutex_lock(&b->write_lock);
+ if (btree_node_dirty(b))
+ __bch_btree_node_write(b, &cl);
+ mutex_unlock(&b->write_lock);
+
+ closure_sync(&cl);
+
+ /* wait for any in flight btree write */
+ down(&b->io_mutex);
+ up(&b->io_mutex);
+
+ return 0;
+out_unlock:
+ rw_unlock(true, b);
+ return -ENOMEM;
+}
+
+static unsigned long bch_mca_scan(struct shrinker *shrink,
+ struct shrink_control *sc)
+{
+ struct cache_set *c = container_of(shrink, struct cache_set, shrink);
+ struct btree *b, *t;
+ unsigned long i, nr = sc->nr_to_scan;
+ unsigned long freed = 0;
+
+ if (c->shrinker_disabled)
+ return SHRINK_STOP;
+
+ if (c->btree_cache_alloc_lock)
+ return SHRINK_STOP;
+
+ /* Return -1 if we can't do anything right now */
+ if (sc->gfp_mask & __GFP_IO)
+ mutex_lock(&c->bucket_lock);
+ else if (!mutex_trylock(&c->bucket_lock))
+ return -1;
+
+ /*
+ * It's _really_ critical that we don't free too many btree nodes - we
+ * have to always leave ourselves a reserve. The reserve is how we
+ * guarantee that allocating memory for a new btree node can always
+ * succeed, so that inserting keys into the btree can always succeed and
+ * IO can always make forward progress:
+ */
+ nr /= c->btree_pages;
+ nr = min_t(unsigned long, nr, mca_can_free(c));
+
+ i = 0;
+ list_for_each_entry_safe(b, t, &c->btree_cache_freeable, list) {
+ if (freed >= nr)
+ break;
+
+ if (++i > 3 &&
+ !mca_reap(b, 0, false)) {
+ mca_data_free(b);
+ rw_unlock(true, b);
+ freed++;
+ }
+ }
+
+ for (i = 0; (nr--) && i < c->btree_cache_used; i++) {
+ if (list_empty(&c->btree_cache))
+ goto out;
+
+ b = list_first_entry(&c->btree_cache, struct btree, list);
+ list_rotate_left(&c->btree_cache);
+
+ if (!b->accessed &&
+ !mca_reap(b, 0, false)) {
+ mca_bucket_free(b);
+ mca_data_free(b);
+ rw_unlock(true, b);
+ freed++;
+ } else
+ b->accessed = 0;
+ }
+out:
+ mutex_unlock(&c->bucket_lock);
+ return freed;
+}
+
+static unsigned long bch_mca_count(struct shrinker *shrink,
+ struct shrink_control *sc)
+{
+ struct cache_set *c = container_of(shrink, struct cache_set, shrink);
+
+ if (c->shrinker_disabled)
+ return 0;
+
+ if (c->btree_cache_alloc_lock)
+ return 0;
+
+ return mca_can_free(c) * c->btree_pages;
+}
+
+void bch_btree_cache_free(struct cache_set *c)
+{
+ struct btree *b;
+ struct closure cl;
+ closure_init_stack(&cl);
+
+ if (c->shrink.list.next)
+ unregister_shrinker(&c->shrink);
+
+ mutex_lock(&c->bucket_lock);
+
+#ifdef CONFIG_BCACHE_DEBUG
+ if (c->verify_data)
+ list_move(&c->verify_data->list, &c->btree_cache);
+
+ free_pages((unsigned long) c->verify_ondisk, ilog2(bucket_pages(c)));
+#endif
+
+ list_splice(&c->btree_cache_freeable,
+ &c->btree_cache);
+
+ while (!list_empty(&c->btree_cache)) {
+ b = list_first_entry(&c->btree_cache, struct btree, list);
+
+ if (btree_node_dirty(b))
+ btree_complete_write(b, btree_current_write(b));
+ clear_bit(BTREE_NODE_dirty, &b->flags);
+
+ mca_data_free(b);
+ }
+
+ while (!list_empty(&c->btree_cache_freed)) {
+ b = list_first_entry(&c->btree_cache_freed,
+ struct btree, list);
+ list_del(&b->list);
+ cancel_delayed_work_sync(&b->work);
+ kfree(b);
+ }
+
+ mutex_unlock(&c->bucket_lock);
+}
+
+int bch_btree_cache_alloc(struct cache_set *c)
+{
+ unsigned i;
+
+ for (i = 0; i < mca_reserve(c); i++)
+ if (!mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL))
+ return -ENOMEM;
+
+ list_splice_init(&c->btree_cache,
+ &c->btree_cache_freeable);
+
+#ifdef CONFIG_BCACHE_DEBUG
+ mutex_init(&c->verify_lock);
+
+ c->verify_ondisk = (void *)
+ __get_free_pages(GFP_KERNEL, ilog2(bucket_pages(c)));
+
+ c->verify_data = mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL);
+
+ if (c->verify_data &&
+ c->verify_data->keys.set->data)
+ list_del_init(&c->verify_data->list);
+ else
+ c->verify_data = NULL;
+#endif
+
+ c->shrink.count_objects = bch_mca_count;
+ c->shrink.scan_objects = bch_mca_scan;
+ c->shrink.seeks = 4;
+ c->shrink.batch = c->btree_pages * 2;
+ register_shrinker(&c->shrink);
+
+ return 0;
+}
+
+/* Btree in memory cache - hash table */
+
+static struct hlist_head *mca_hash(struct cache_set *c, struct bkey *k)
+{
+ return &c->bucket_hash[hash_32(PTR_HASH(c, k), BUCKET_HASH_BITS)];
+}
+
+static struct btree *mca_find(struct cache_set *c, struct bkey *k)
+{
+ struct btree *b;
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(b, mca_hash(c, k), hash)
+ if (PTR_HASH(c, &b->key) == PTR_HASH(c, k))
+ goto out;
+ b = NULL;
+out:
+ rcu_read_unlock();
+ return b;
+}
+
+static int mca_cannibalize_lock(struct cache_set *c, struct btree_op *op)
+{
+ struct task_struct *old;
+
+ old = cmpxchg(&c->btree_cache_alloc_lock, NULL, current);
+ if (old && old != current) {
+ if (op)
+ prepare_to_wait(&c->btree_cache_wait, &op->wait,
+ TASK_UNINTERRUPTIBLE);
+ return -EINTR;
+ }
+
+ return 0;
+}
+
+static struct btree *mca_cannibalize(struct cache_set *c, struct btree_op *op,
+ struct bkey *k)
+{
+ struct btree *b;
+
+ trace_bcache_btree_cache_cannibalize(c);
+
+ if (mca_cannibalize_lock(c, op))
+ return ERR_PTR(-EINTR);
+
+ list_for_each_entry_reverse(b, &c->btree_cache, list)
+ if (!mca_reap(b, btree_order(k), false))
+ return b;
+
+ list_for_each_entry_reverse(b, &c->btree_cache, list)
+ if (!mca_reap(b, btree_order(k), true))
+ return b;
+
+ WARN(1, "btree cache cannibalize failed\n");
+ return ERR_PTR(-ENOMEM);
+}
+
+/*
+ * We can only have one thread cannibalizing other cached btree nodes at a time,
+ * or we'll deadlock. We use an open coded mutex to ensure that, which a
+ * cannibalize_bucket() will take. This means every time we unlock the root of
+ * the btree, we need to release this lock if we have it held.
+ */
+static void bch_cannibalize_unlock(struct cache_set *c)
+{
+ if (c->btree_cache_alloc_lock == current) {
+ c->btree_cache_alloc_lock = NULL;
+ wake_up(&c->btree_cache_wait);
+ }
+}
+
+static struct btree *mca_alloc(struct cache_set *c, struct btree_op *op,
+ struct bkey *k, int level)
+{
+ struct btree *b;
+
+ BUG_ON(current->bio_list);
+
+ lockdep_assert_held(&c->bucket_lock);
+
+ if (mca_find(c, k))
+ return NULL;
+
+ /* btree_free() doesn't free memory; it sticks the node on the end of
+ * the list. Check if there's any freed nodes there:
+ */
+ list_for_each_entry(b, &c->btree_cache_freeable, list)
+ if (!mca_reap(b, btree_order(k), false))
+ goto out;
+
+ /* We never free struct btree itself, just the memory that holds the on
+ * disk node. Check the freed list before allocating a new one:
+ */
+ list_for_each_entry(b, &c->btree_cache_freed, list)
+ if (!mca_reap(b, 0, false)) {
+ mca_data_alloc(b, k, __GFP_NOWARN|GFP_NOIO);
+ if (!b->keys.set[0].data)
+ goto err;
+ else
+ goto out;
+ }
+
+ b = mca_bucket_alloc(c, k, __GFP_NOWARN|GFP_NOIO);
+ if (!b)
+ goto err;
+
+ BUG_ON(!down_write_trylock(&b->lock));
+ if (!b->keys.set->data)
+ goto err;
+out:
+ BUG_ON(b->io_mutex.count != 1);
+
+ bkey_copy(&b->key, k);
+ list_move(&b->list, &c->btree_cache);
+ hlist_del_init_rcu(&b->hash);
+ hlist_add_head_rcu(&b->hash, mca_hash(c, k));
+
+ lock_set_subclass(&b->lock.dep_map, level + 1, _THIS_IP_);
+ b->parent = (void *) ~0UL;
+ b->flags = 0;
+ b->written = 0;
+ b->level = level;
+
+ if (!b->level)
+ bch_btree_keys_init(&b->keys, &bch_extent_keys_ops,
+ &b->c->expensive_debug_checks);
+ else
+ bch_btree_keys_init(&b->keys, &bch_btree_keys_ops,
+ &b->c->expensive_debug_checks);
+
+ return b;
+err:
+ if (b)
+ rw_unlock(true, b);
+
+ b = mca_cannibalize(c, op, k);
+ if (!IS_ERR(b))
+ goto out;
+
+ return b;
+}
+
+/**
+ * bch_btree_node_get - find a btree node in the cache and lock it, reading it
+ * in from disk if necessary.
+ *
+ * If IO is necessary and running under generic_make_request, returns -EAGAIN.
+ *
+ * The btree node will have either a read or a write lock held, depending on
+ * level and op->lock.
+ */
+struct btree *bch_btree_node_get(struct cache_set *c, struct btree_op *op,
+ struct bkey *k, int level, bool write,
+ struct btree *parent)
+{
+ int i = 0;
+ struct btree *b;
+
+ BUG_ON(level < 0);
+retry:
+ b = mca_find(c, k);
+
+ if (!b) {
+ if (current->bio_list)
+ return ERR_PTR(-EAGAIN);
+
+ mutex_lock(&c->bucket_lock);
+ b = mca_alloc(c, op, k, level);
+ mutex_unlock(&c->bucket_lock);
+
+ if (!b)
+ goto retry;
+ if (IS_ERR(b))
+ return b;
+
+ bch_btree_node_read(b);
+
+ if (!write)
+ downgrade_write(&b->lock);
+ } else {
+ rw_lock(write, b, level);
+ if (PTR_HASH(c, &b->key) != PTR_HASH(c, k)) {
+ rw_unlock(write, b);
+ goto retry;
+ }
+ BUG_ON(b->level != level);
+ }
+
+ b->parent = parent;
+ b->accessed = 1;
+
+ for (; i <= b->keys.nsets && b->keys.set[i].size; i++) {
+ prefetch(b->keys.set[i].tree);
+ prefetch(b->keys.set[i].data);
+ }
+
+ for (; i <= b->keys.nsets; i++)
+ prefetch(b->keys.set[i].data);
+
+ if (btree_node_io_error(b)) {
+ rw_unlock(write, b);
+ return ERR_PTR(-EIO);
+ }
+
+ BUG_ON(!b->written);
+
+ return b;
+}
+
+static void btree_node_prefetch(struct btree *parent, struct bkey *k)
+{
+ struct btree *b;
+
+ mutex_lock(&parent->c->bucket_lock);
+ b = mca_alloc(parent->c, NULL, k, parent->level - 1);
+ mutex_unlock(&parent->c->bucket_lock);
+
+ if (!IS_ERR_OR_NULL(b)) {
+ b->parent = parent;
+ bch_btree_node_read(b);
+ rw_unlock(true, b);
+ }
+}
+
+/* Btree alloc */
+
+static void btree_node_free(struct btree *b)
+{
+ trace_bcache_btree_node_free(b);
+
+ BUG_ON(b == b->c->root);
+
+ mutex_lock(&b->write_lock);
+
+ if (btree_node_dirty(b))
+ btree_complete_write(b, btree_current_write(b));
+ clear_bit(BTREE_NODE_dirty, &b->flags);
+
+ mutex_unlock(&b->write_lock);
+
+ cancel_delayed_work(&b->work);
+
+ mutex_lock(&b->c->bucket_lock);
+ bch_bucket_free(b->c, &b->key);
+ mca_bucket_free(b);
+ mutex_unlock(&b->c->bucket_lock);
+}
+
+struct btree *__bch_btree_node_alloc(struct cache_set *c, struct btree_op *op,
+ int level, bool wait,
+ struct btree *parent)
+{
+ BKEY_PADDED(key) k;
+ struct btree *b = ERR_PTR(-EAGAIN);
+
+ mutex_lock(&c->bucket_lock);
+retry:
+ if (__bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, wait))
+ goto err;
+
+ bkey_put(c, &k.key);
+ SET_KEY_SIZE(&k.key, c->btree_pages * PAGE_SECTORS);
+
+ b = mca_alloc(c, op, &k.key, level);
+ if (IS_ERR(b))
+ goto err_free;
+
+ if (!b) {
+ cache_bug(c,
+ "Tried to allocate bucket that was in btree cache");
+ goto retry;
+ }
+
+ b->accessed = 1;
+ b->parent = parent;
+ bch_bset_init_next(&b->keys, b->keys.set->data, bset_magic(&b->c->sb));
+
+ mutex_unlock(&c->bucket_lock);
+
+ trace_bcache_btree_node_alloc(b);
+ return b;
+err_free:
+ bch_bucket_free(c, &k.key);
+err:
+ mutex_unlock(&c->bucket_lock);
+
+ trace_bcache_btree_node_alloc_fail(c);
+ return b;
+}
+
+static struct btree *bch_btree_node_alloc(struct cache_set *c,
+ struct btree_op *op, int level,
+ struct btree *parent)
+{
+ return __bch_btree_node_alloc(c, op, level, op != NULL, parent);
+}
+
+static struct btree *btree_node_alloc_replacement(struct btree *b,
+ struct btree_op *op)
+{
+ struct btree *n = bch_btree_node_alloc(b->c, op, b->level, b->parent);
+ if (!IS_ERR_OR_NULL(n)) {
+ mutex_lock(&n->write_lock);
+ bch_btree_sort_into(&b->keys, &n->keys, &b->c->sort);
+ bkey_copy_key(&n->key, &b->key);
+ mutex_unlock(&n->write_lock);
+ }
+
+ return n;
+}
+
+static void make_btree_freeing_key(struct btree *b, struct bkey *k)
+{
+ unsigned i;
+
+ mutex_lock(&b->c->bucket_lock);
+
+ atomic_inc(&b->c->prio_blocked);
+
+ bkey_copy(k, &b->key);
+ bkey_copy_key(k, &ZERO_KEY);
+
+ for (i = 0; i < KEY_PTRS(k); i++)
+ SET_PTR_GEN(k, i,
+ bch_inc_gen(PTR_CACHE(b->c, &b->key, i),
+ PTR_BUCKET(b->c, &b->key, i)));
+
+ mutex_unlock(&b->c->bucket_lock);
+}
+
+static int btree_check_reserve(struct btree *b, struct btree_op *op)
+{
+ struct cache_set *c = b->c;
+ struct cache *ca;
+ unsigned i, reserve = (c->root->level - b->level) * 2 + 1;
+
+ mutex_lock(&c->bucket_lock);
+
+ for_each_cache(ca, c, i)
+ if (fifo_used(&ca->free[RESERVE_BTREE]) < reserve) {
+ if (op)
+ prepare_to_wait(&c->btree_cache_wait, &op->wait,
+ TASK_UNINTERRUPTIBLE);
+ mutex_unlock(&c->bucket_lock);
+ return -EINTR;
+ }
+
+ mutex_unlock(&c->bucket_lock);
+
+ return mca_cannibalize_lock(b->c, op);
+}
+
+/* Garbage collection */
+
+static uint8_t __bch_btree_mark_key(struct cache_set *c, int level,
+ struct bkey *k)
+{
+ uint8_t stale = 0;
+ unsigned i;
+ struct bucket *g;
+
+ /*
+ * ptr_invalid() can't return true for the keys that mark btree nodes as
+ * freed, but since ptr_bad() returns true we'll never actually use them
+ * for anything and thus we don't want mark their pointers here
+ */
+ if (!bkey_cmp(k, &ZERO_KEY))
+ return stale;
+
+ for (i = 0; i < KEY_PTRS(k); i++) {
+ if (!ptr_available(c, k, i))
+ continue;
+
+ g = PTR_BUCKET(c, k, i);
+
+ if (gen_after(g->last_gc, PTR_GEN(k, i)))
+ g->last_gc = PTR_GEN(k, i);
+
+ if (ptr_stale(c, k, i)) {
+ stale = max(stale, ptr_stale(c, k, i));
+ continue;
+ }
+
+ cache_bug_on(GC_MARK(g) &&
+ (GC_MARK(g) == GC_MARK_METADATA) != (level != 0),
+ c, "inconsistent ptrs: mark = %llu, level = %i",
+ GC_MARK(g), level);
+
+ if (level)
+ SET_GC_MARK(g, GC_MARK_METADATA);
+ else if (KEY_DIRTY(k))
+ SET_GC_MARK(g, GC_MARK_DIRTY);
+ else if (!GC_MARK(g))
+ SET_GC_MARK(g, GC_MARK_RECLAIMABLE);
+
+ /* guard against overflow */
+ SET_GC_SECTORS_USED(g, min_t(unsigned,
+ GC_SECTORS_USED(g) + KEY_SIZE(k),
+ MAX_GC_SECTORS_USED));
+
+ BUG_ON(!GC_SECTORS_USED(g));
+ }
+
+ return stale;
+}
+
+#define btree_mark_key(b, k) __bch_btree_mark_key(b->c, b->level, k)
+
+void bch_initial_mark_key(struct cache_set *c, int level, struct bkey *k)
+{
+ unsigned i;
+
+ for (i = 0; i < KEY_PTRS(k); i++)
+ if (ptr_available(c, k, i) &&
+ !ptr_stale(c, k, i)) {
+ struct bucket *b = PTR_BUCKET(c, k, i);
+
+ b->gen = PTR_GEN(k, i);
+
+ if (level && bkey_cmp(k, &ZERO_KEY))
+ b->prio = BTREE_PRIO;
+ else if (!level && b->prio == BTREE_PRIO)
+ b->prio = INITIAL_PRIO;
+ }
+
+ __bch_btree_mark_key(c, level, k);
+}
+
+static bool btree_gc_mark_node(struct btree *b, struct gc_stat *gc)
+{
+ uint8_t stale = 0;
+ unsigned keys = 0, good_keys = 0;
+ struct bkey *k;
+ struct btree_iter iter;
+ struct bset_tree *t;
+
+ gc->nodes++;
+
+ for_each_key_filter(&b->keys, k, &iter, bch_ptr_invalid) {
+ stale = max(stale, btree_mark_key(b, k));
+ keys++;
+
+ if (bch_ptr_bad(&b->keys, k))
+ continue;
+
+ gc->key_bytes += bkey_u64s(k);
+ gc->nkeys++;
+ good_keys++;
+
+ gc->data += KEY_SIZE(k);
+ }
+
+ for (t = b->keys.set; t <= &b->keys.set[b->keys.nsets]; t++)
+ btree_bug_on(t->size &&
+ bset_written(&b->keys, t) &&
+ bkey_cmp(&b->key, &t->end) < 0,
+ b, "found short btree key in gc");
+
+ if (b->c->gc_always_rewrite)
+ return true;
+
+ if (stale > 10)
+ return true;
+
+ if ((keys - good_keys) * 2 > keys)
+ return true;
+
+ return false;
+}
+
+#define GC_MERGE_NODES 4U
+
+struct gc_merge_info {
+ struct btree *b;
+ unsigned keys;
+};
+
+static int bch_btree_insert_node(struct btree *, struct btree_op *,
+ struct keylist *, atomic_t *, struct bkey *);
+
+static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
+ struct gc_stat *gc, struct gc_merge_info *r)
+{
+ unsigned i, nodes = 0, keys = 0, blocks;
+ struct btree *new_nodes[GC_MERGE_NODES];
+ struct keylist keylist;
+ struct closure cl;
+ struct bkey *k;
+
+ bch_keylist_init(&keylist);
+
+ if (btree_check_reserve(b, NULL))
+ return 0;
+
+ memset(new_nodes, 0, sizeof(new_nodes));
+ closure_init_stack(&cl);
+
+ while (nodes < GC_MERGE_NODES && !IS_ERR_OR_NULL(r[nodes].b))
+ keys += r[nodes++].keys;
+
+ blocks = btree_default_blocks(b->c) * 2 / 3;
+
+ if (nodes < 2 ||
+ __set_blocks(b->keys.set[0].data, keys,
+ block_bytes(b->c)) > blocks * (nodes - 1))
+ return 0;
+
+ for (i = 0; i < nodes; i++) {
+ new_nodes[i] = btree_node_alloc_replacement(r[i].b, NULL);
+ if (IS_ERR_OR_NULL(new_nodes[i]))
+ goto out_nocoalesce;
+ }
+
+ /*
+ * We have to check the reserve here, after we've allocated our new
+ * nodes, to make sure the insert below will succeed - we also check
+ * before as an optimization to potentially avoid a bunch of expensive
+ * allocs/sorts
+ */
+ if (btree_check_reserve(b, NULL))
+ goto out_nocoalesce;
+
+ for (i = 0; i < nodes; i++)
+ mutex_lock(&new_nodes[i]->write_lock);
+
+ for (i = nodes - 1; i > 0; --i) {
+ struct bset *n1 = btree_bset_first(new_nodes[i]);
+ struct bset *n2 = btree_bset_first(new_nodes[i - 1]);
+ struct bkey *k, *last = NULL;
+
+ keys = 0;
+
+ if (i > 1) {
+ for (k = n2->start;
+ k < bset_bkey_last(n2);
+ k = bkey_next(k)) {
+ if (__set_blocks(n1, n1->keys + keys +
+ bkey_u64s(k),
+ block_bytes(b->c)) > blocks)
+ break;
+
+ last = k;
+ keys += bkey_u64s(k);
+ }
+ } else {
+ /*
+ * Last node we're not getting rid of - we're getting
+ * rid of the node at r[0]. Have to try and fit all of
+ * the remaining keys into this node; we can't ensure
+ * they will always fit due to rounding and variable
+ * length keys (shouldn't be possible in practice,
+ * though)
+ */
+ if (__set_blocks(n1, n1->keys + n2->keys,
+ block_bytes(b->c)) >
+ btree_blocks(new_nodes[i]))
+ goto out_nocoalesce;
+
+ keys = n2->keys;
+ /* Take the key of the node we're getting rid of */
+ last = &r->b->key;
+ }
+
+ BUG_ON(__set_blocks(n1, n1->keys + keys, block_bytes(b->c)) >
+ btree_blocks(new_nodes[i]));
+
+ if (last)
+ bkey_copy_key(&new_nodes[i]->key, last);
+
+ memcpy(bset_bkey_last(n1),
+ n2->start,
+ (void *) bset_bkey_idx(n2, keys) - (void *) n2->start);
+
+ n1->keys += keys;
+ r[i].keys = n1->keys;
+
+ memmove(n2->start,
+ bset_bkey_idx(n2, keys),
+ (void *) bset_bkey_last(n2) -
+ (void *) bset_bkey_idx(n2, keys));
+
+ n2->keys -= keys;
+
+ if (__bch_keylist_realloc(&keylist,
+ bkey_u64s(&new_nodes[i]->key)))
+ goto out_nocoalesce;
+
+ bch_btree_node_write(new_nodes[i], &cl);
+ bch_keylist_add(&keylist, &new_nodes[i]->key);
+ }
+
+ for (i = 0; i < nodes; i++)
+ mutex_unlock(&new_nodes[i]->write_lock);
+
+ closure_sync(&cl);
+
+ /* We emptied out this node */
+ BUG_ON(btree_bset_first(new_nodes[0])->keys);
+ btree_node_free(new_nodes[0]);
+ rw_unlock(true, new_nodes[0]);
+ new_nodes[0] = NULL;
+
+ for (i = 0; i < nodes; i++) {
+ if (__bch_keylist_realloc(&keylist, bkey_u64s(&r[i].b->key)))
+ goto out_nocoalesce;
+
+ make_btree_freeing_key(r[i].b, keylist.top);
+ bch_keylist_push(&keylist);
+ }
+
+ bch_btree_insert_node(b, op, &keylist, NULL, NULL);
+ BUG_ON(!bch_keylist_empty(&keylist));
+
+ for (i = 0; i < nodes; i++) {
+ btree_node_free(r[i].b);
+ rw_unlock(true, r[i].b);
+
+ r[i].b = new_nodes[i];
+ }
+
+ memmove(r, r + 1, sizeof(r[0]) * (nodes - 1));
+ r[nodes - 1].b = ERR_PTR(-EINTR);
+
+ trace_bcache_btree_gc_coalesce(nodes);
+ gc->nodes--;
+
+ bch_keylist_free(&keylist);
+
+ /* Invalidated our iterator */
+ return -EINTR;
+
+out_nocoalesce:
+ closure_sync(&cl);
+ bch_keylist_free(&keylist);
+
+ while ((k = bch_keylist_pop(&keylist)))
+ if (!bkey_cmp(k, &ZERO_KEY))
+ atomic_dec(&b->c->prio_blocked);
+
+ for (i = 0; i < nodes; i++)
+ if (!IS_ERR_OR_NULL(new_nodes[i])) {
+ btree_node_free(new_nodes[i]);
+ rw_unlock(true, new_nodes[i]);
+ }
+ return 0;
+}
+
+static int btree_gc_rewrite_node(struct btree *b, struct btree_op *op,
+ struct btree *replace)
+{
+ struct keylist keys;
+ struct btree *n;
+
+ if (btree_check_reserve(b, NULL))
+ return 0;
+
+ n = btree_node_alloc_replacement(replace, NULL);
+
+ /* recheck reserve after allocating replacement node */
+ if (btree_check_reserve(b, NULL)) {
+ btree_node_free(n);
+ rw_unlock(true, n);
+ return 0;
+ }
+
+ bch_btree_node_write_sync(n);
+
+ bch_keylist_init(&keys);
+ bch_keylist_add(&keys, &n->key);
+
+ make_btree_freeing_key(replace, keys.top);
+ bch_keylist_push(&keys);
+
+ bch_btree_insert_node(b, op, &keys, NULL, NULL);
+ BUG_ON(!bch_keylist_empty(&keys));
+
+ btree_node_free(replace);
+ rw_unlock(true, n);
+
+ /* Invalidated our iterator */
+ return -EINTR;
+}
+
+static unsigned btree_gc_count_keys(struct btree *b)
+{
+ struct bkey *k;
+ struct btree_iter iter;
+ unsigned ret = 0;
+
+ for_each_key_filter(&b->keys, k, &iter, bch_ptr_bad)
+ ret += bkey_u64s(k);
+
+ return ret;
+}
+
+static int btree_gc_recurse(struct btree *b, struct btree_op *op,
+ struct closure *writes, struct gc_stat *gc)
+{
+ int ret = 0;
+ bool should_rewrite;
+ struct bkey *k;
+ struct btree_iter iter;
+ struct gc_merge_info r[GC_MERGE_NODES];
+ struct gc_merge_info *i, *last = r + ARRAY_SIZE(r) - 1;
+
+ bch_btree_iter_init(&b->keys, &iter, &b->c->gc_done);
+
+ for (i = r; i < r + ARRAY_SIZE(r); i++)
+ i->b = ERR_PTR(-EINTR);
+
+ while (1) {
+ k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad);
+ if (k) {
+ r->b = bch_btree_node_get(b->c, op, k, b->level - 1,
+ true, b);
+ if (IS_ERR(r->b)) {
+ ret = PTR_ERR(r->b);
+ break;
+ }
+
+ r->keys = btree_gc_count_keys(r->b);
+
+ ret = btree_gc_coalesce(b, op, gc, r);
+ if (ret)
+ break;
+ }
+
+ if (!last->b)
+ break;
+
+ if (!IS_ERR(last->b)) {
+ should_rewrite = btree_gc_mark_node(last->b, gc);
+ if (should_rewrite) {
+ ret = btree_gc_rewrite_node(b, op, last->b);
+ if (ret)
+ break;
+ }
+
+ if (last->b->level) {
+ ret = btree_gc_recurse(last->b, op, writes, gc);
+ if (ret)
+ break;
+ }
+
+ bkey_copy_key(&b->c->gc_done, &last->b->key);
+
+ /*
+ * Must flush leaf nodes before gc ends, since replace
+ * operations aren't journalled
+ */
+ mutex_lock(&last->b->write_lock);
+ if (btree_node_dirty(last->b))
+ bch_btree_node_write(last->b, writes);
+ mutex_unlock(&last->b->write_lock);
+ rw_unlock(true, last->b);
+ }
+
+ memmove(r + 1, r, sizeof(r[0]) * (GC_MERGE_NODES - 1));
+ r->b = NULL;
+
+ if (need_resched()) {
+ ret = -EAGAIN;
+ break;
+ }
+ }
+
+ for (i = r; i < r + ARRAY_SIZE(r); i++)
+ if (!IS_ERR_OR_NULL(i->b)) {
+ mutex_lock(&i->b->write_lock);
+ if (btree_node_dirty(i->b))
+ bch_btree_node_write(i->b, writes);
+ mutex_unlock(&i->b->write_lock);
+ rw_unlock(true, i->b);
+ }
+
+ return ret;
+}
+
+static int bch_btree_gc_root(struct btree *b, struct btree_op *op,
+ struct closure *writes, struct gc_stat *gc)
+{
+ struct btree *n = NULL;
+ int ret = 0;
+ bool should_rewrite;
+
+ should_rewrite = btree_gc_mark_node(b, gc);
+ if (should_rewrite) {
+ n = btree_node_alloc_replacement(b, NULL);
+
+ if (!IS_ERR_OR_NULL(n)) {
+ bch_btree_node_write_sync(n);
+
+ bch_btree_set_root(n);
+ btree_node_free(b);
+ rw_unlock(true, n);
+
+ return -EINTR;
+ }
+ }
+
+ __bch_btree_mark_key(b->c, b->level + 1, &b->key);
+
+ if (b->level) {
+ ret = btree_gc_recurse(b, op, writes, gc);
+ if (ret)
+ return ret;
+ }
+
+ bkey_copy_key(&b->c->gc_done, &b->key);
+
+ return ret;
+}
+
+static void btree_gc_start(struct cache_set *c)
+{
+ struct cache *ca;
+ struct bucket *b;
+ unsigned i;
+
+ if (!c->gc_mark_valid)
+ return;
+
+ mutex_lock(&c->bucket_lock);
+
+ c->gc_mark_valid = 0;
+ c->gc_done = ZERO_KEY;
+
+ for_each_cache(ca, c, i)
+ for_each_bucket(b, ca) {
+ b->last_gc = b->gen;
+ if (!atomic_read(&b->pin)) {
+ SET_GC_MARK(b, 0);
+ SET_GC_SECTORS_USED(b, 0);
+ }
+ }
+
+ mutex_unlock(&c->bucket_lock);
+}
+
+static size_t bch_btree_gc_finish(struct cache_set *c)
+{
+ size_t available = 0;
+ struct bucket *b;
+ struct cache *ca;
+ unsigned i;
+
+ mutex_lock(&c->bucket_lock);
+
+ set_gc_sectors(c);
+ c->gc_mark_valid = 1;
+ c->need_gc = 0;
+
+ for (i = 0; i < KEY_PTRS(&c->uuid_bucket); i++)
+ SET_GC_MARK(PTR_BUCKET(c, &c->uuid_bucket, i),
+ GC_MARK_METADATA);
+
+ /* don't reclaim buckets to which writeback keys point */
+ rcu_read_lock();
+ for (i = 0; i < c->nr_uuids; i++) {
+ struct bcache_device *d = c->devices[i];
+ struct cached_dev *dc;
+ struct keybuf_key *w, *n;
+ unsigned j;
+
+ if (!d || UUID_FLASH_ONLY(&c->uuids[i]))
+ continue;
+ dc = container_of(d, struct cached_dev, disk);
+
+ spin_lock(&dc->writeback_keys.lock);
+ rbtree_postorder_for_each_entry_safe(w, n,
+ &dc->writeback_keys.keys, node)
+ for (j = 0; j < KEY_PTRS(&w->key); j++)
+ SET_GC_MARK(PTR_BUCKET(c, &w->key, j),
+ GC_MARK_DIRTY);
+ spin_unlock(&dc->writeback_keys.lock);
+ }
+ rcu_read_unlock();
+
+ for_each_cache(ca, c, i) {
+ uint64_t *i;
+
+ ca->invalidate_needs_gc = 0;
+
+ for (i = ca->sb.d; i < ca->sb.d + ca->sb.keys; i++)
+ SET_GC_MARK(ca->buckets + *i, GC_MARK_METADATA);
+
+ for (i = ca->prio_buckets;
+ i < ca->prio_buckets + prio_buckets(ca) * 2; i++)
+ SET_GC_MARK(ca->buckets + *i, GC_MARK_METADATA);
+
+ for_each_bucket(b, ca) {
+ c->need_gc = max(c->need_gc, bucket_gc_gen(b));
+
+ if (atomic_read(&b->pin))
+ continue;
+
+ BUG_ON(!GC_MARK(b) && GC_SECTORS_USED(b));
+
+ if (!GC_MARK(b) || GC_MARK(b) == GC_MARK_RECLAIMABLE)
+ available++;
+ }
+ }
+
+ mutex_unlock(&c->bucket_lock);
+ return available;
+}
+
+static void bch_btree_gc(struct cache_set *c)
+{
+ int ret;
+ unsigned long available;
+ struct gc_stat stats;
+ struct closure writes;
+ struct btree_op op;
+ uint64_t start_time = local_clock();
+
+ trace_bcache_gc_start(c);
+
+ memset(&stats, 0, sizeof(struct gc_stat));
+ closure_init_stack(&writes);
+ bch_btree_op_init(&op, SHRT_MAX);
+
+ btree_gc_start(c);
+
+ do {
+ ret = btree_root(gc_root, c, &op, &writes, &stats);
+ closure_sync(&writes);
+
+ if (ret && ret != -EAGAIN)
+ pr_warn("gc failed!");
+ } while (ret);
+
+ available = bch_btree_gc_finish(c);
+ wake_up_allocators(c);
+
+ bch_time_stats_update(&c->btree_gc_time, start_time);
+
+ stats.key_bytes *= sizeof(uint64_t);
+ stats.data <<= 9;
+ stats.in_use = (c->nbuckets - available) * 100 / c->nbuckets;
+ memcpy(&c->gc_stats, &stats, sizeof(struct gc_stat));
+
+ trace_bcache_gc_end(c);
+
+ bch_moving_gc(c);
+}
+
+static int bch_gc_thread(void *arg)
+{
+ struct cache_set *c = arg;
+ struct cache *ca;
+ unsigned i;
+
+ while (1) {
+again:
+ bch_btree_gc(c);
+
+ set_current_state(TASK_INTERRUPTIBLE);
+ if (kthread_should_stop())
+ break;
+
+ mutex_lock(&c->bucket_lock);
+
+ for_each_cache(ca, c, i)
+ if (ca->invalidate_needs_gc) {
+ mutex_unlock(&c->bucket_lock);
+ set_current_state(TASK_RUNNING);
+ goto again;
+ }
+
+ mutex_unlock(&c->bucket_lock);
+
+ try_to_freeze();
+ schedule();
+ }
+
+ return 0;
+}
+
+int bch_gc_thread_start(struct cache_set *c)
+{
+ c->gc_thread = kthread_create(bch_gc_thread, c, "bcache_gc");
+ if (IS_ERR(c->gc_thread))
+ return PTR_ERR(c->gc_thread);
+
+ set_task_state(c->gc_thread, TASK_INTERRUPTIBLE);
+ return 0;
+}
+
+/* Initial partial gc */
+
+static int bch_btree_check_recurse(struct btree *b, struct btree_op *op)
+{
+ int ret = 0;
+ struct bkey *k, *p = NULL;
+ struct btree_iter iter;
+
+ for_each_key_filter(&b->keys, k, &iter, bch_ptr_invalid)
+ bch_initial_mark_key(b->c, b->level, k);
+
+ bch_initial_mark_key(b->c, b->level + 1, &b->key);
+
+ if (b->level) {
+ bch_btree_iter_init(&b->keys, &iter, NULL);
+
+ do {
+ k = bch_btree_iter_next_filter(&iter, &b->keys,
+ bch_ptr_bad);
+ if (k)
+ btree_node_prefetch(b, k);
+
+ if (p)
+ ret = btree(check_recurse, p, b, op);
+
+ p = k;
+ } while (p && !ret);
+ }
+
+ return ret;
+}
+
+int bch_btree_check(struct cache_set *c)
+{
+ struct btree_op op;
+
+ bch_btree_op_init(&op, SHRT_MAX);
+
+ return btree_root(check_recurse, c, &op);
+}
+
+void bch_initial_gc_finish(struct cache_set *c)
+{
+ struct cache *ca;
+ struct bucket *b;
+ unsigned i;
+
+ bch_btree_gc_finish(c);
+
+ mutex_lock(&c->bucket_lock);
+
+ /*
+ * We need to put some unused buckets directly on the prio freelist in
+ * order to get the allocator thread started - it needs freed buckets in
+ * order to rewrite the prios and gens, and it needs to rewrite prios
+ * and gens in order to free buckets.
+ *
+ * This is only safe for buckets that have no live data in them, which
+ * there should always be some of.
+ */
+ for_each_cache(ca, c, i) {
+ for_each_bucket(b, ca) {
+ if (fifo_full(&ca->free[RESERVE_PRIO]))
+ break;
+
+ if (bch_can_invalidate_bucket(ca, b) &&
+ !GC_MARK(b)) {
+ __bch_invalidate_one_bucket(ca, b);
+ fifo_push(&ca->free[RESERVE_PRIO],
+ b - ca->buckets);
+ }
+ }
+ }
+
+ mutex_unlock(&c->bucket_lock);
+}
+
+/* Btree insertion */
+
+static bool btree_insert_key(struct btree *b, struct bkey *k,
+ struct bkey *replace_key)
+{
+ unsigned status;
+
+ BUG_ON(bkey_cmp(k, &b->key) > 0);
+
+ status = bch_btree_insert_key(&b->keys, k, replace_key);
+ if (status != BTREE_INSERT_STATUS_NO_INSERT) {
+ bch_check_keys(&b->keys, "%u for %s", status,
+ replace_key ? "replace" : "insert");
+
+ trace_bcache_btree_insert_key(b, k, replace_key != NULL,
+ status);
+ return true;
+ } else
+ return false;
+}
+
+static size_t insert_u64s_remaining(struct btree *b)
+{
+ long ret = bch_btree_keys_u64s_remaining(&b->keys);
+
+ /*
+ * Might land in the middle of an existing extent and have to split it
+ */
+ if (b->keys.ops->is_extents)
+ ret -= KEY_MAX_U64S;
+
+ return max(ret, 0L);
+}
+
+static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op,
+ struct keylist *insert_keys,
+ struct bkey *replace_key)
+{
+ bool ret = false;
+ int oldsize = bch_count_data(&b->keys);
+
+ while (!bch_keylist_empty(insert_keys)) {
+ struct bkey *k = insert_keys->keys;
+
+ if (bkey_u64s(k) > insert_u64s_remaining(b))
+ break;
+
+ if (bkey_cmp(k, &b->key) <= 0) {
+ if (!b->level)
+ bkey_put(b->c, k);
+
+ ret |= btree_insert_key(b, k, replace_key);
+ bch_keylist_pop_front(insert_keys);
+ } else if (bkey_cmp(&START_KEY(k), &b->key) < 0) {
+ BKEY_PADDED(key) temp;
+ bkey_copy(&temp.key, insert_keys->keys);
+
+ bch_cut_back(&b->key, &temp.key);
+ bch_cut_front(&b->key, insert_keys->keys);
+
+ ret |= btree_insert_key(b, &temp.key, replace_key);
+ break;
+ } else {
+ break;
+ }
+ }
+
+ if (!ret)
+ op->insert_collision = true;
+
+ BUG_ON(!bch_keylist_empty(insert_keys) && b->level);
+
+ BUG_ON(bch_count_data(&b->keys) < oldsize);
+ return ret;
+}
+
+static int btree_split(struct btree *b, struct btree_op *op,
+ struct keylist *insert_keys,
+ struct bkey *replace_key)
+{
+ bool split;
+ struct btree *n1, *n2 = NULL, *n3 = NULL;
+ uint64_t start_time = local_clock();
+ struct closure cl;
+ struct keylist parent_keys;
+
+ closure_init_stack(&cl);
+ bch_keylist_init(&parent_keys);
+
+ if (btree_check_reserve(b, op)) {
+ if (!b->level)
+ return -EINTR;
+ else
+ WARN(1, "insufficient reserve for split\n");
+ }
+
+ n1 = btree_node_alloc_replacement(b, op);
+ if (IS_ERR(n1))
+ goto err;
+
+ split = set_blocks(btree_bset_first(n1),
+ block_bytes(n1->c)) > (btree_blocks(b) * 4) / 5;
+
+ if (split) {
+ unsigned keys = 0;
+
+ trace_bcache_btree_node_split(b, btree_bset_first(n1)->keys);
+
+ n2 = bch_btree_node_alloc(b->c, op, b->level, b->parent);
+ if (IS_ERR(n2))
+ goto err_free1;
+
+ if (!b->parent) {
+ n3 = bch_btree_node_alloc(b->c, op, b->level + 1, NULL);
+ if (IS_ERR(n3))
+ goto err_free2;
+ }
+
+ mutex_lock(&n1->write_lock);
+ mutex_lock(&n2->write_lock);
+
+ bch_btree_insert_keys(n1, op, insert_keys, replace_key);
+
+ /*
+ * Has to be a linear search because we don't have an auxiliary
+ * search tree yet
+ */
+
+ while (keys < (btree_bset_first(n1)->keys * 3) / 5)
+ keys += bkey_u64s(bset_bkey_idx(btree_bset_first(n1),
+ keys));
+
+ bkey_copy_key(&n1->key,
+ bset_bkey_idx(btree_bset_first(n1), keys));
+ keys += bkey_u64s(bset_bkey_idx(btree_bset_first(n1), keys));
+
+ btree_bset_first(n2)->keys = btree_bset_first(n1)->keys - keys;
+ btree_bset_first(n1)->keys = keys;
+
+ memcpy(btree_bset_first(n2)->start,
+ bset_bkey_last(btree_bset_first(n1)),
+ btree_bset_first(n2)->keys * sizeof(uint64_t));
+
+ bkey_copy_key(&n2->key, &b->key);
+
+ bch_keylist_add(&parent_keys, &n2->key);
+ bch_btree_node_write(n2, &cl);
+ mutex_unlock(&n2->write_lock);
+ rw_unlock(true, n2);
+ } else {
+ trace_bcache_btree_node_compact(b, btree_bset_first(n1)->keys);
+
+ mutex_lock(&n1->write_lock);
+ bch_btree_insert_keys(n1, op, insert_keys, replace_key);
+ }
+
+ bch_keylist_add(&parent_keys, &n1->key);
+ bch_btree_node_write(n1, &cl);
+ mutex_unlock(&n1->write_lock);
+
+ if (n3) {
+ /* Depth increases, make a new root */
+ mutex_lock(&n3->write_lock);
+ bkey_copy_key(&n3->key, &MAX_KEY);
+ bch_btree_insert_keys(n3, op, &parent_keys, NULL);
+ bch_btree_node_write(n3, &cl);
+ mutex_unlock(&n3->write_lock);
+
+ closure_sync(&cl);
+ bch_btree_set_root(n3);
+ rw_unlock(true, n3);
+ } else if (!b->parent) {
+ /* Root filled up but didn't need to be split */
+ closure_sync(&cl);
+ bch_btree_set_root(n1);
+ } else {
+ /* Split a non root node */
+ closure_sync(&cl);
+ make_btree_freeing_key(b, parent_keys.top);
+ bch_keylist_push(&parent_keys);
+
+ bch_btree_insert_node(b->parent, op, &parent_keys, NULL, NULL);
+ BUG_ON(!bch_keylist_empty(&parent_keys));
+ }
+
+ btree_node_free(b);
+ rw_unlock(true, n1);
+
+ bch_time_stats_update(&b->c->btree_split_time, start_time);
+
+ return 0;
+err_free2:
+ bkey_put(b->c, &n2->key);
+ btree_node_free(n2);
+ rw_unlock(true, n2);
+err_free1:
+ bkey_put(b->c, &n1->key);
+ btree_node_free(n1);
+ rw_unlock(true, n1);
+err:
+ WARN(1, "bcache: btree split failed (level %u)", b->level);
+
+ if (n3 == ERR_PTR(-EAGAIN) ||
+ n2 == ERR_PTR(-EAGAIN) ||
+ n1 == ERR_PTR(-EAGAIN))
+ return -EAGAIN;
+
+ return -ENOMEM;
+}
+
+static int bch_btree_insert_node(struct btree *b, struct btree_op *op,
+ struct keylist *insert_keys,
+ atomic_t *journal_ref,
+ struct bkey *replace_key)
+{
+ struct closure cl;
+
+ BUG_ON(b->level && replace_key);
+
+ closure_init_stack(&cl);
+
+ mutex_lock(&b->write_lock);
+
+ if (write_block(b) != btree_bset_last(b) &&
+ b->keys.last_set_unwritten)
+ bch_btree_init_next(b); /* just wrote a set */
+
+ if (bch_keylist_nkeys(insert_keys) > insert_u64s_remaining(b)) {
+ mutex_unlock(&b->write_lock);
+ goto split;
+ }
+
+ BUG_ON(write_block(b) != btree_bset_last(b));
+
+ if (bch_btree_insert_keys(b, op, insert_keys, replace_key)) {
+ if (!b->level)
+ bch_btree_leaf_dirty(b, journal_ref);
+ else
+ bch_btree_node_write(b, &cl);
+ }
+
+ mutex_unlock(&b->write_lock);
+
+ /* wait for btree node write if necessary, after unlock */
+ closure_sync(&cl);
+
+ return 0;
+split:
+ if (current->bio_list) {
+ op->lock = b->c->root->level + 1;
+ return -EAGAIN;
+ } else if (op->lock <= b->c->root->level) {
+ op->lock = b->c->root->level + 1;
+ return -EINTR;
+ } else {
+ /* Invalidated all iterators */
+ int ret = btree_split(b, op, insert_keys, replace_key);
+
+ if (bch_keylist_empty(insert_keys))
+ return 0;
+ else if (!ret)
+ return -EINTR;
+ return ret;
+ }
+}
+
+int bch_btree_insert_check_key(struct btree *b, struct btree_op *op,
+ struct bkey *check_key)
+{
+ int ret = -EINTR;
+ uint64_t btree_ptr = b->key.ptr[0];
+ unsigned long seq = b->seq;
+ struct keylist insert;
+ bool upgrade = op->lock == -1;
+
+ bch_keylist_init(&insert);
+
+ if (upgrade) {
+ rw_unlock(false, b);
+ rw_lock(true, b, b->level);
+
+ if (b->key.ptr[0] != btree_ptr ||
+ b->seq != seq + 1)
+ goto out;
+ }
+
+ SET_KEY_PTRS(check_key, 1);
+ get_random_bytes(&check_key->ptr[0], sizeof(uint64_t));
+
+ SET_PTR_DEV(check_key, 0, PTR_CHECK_DEV);
+
+ bch_keylist_add(&insert, check_key);
+
+ ret = bch_btree_insert_node(b, op, &insert, NULL, NULL);
+
+ BUG_ON(!ret && !bch_keylist_empty(&insert));
+out:
+ if (upgrade)
+ downgrade_write(&b->lock);
+ return ret;
+}
+
+struct btree_insert_op {
+ struct btree_op op;
+ struct keylist *keys;
+ atomic_t *journal_ref;
+ struct bkey *replace_key;
+};
+
+static int btree_insert_fn(struct btree_op *b_op, struct btree *b)
+{
+ struct btree_insert_op *op = container_of(b_op,
+ struct btree_insert_op, op);
+
+ int ret = bch_btree_insert_node(b, &op->op, op->keys,
+ op->journal_ref, op->replace_key);
+ if (ret && !bch_keylist_empty(op->keys))
+ return ret;
+ else
+ return MAP_DONE;
+}
+
+int bch_btree_insert(struct cache_set *c, struct keylist *keys,
+ atomic_t *journal_ref, struct bkey *replace_key)
+{
+ struct btree_insert_op op;
+ int ret = 0;
+
+ BUG_ON(current->bio_list);
+ BUG_ON(bch_keylist_empty(keys));
+
+ bch_btree_op_init(&op.op, 0);
+ op.keys = keys;
+ op.journal_ref = journal_ref;
+ op.replace_key = replace_key;
+
+ while (!ret && !bch_keylist_empty(keys)) {
+ op.op.lock = 0;
+ ret = bch_btree_map_leaf_nodes(&op.op, c,
+ &START_KEY(keys->keys),
+ btree_insert_fn);
+ }
+
+ if (ret) {
+ struct bkey *k;
+
+ pr_err("error %i", ret);
+
+ while ((k = bch_keylist_pop(keys)))
+ bkey_put(c, k);
+ } else if (op.op.insert_collision)
+ ret = -ESRCH;
+
+ return ret;
+}
+
+void bch_btree_set_root(struct btree *b)
+{
+ unsigned i;
+ struct closure cl;
+
+ closure_init_stack(&cl);
+
+ trace_bcache_btree_set_root(b);
+
+ BUG_ON(!b->written);
+
+ for (i = 0; i < KEY_PTRS(&b->key); i++)
+ BUG_ON(PTR_BUCKET(b->c, &b->key, i)->prio != BTREE_PRIO);
+
+ mutex_lock(&b->c->bucket_lock);
+ list_del_init(&b->list);
+ mutex_unlock(&b->c->bucket_lock);
+
+ b->c->root = b;
+
+ bch_journal_meta(b->c, &cl);
+ closure_sync(&cl);
+}
+
+/* Map across nodes or keys */
+
+static int bch_btree_map_nodes_recurse(struct btree *b, struct btree_op *op,
+ struct bkey *from,
+ btree_map_nodes_fn *fn, int flags)
+{
+ int ret = MAP_CONTINUE;
+
+ if (b->level) {
+ struct bkey *k;
+ struct btree_iter iter;
+
+ bch_btree_iter_init(&b->keys, &iter, from);
+
+ while ((k = bch_btree_iter_next_filter(&iter, &b->keys,
+ bch_ptr_bad))) {
+ ret = btree(map_nodes_recurse, k, b,
+ op, from, fn, flags);
+ from = NULL;
+
+ if (ret != MAP_CONTINUE)
+ return ret;
+ }
+ }
+
+ if (!b->level || flags == MAP_ALL_NODES)
+ ret = fn(op, b);
+
+ return ret;
+}
+
+int __bch_btree_map_nodes(struct btree_op *op, struct cache_set *c,
+ struct bkey *from, btree_map_nodes_fn *fn, int flags)
+{
+ return btree_root(map_nodes_recurse, c, op, from, fn, flags);
+}
+
+static int bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op,
+ struct bkey *from, btree_map_keys_fn *fn,
+ int flags)
+{
+ int ret = MAP_CONTINUE;
+ struct bkey *k;
+ struct btree_iter iter;
+
+ bch_btree_iter_init(&b->keys, &iter, from);
+
+ while ((k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad))) {
+ ret = !b->level
+ ? fn(op, b, k)
+ : btree(map_keys_recurse, k, b, op, from, fn, flags);
+ from = NULL;
+
+ if (ret != MAP_CONTINUE)
+ return ret;
+ }
+
+ if (!b->level && (flags & MAP_END_KEY))
+ ret = fn(op, b, &KEY(KEY_INODE(&b->key),
+ KEY_OFFSET(&b->key), 0));
+
+ return ret;
+}
+
+int bch_btree_map_keys(struct btree_op *op, struct cache_set *c,
+ struct bkey *from, btree_map_keys_fn *fn, int flags)
+{
+ return btree_root(map_keys_recurse, c, op, from, fn, flags);
+}
+
+/* Keybuf code */
+
+static inline int keybuf_cmp(struct keybuf_key *l, struct keybuf_key *r)
+{
+ /* Overlapping keys compare equal */
+ if (bkey_cmp(&l->key, &START_KEY(&r->key)) <= 0)
+ return -1;
+ if (bkey_cmp(&START_KEY(&l->key), &r->key) >= 0)
+ return 1;
+ return 0;
+}
+
+static inline int keybuf_nonoverlapping_cmp(struct keybuf_key *l,
+ struct keybuf_key *r)
+{
+ return clamp_t(int64_t, bkey_cmp(&l->key, &r->key), -1, 1);
+}
+
+struct refill {
+ struct btree_op op;
+ unsigned nr_found;
+ struct keybuf *buf;
+ struct bkey *end;
+ keybuf_pred_fn *pred;
+};
+
+static int refill_keybuf_fn(struct btree_op *op, struct btree *b,
+ struct bkey *k)
+{
+ struct refill *refill = container_of(op, struct refill, op);
+ struct keybuf *buf = refill->buf;
+ int ret = MAP_CONTINUE;
+
+ if (bkey_cmp(k, refill->end) >= 0) {
+ ret = MAP_DONE;
+ goto out;
+ }
+
+ if (!KEY_SIZE(k)) /* end key */
+ goto out;
+
+ if (refill->pred(buf, k)) {
+ struct keybuf_key *w;
+
+ spin_lock(&buf->lock);
+
+ w = array_alloc(&buf->freelist);
+ if (!w) {
+ spin_unlock(&buf->lock);
+ return MAP_DONE;
+ }
+
+ w->private = NULL;
+ bkey_copy(&w->key, k);
+
+ if (RB_INSERT(&buf->keys, w, node, keybuf_cmp))
+ array_free(&buf->freelist, w);
+ else
+ refill->nr_found++;
+
+ if (array_freelist_empty(&buf->freelist))
+ ret = MAP_DONE;
+
+ spin_unlock(&buf->lock);
+ }
+out:
+ buf->last_scanned = *k;
+ return ret;
+}
+
+void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf,
+ struct bkey *end, keybuf_pred_fn *pred)
+{
+ struct bkey start = buf->last_scanned;
+ struct refill refill;
+
+ cond_resched();
+
+ bch_btree_op_init(&refill.op, -1);
+ refill.nr_found = 0;
+ refill.buf = buf;
+ refill.end = end;
+ refill.pred = pred;
+
+ bch_btree_map_keys(&refill.op, c, &buf->last_scanned,
+ refill_keybuf_fn, MAP_END_KEY);
+
+ trace_bcache_keyscan(refill.nr_found,
+ KEY_INODE(&start), KEY_OFFSET(&start),
+ KEY_INODE(&buf->last_scanned),
+ KEY_OFFSET(&buf->last_scanned));
+
+ spin_lock(&buf->lock);
+
+ if (!RB_EMPTY_ROOT(&buf->keys)) {
+ struct keybuf_key *w;
+ w = RB_FIRST(&buf->keys, struct keybuf_key, node);
+ buf->start = START_KEY(&w->key);
+
+ w = RB_LAST(&buf->keys, struct keybuf_key, node);
+ buf->end = w->key;
+ } else {
+ buf->start = MAX_KEY;
+ buf->end = MAX_KEY;
+ }
+
+ spin_unlock(&buf->lock);
+}
+
+static void __bch_keybuf_del(struct keybuf *buf, struct keybuf_key *w)
+{
+ rb_erase(&w->node, &buf->keys);
+ array_free(&buf->freelist, w);
+}
+
+void bch_keybuf_del(struct keybuf *buf, struct keybuf_key *w)
+{
+ spin_lock(&buf->lock);
+ __bch_keybuf_del(buf, w);
+ spin_unlock(&buf->lock);
+}
+
+bool bch_keybuf_check_overlapping(struct keybuf *buf, struct bkey *start,
+ struct bkey *end)
+{
+ bool ret = false;
+ struct keybuf_key *p, *w, s;
+ s.key = *start;
+
+ if (bkey_cmp(end, &buf->start) <= 0 ||
+ bkey_cmp(start, &buf->end) >= 0)
+ return false;
+
+ spin_lock(&buf->lock);
+ w = RB_GREATER(&buf->keys, s, node, keybuf_nonoverlapping_cmp);
+
+ while (w && bkey_cmp(&START_KEY(&w->key), end) < 0) {
+ p = w;
+ w = RB_NEXT(w, node);
+
+ if (p->private)
+ ret = true;
+ else
+ __bch_keybuf_del(buf, p);
+ }
+
+ spin_unlock(&buf->lock);
+ return ret;
+}
+
+struct keybuf_key *bch_keybuf_next(struct keybuf *buf)
+{
+ struct keybuf_key *w;
+ spin_lock(&buf->lock);
+
+ w = RB_FIRST(&buf->keys, struct keybuf_key, node);
+
+ while (w && w->private)
+ w = RB_NEXT(w, node);
+
+ if (w)
+ w->private = ERR_PTR(-EINTR);
+
+ spin_unlock(&buf->lock);
+ return w;
+}
+
+struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *c,
+ struct keybuf *buf,
+ struct bkey *end,
+ keybuf_pred_fn *pred)
+{
+ struct keybuf_key *ret;
+
+ while (1) {
+ ret = bch_keybuf_next(buf);
+ if (ret)
+ break;
+
+ if (bkey_cmp(&buf->last_scanned, end) >= 0) {
+ pr_debug("scan finished");
+ break;
+ }
+
+ bch_refill_keybuf(c, buf, end, pred);
+ }
+
+ return ret;
+}
+
+void bch_keybuf_init(struct keybuf *buf)
+{
+ buf->last_scanned = MAX_KEY;
+ buf->keys = RB_ROOT;
+
+ spin_lock_init(&buf->lock);
+ array_allocator_init(&buf->freelist);
+}
diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h
new file mode 100644
index 000000000..5c391fa01
--- /dev/null
+++ b/drivers/md/bcache/btree.h
@@ -0,0 +1,310 @@
+#ifndef _BCACHE_BTREE_H
+#define _BCACHE_BTREE_H
+
+/*
+ * THE BTREE:
+ *
+ * At a high level, bcache's btree is relatively standard b+ tree. All keys and
+ * pointers are in the leaves; interior nodes only have pointers to the child
+ * nodes.
+ *
+ * In the interior nodes, a struct bkey always points to a child btree node, and
+ * the key is the highest key in the child node - except that the highest key in
+ * an interior node is always MAX_KEY. The size field refers to the size on disk
+ * of the child node - this would allow us to have variable sized btree nodes
+ * (handy for keeping the depth of the btree 1 by expanding just the root).
+ *
+ * Btree nodes are themselves log structured, but this is hidden fairly
+ * thoroughly. Btree nodes on disk will in practice have extents that overlap
+ * (because they were written at different times), but in memory we never have
+ * overlapping extents - when we read in a btree node from disk, the first thing
+ * we do is resort all the sets of keys with a mergesort, and in the same pass
+ * we check for overlapping extents and adjust them appropriately.
+ *
+ * struct btree_op is a central interface to the btree code. It's used for
+ * specifying read vs. write locking, and the embedded closure is used for
+ * waiting on IO or reserve memory.
+ *
+ * BTREE CACHE:
+ *
+ * Btree nodes are cached in memory; traversing the btree might require reading
+ * in btree nodes which is handled mostly transparently.
+ *
+ * bch_btree_node_get() looks up a btree node in the cache and reads it in from
+ * disk if necessary. This function is almost never called directly though - the
+ * btree() macro is used to get a btree node, call some function on it, and
+ * unlock the node after the function returns.
+ *
+ * The root is special cased - it's taken out of the cache's lru (thus pinning
+ * it in memory), so we can find the root of the btree by just dereferencing a
+ * pointer instead of looking it up in the cache. This makes locking a bit
+ * tricky, since the root pointer is protected by the lock in the btree node it
+ * points to - the btree_root() macro handles this.
+ *
+ * In various places we must be able to allocate memory for multiple btree nodes
+ * in order to make forward progress. To do this we use the btree cache itself
+ * as a reserve; if __get_free_pages() fails, we'll find a node in the btree
+ * cache we can reuse. We can't allow more than one thread to be doing this at a
+ * time, so there's a lock, implemented by a pointer to the btree_op closure -
+ * this allows the btree_root() macro to implicitly release this lock.
+ *
+ * BTREE IO:
+ *
+ * Btree nodes never have to be explicitly read in; bch_btree_node_get() handles
+ * this.
+ *
+ * For writing, we have two btree_write structs embeddded in struct btree - one
+ * write in flight, and one being set up, and we toggle between them.
+ *
+ * Writing is done with a single function - bch_btree_write() really serves two
+ * different purposes and should be broken up into two different functions. When
+ * passing now = false, it merely indicates that the node is now dirty - calling
+ * it ensures that the dirty keys will be written at some point in the future.
+ *
+ * When passing now = true, bch_btree_write() causes a write to happen
+ * "immediately" (if there was already a write in flight, it'll cause the write
+ * to happen as soon as the previous write completes). It returns immediately
+ * though - but it takes a refcount on the closure in struct btree_op you passed
+ * to it, so a closure_sync() later can be used to wait for the write to
+ * complete.
+ *
+ * This is handy because btree_split() and garbage collection can issue writes
+ * in parallel, reducing the amount of time they have to hold write locks.
+ *
+ * LOCKING:
+ *
+ * When traversing the btree, we may need write locks starting at some level -
+ * inserting a key into the btree will typically only require a write lock on
+ * the leaf node.
+ *
+ * This is specified with the lock field in struct btree_op; lock = 0 means we
+ * take write locks at level <= 0, i.e. only leaf nodes. bch_btree_node_get()
+ * checks this field and returns the node with the appropriate lock held.
+ *
+ * If, after traversing the btree, the insertion code discovers it has to split
+ * then it must restart from the root and take new locks - to do this it changes
+ * the lock field and returns -EINTR, which causes the btree_root() macro to
+ * loop.
+ *
+ * Handling cache misses require a different mechanism for upgrading to a write
+ * lock. We do cache lookups with only a read lock held, but if we get a cache
+ * miss and we wish to insert this data into the cache, we have to insert a
+ * placeholder key to detect races - otherwise, we could race with a write and
+ * overwrite the data that was just written to the cache with stale data from
+ * the backing device.
+ *
+ * For this we use a sequence number that write locks and unlocks increment - to
+ * insert the check key it unlocks the btree node and then takes a write lock,
+ * and fails if the sequence number doesn't match.
+ */
+
+#include "bset.h"
+#include "debug.h"
+
+struct btree_write {
+ atomic_t *journal;
+
+ /* If btree_split() frees a btree node, it writes a new pointer to that
+ * btree node indicating it was freed; it takes a refcount on
+ * c->prio_blocked because we can't write the gens until the new
+ * pointer is on disk. This allows btree_write_endio() to release the
+ * refcount that btree_split() took.
+ */
+ int prio_blocked;
+};
+
+struct btree {
+ /* Hottest entries first */
+ struct hlist_node hash;
+
+ /* Key/pointer for this btree node */
+ BKEY_PADDED(key);
+
+ /* Single bit - set when accessed, cleared by shrinker */
+ unsigned long accessed;
+ unsigned long seq;
+ struct rw_semaphore lock;
+ struct cache_set *c;
+ struct btree *parent;
+
+ struct mutex write_lock;
+
+ unsigned long flags;
+ uint16_t written; /* would be nice to kill */
+ uint8_t level;
+
+ struct btree_keys keys;
+
+ /* For outstanding btree writes, used as a lock - protects write_idx */
+ struct closure io;
+ struct semaphore io_mutex;
+
+ struct list_head list;
+ struct delayed_work work;
+
+ struct btree_write writes[2];
+ struct bio *bio;
+};
+
+#define BTREE_FLAG(flag) \
+static inline bool btree_node_ ## flag(struct btree *b) \
+{ return test_bit(BTREE_NODE_ ## flag, &b->flags); } \
+ \
+static inline void set_btree_node_ ## flag(struct btree *b) \
+{ set_bit(BTREE_NODE_ ## flag, &b->flags); } \
+
+enum btree_flags {
+ BTREE_NODE_io_error,
+ BTREE_NODE_dirty,
+ BTREE_NODE_write_idx,
+};
+
+BTREE_FLAG(io_error);
+BTREE_FLAG(dirty);
+BTREE_FLAG(write_idx);
+
+static inline struct btree_write *btree_current_write(struct btree *b)
+{
+ return b->writes + btree_node_write_idx(b);
+}
+
+static inline struct btree_write *btree_prev_write(struct btree *b)
+{
+ return b->writes + (btree_node_write_idx(b) ^ 1);
+}
+
+static inline struct bset *btree_bset_first(struct btree *b)
+{
+ return b->keys.set->data;
+}
+
+static inline struct bset *btree_bset_last(struct btree *b)
+{
+ return bset_tree_last(&b->keys)->data;
+}
+
+static inline unsigned bset_block_offset(struct btree *b, struct bset *i)
+{
+ return bset_sector_offset(&b->keys, i) >> b->c->block_bits;
+}
+
+static inline void set_gc_sectors(struct cache_set *c)
+{
+ atomic_set(&c->sectors_to_gc, c->sb.bucket_size * c->nbuckets / 16);
+}
+
+void bkey_put(struct cache_set *c, struct bkey *k);
+
+/* Looping macros */
+
+#define for_each_cached_btree(b, c, iter) \
+ for (iter = 0; \
+ iter < ARRAY_SIZE((c)->bucket_hash); \
+ iter++) \
+ hlist_for_each_entry_rcu((b), (c)->bucket_hash + iter, hash)
+
+/* Recursing down the btree */
+
+struct btree_op {
+ /* for waiting on btree reserve in btree_split() */
+ wait_queue_t wait;
+
+ /* Btree level at which we start taking write locks */
+ short lock;
+
+ unsigned insert_collision:1;
+};
+
+static inline void bch_btree_op_init(struct btree_op *op, int write_lock_level)
+{
+ memset(op, 0, sizeof(struct btree_op));
+ init_wait(&op->wait);
+ op->lock = write_lock_level;
+}
+
+static inline void rw_lock(bool w, struct btree *b, int level)
+{
+ w ? down_write_nested(&b->lock, level + 1)
+ : down_read_nested(&b->lock, level + 1);
+ if (w)
+ b->seq++;
+}
+
+static inline void rw_unlock(bool w, struct btree *b)
+{
+ if (w)
+ b->seq++;
+ (w ? up_write : up_read)(&b->lock);
+}
+
+void bch_btree_node_read_done(struct btree *);
+void __bch_btree_node_write(struct btree *, struct closure *);
+void bch_btree_node_write(struct btree *, struct closure *);
+
+void bch_btree_set_root(struct btree *);
+struct btree *__bch_btree_node_alloc(struct cache_set *, struct btree_op *,
+ int, bool, struct btree *);
+struct btree *bch_btree_node_get(struct cache_set *, struct btree_op *,
+ struct bkey *, int, bool, struct btree *);
+
+int bch_btree_insert_check_key(struct btree *, struct btree_op *,
+ struct bkey *);
+int bch_btree_insert(struct cache_set *, struct keylist *,
+ atomic_t *, struct bkey *);
+
+int bch_gc_thread_start(struct cache_set *);
+void bch_initial_gc_finish(struct cache_set *);
+void bch_moving_gc(struct cache_set *);
+int bch_btree_check(struct cache_set *);
+void bch_initial_mark_key(struct cache_set *, int, struct bkey *);
+
+static inline void wake_up_gc(struct cache_set *c)
+{
+ if (c->gc_thread)
+ wake_up_process(c->gc_thread);
+}
+
+#define MAP_DONE 0
+#define MAP_CONTINUE 1
+
+#define MAP_ALL_NODES 0
+#define MAP_LEAF_NODES 1
+
+#define MAP_END_KEY 1
+
+typedef int (btree_map_nodes_fn)(struct btree_op *, struct btree *);
+int __bch_btree_map_nodes(struct btree_op *, struct cache_set *,
+ struct bkey *, btree_map_nodes_fn *, int);
+
+static inline int bch_btree_map_nodes(struct btree_op *op, struct cache_set *c,
+ struct bkey *from, btree_map_nodes_fn *fn)
+{
+ return __bch_btree_map_nodes(op, c, from, fn, MAP_ALL_NODES);
+}
+
+static inline int bch_btree_map_leaf_nodes(struct btree_op *op,
+ struct cache_set *c,
+ struct bkey *from,
+ btree_map_nodes_fn *fn)
+{
+ return __bch_btree_map_nodes(op, c, from, fn, MAP_LEAF_NODES);
+}
+
+typedef int (btree_map_keys_fn)(struct btree_op *, struct btree *,
+ struct bkey *);
+int bch_btree_map_keys(struct btree_op *, struct cache_set *,
+ struct bkey *, btree_map_keys_fn *, int);
+
+typedef bool (keybuf_pred_fn)(struct keybuf *, struct bkey *);
+
+void bch_keybuf_init(struct keybuf *);
+void bch_refill_keybuf(struct cache_set *, struct keybuf *,
+ struct bkey *, keybuf_pred_fn *);
+bool bch_keybuf_check_overlapping(struct keybuf *, struct bkey *,
+ struct bkey *);
+void bch_keybuf_del(struct keybuf *, struct keybuf_key *);
+struct keybuf_key *bch_keybuf_next(struct keybuf *);
+struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *, struct keybuf *,
+ struct bkey *, keybuf_pred_fn *);
+
+#endif
diff --git a/drivers/md/bcache/closure.c b/drivers/md/bcache/closure.c
new file mode 100644
index 000000000..7a228de95
--- /dev/null
+++ b/drivers/md/bcache/closure.c
@@ -0,0 +1,222 @@
+/*
+ * Asynchronous refcounty things
+ *
+ * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
+ * Copyright 2012 Google, Inc.
+ */
+
+#include <linux/debugfs.h>
+#include <linux/module.h>
+#include <linux/seq_file.h>
+
+#include "closure.h"
+
+static inline void closure_put_after_sub(struct closure *cl, int flags)
+{
+ int r = flags & CLOSURE_REMAINING_MASK;
+
+ BUG_ON(flags & CLOSURE_GUARD_MASK);
+ BUG_ON(!r && (flags & ~CLOSURE_DESTRUCTOR));
+
+ /* Must deliver precisely one wakeup */
+ if (r == 1 && (flags & CLOSURE_SLEEPING))
+ wake_up_process(cl->task);
+
+ if (!r) {
+ if (cl->fn && !(flags & CLOSURE_DESTRUCTOR)) {
+ atomic_set(&cl->remaining,
+ CLOSURE_REMAINING_INITIALIZER);
+ closure_queue(cl);
+ } else {
+ struct closure *parent = cl->parent;
+ closure_fn *destructor = cl->fn;
+
+ closure_debug_destroy(cl);
+
+ if (destructor)
+ destructor(cl);
+
+ if (parent)
+ closure_put(parent);
+ }
+ }
+}
+
+/* For clearing flags with the same atomic op as a put */
+void closure_sub(struct closure *cl, int v)
+{
+ closure_put_after_sub(cl, atomic_sub_return(v, &cl->remaining));
+}
+EXPORT_SYMBOL(closure_sub);
+
+/**
+ * closure_put - decrement a closure's refcount
+ */
+void closure_put(struct closure *cl)
+{
+ closure_put_after_sub(cl, atomic_dec_return(&cl->remaining));
+}
+EXPORT_SYMBOL(closure_put);
+
+/**
+ * closure_wake_up - wake up all closures on a wait list, without memory barrier
+ */
+void __closure_wake_up(struct closure_waitlist *wait_list)
+{
+ struct llist_node *list;
+ struct closure *cl;
+ struct llist_node *reverse = NULL;
+
+ list = llist_del_all(&wait_list->list);
+
+ /* We first reverse the list to preserve FIFO ordering and fairness */
+
+ while (list) {
+ struct llist_node *t = list;
+ list = llist_next(list);
+
+ t->next = reverse;
+ reverse = t;
+ }
+
+ /* Then do the wakeups */
+
+ while (reverse) {
+ cl = container_of(reverse, struct closure, list);
+ reverse = llist_next(reverse);
+
+ closure_set_waiting(cl, 0);
+ closure_sub(cl, CLOSURE_WAITING + 1);
+ }
+}
+EXPORT_SYMBOL(__closure_wake_up);
+
+/**
+ * closure_wait - add a closure to a waitlist
+ *
+ * @waitlist will own a ref on @cl, which will be released when
+ * closure_wake_up() is called on @waitlist.
+ *
+ */
+bool closure_wait(struct closure_waitlist *waitlist, struct closure *cl)
+{
+ if (atomic_read(&cl->remaining) & CLOSURE_WAITING)
+ return false;
+
+ closure_set_waiting(cl, _RET_IP_);
+ atomic_add(CLOSURE_WAITING + 1, &cl->remaining);
+ llist_add(&cl->list, &waitlist->list);
+
+ return true;
+}
+EXPORT_SYMBOL(closure_wait);
+
+/**
+ * closure_sync - sleep until a closure a closure has nothing left to wait on
+ *
+ * Sleeps until the refcount hits 1 - the thread that's running the closure owns
+ * the last refcount.
+ */
+void closure_sync(struct closure *cl)
+{
+ while (1) {
+ __closure_start_sleep(cl);
+ closure_set_ret_ip(cl);
+
+ if ((atomic_read(&cl->remaining) &
+ CLOSURE_REMAINING_MASK) == 1)
+ break;
+
+ schedule();
+ }
+
+ __closure_end_sleep(cl);
+}
+EXPORT_SYMBOL(closure_sync);
+
+#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
+
+static LIST_HEAD(closure_list);
+static DEFINE_SPINLOCK(closure_list_lock);
+
+void closure_debug_create(struct closure *cl)
+{
+ unsigned long flags;
+
+ BUG_ON(cl->magic == CLOSURE_MAGIC_ALIVE);
+ cl->magic = CLOSURE_MAGIC_ALIVE;
+
+ spin_lock_irqsave(&closure_list_lock, flags);
+ list_add(&cl->all, &closure_list);
+ spin_unlock_irqrestore(&closure_list_lock, flags);
+}
+EXPORT_SYMBOL(closure_debug_create);
+
+void closure_debug_destroy(struct closure *cl)
+{
+ unsigned long flags;
+
+ BUG_ON(cl->magic != CLOSURE_MAGIC_ALIVE);
+ cl->magic = CLOSURE_MAGIC_DEAD;
+
+ spin_lock_irqsave(&closure_list_lock, flags);
+ list_del(&cl->all);
+ spin_unlock_irqrestore(&closure_list_lock, flags);
+}
+EXPORT_SYMBOL(closure_debug_destroy);
+
+static struct dentry *debug;
+
+#define work_data_bits(work) ((unsigned long *)(&(work)->data))
+
+static int debug_seq_show(struct seq_file *f, void *data)
+{
+ struct closure *cl;
+ spin_lock_irq(&closure_list_lock);
+
+ list_for_each_entry(cl, &closure_list, all) {
+ int r = atomic_read(&cl->remaining);
+
+ seq_printf(f, "%p: %pF -> %pf p %p r %i ",
+ cl, (void *) cl->ip, cl->fn, cl->parent,
+ r & CLOSURE_REMAINING_MASK);
+
+ seq_printf(f, "%s%s%s%s\n",
+ test_bit(WORK_STRUCT_PENDING,
+ work_data_bits(&cl->work)) ? "Q" : "",
+ r & CLOSURE_RUNNING ? "R" : "",
+ r & CLOSURE_STACK ? "S" : "",
+ r & CLOSURE_SLEEPING ? "Sl" : "");
+
+ if (r & CLOSURE_WAITING)
+ seq_printf(f, " W %pF\n",
+ (void *) cl->waiting_on);
+
+ seq_printf(f, "\n");
+ }
+
+ spin_unlock_irq(&closure_list_lock);
+ return 0;
+}
+
+static int debug_seq_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, debug_seq_show, NULL);
+}
+
+static const struct file_operations debug_ops = {
+ .owner = THIS_MODULE,
+ .open = debug_seq_open,
+ .read = seq_read,
+ .release = single_release
+};
+
+void __init closure_debug_init(void)
+{
+ debug = debugfs_create_file("closures", 0400, NULL, NULL, &debug_ops);
+}
+
+#endif
+
+MODULE_AUTHOR("Kent Overstreet <koverstreet@google.com>");
+MODULE_LICENSE("GPL");
diff --git a/drivers/md/bcache/closure.h b/drivers/md/bcache/closure.h
new file mode 100644
index 000000000..a08e3eeac
--- /dev/null
+++ b/drivers/md/bcache/closure.h
@@ -0,0 +1,386 @@
+#ifndef _LINUX_CLOSURE_H
+#define _LINUX_CLOSURE_H
+
+#include <linux/llist.h>
+#include <linux/sched.h>
+#include <linux/workqueue.h>
+
+/*
+ * Closure is perhaps the most overused and abused term in computer science, but
+ * since I've been unable to come up with anything better you're stuck with it
+ * again.
+ *
+ * What are closures?
+ *
+ * They embed a refcount. The basic idea is they count "things that are in
+ * progress" - in flight bios, some other thread that's doing something else -
+ * anything you might want to wait on.
+ *
+ * The refcount may be manipulated with closure_get() and closure_put().
+ * closure_put() is where many of the interesting things happen, when it causes
+ * the refcount to go to 0.
+ *
+ * Closures can be used to wait on things both synchronously and asynchronously,
+ * and synchronous and asynchronous use can be mixed without restriction. To
+ * wait synchronously, use closure_sync() - you will sleep until your closure's
+ * refcount hits 1.
+ *
+ * To wait asynchronously, use
+ * continue_at(cl, next_function, workqueue);
+ *
+ * passing it, as you might expect, the function to run when nothing is pending
+ * and the workqueue to run that function out of.
+ *
+ * continue_at() also, critically, is a macro that returns the calling function.
+ * There's good reason for this.
+ *
+ * To use safely closures asynchronously, they must always have a refcount while
+ * they are running owned by the thread that is running them. Otherwise, suppose
+ * you submit some bios and wish to have a function run when they all complete:
+ *
+ * foo_endio(struct bio *bio, int error)
+ * {
+ * closure_put(cl);
+ * }
+ *
+ * closure_init(cl);
+ *
+ * do_stuff();
+ * closure_get(cl);
+ * bio1->bi_endio = foo_endio;
+ * bio_submit(bio1);
+ *
+ * do_more_stuff();
+ * closure_get(cl);
+ * bio2->bi_endio = foo_endio;
+ * bio_submit(bio2);
+ *
+ * continue_at(cl, complete_some_read, system_wq);
+ *
+ * If closure's refcount started at 0, complete_some_read() could run before the
+ * second bio was submitted - which is almost always not what you want! More
+ * importantly, it wouldn't be possible to say whether the original thread or
+ * complete_some_read()'s thread owned the closure - and whatever state it was
+ * associated with!
+ *
+ * So, closure_init() initializes a closure's refcount to 1 - and when a
+ * closure_fn is run, the refcount will be reset to 1 first.
+ *
+ * Then, the rule is - if you got the refcount with closure_get(), release it
+ * with closure_put() (i.e, in a bio->bi_endio function). If you have a refcount
+ * on a closure because you called closure_init() or you were run out of a
+ * closure - _always_ use continue_at(). Doing so consistently will help
+ * eliminate an entire class of particularly pernicious races.
+ *
+ * Lastly, you might have a wait list dedicated to a specific event, and have no
+ * need for specifying the condition - you just want to wait until someone runs
+ * closure_wake_up() on the appropriate wait list. In that case, just use
+ * closure_wait(). It will return either true or false, depending on whether the
+ * closure was already on a wait list or not - a closure can only be on one wait
+ * list at a time.
+ *
+ * Parents:
+ *
+ * closure_init() takes two arguments - it takes the closure to initialize, and
+ * a (possibly null) parent.
+ *
+ * If parent is non null, the new closure will have a refcount for its lifetime;
+ * a closure is considered to be "finished" when its refcount hits 0 and the
+ * function to run is null. Hence
+ *
+ * continue_at(cl, NULL, NULL);
+ *
+ * returns up the (spaghetti) stack of closures, precisely like normal return
+ * returns up the C stack. continue_at() with non null fn is better thought of
+ * as doing a tail call.
+ *
+ * All this implies that a closure should typically be embedded in a particular
+ * struct (which its refcount will normally control the lifetime of), and that
+ * struct can very much be thought of as a stack frame.
+ */
+
+struct closure;
+typedef void (closure_fn) (struct closure *);
+
+struct closure_waitlist {
+ struct llist_head list;
+};
+
+enum closure_state {
+ /*
+ * CLOSURE_WAITING: Set iff the closure is on a waitlist. Must be set by
+ * the thread that owns the closure, and cleared by the thread that's
+ * waking up the closure.
+ *
+ * CLOSURE_SLEEPING: Must be set before a thread uses a closure to sleep
+ * - indicates that cl->task is valid and closure_put() may wake it up.
+ * Only set or cleared by the thread that owns the closure.
+ *
+ * The rest are for debugging and don't affect behaviour:
+ *
+ * CLOSURE_RUNNING: Set when a closure is running (i.e. by
+ * closure_init() and when closure_put() runs then next function), and
+ * must be cleared before remaining hits 0. Primarily to help guard
+ * against incorrect usage and accidentally transferring references.
+ * continue_at() and closure_return() clear it for you, if you're doing
+ * something unusual you can use closure_set_dead() which also helps
+ * annotate where references are being transferred.
+ *
+ * CLOSURE_STACK: Sanity check - remaining should never hit 0 on a
+ * closure with this flag set
+ */
+
+ CLOSURE_BITS_START = (1 << 23),
+ CLOSURE_DESTRUCTOR = (1 << 23),
+ CLOSURE_WAITING = (1 << 25),
+ CLOSURE_SLEEPING = (1 << 27),
+ CLOSURE_RUNNING = (1 << 29),
+ CLOSURE_STACK = (1 << 31),
+};
+
+#define CLOSURE_GUARD_MASK \
+ ((CLOSURE_DESTRUCTOR|CLOSURE_WAITING|CLOSURE_SLEEPING| \
+ CLOSURE_RUNNING|CLOSURE_STACK) << 1)
+
+#define CLOSURE_REMAINING_MASK (CLOSURE_BITS_START - 1)
+#define CLOSURE_REMAINING_INITIALIZER (1|CLOSURE_RUNNING)
+
+struct closure {
+ union {
+ struct {
+ struct workqueue_struct *wq;
+ struct task_struct *task;
+ struct llist_node list;
+ closure_fn *fn;
+ };
+ struct work_struct work;
+ };
+
+ struct closure *parent;
+
+ atomic_t remaining;
+
+#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
+#define CLOSURE_MAGIC_DEAD 0xc054dead
+#define CLOSURE_MAGIC_ALIVE 0xc054a11e
+
+ unsigned magic;
+ struct list_head all;
+ unsigned long ip;
+ unsigned long waiting_on;
+#endif
+};
+
+void closure_sub(struct closure *cl, int v);
+void closure_put(struct closure *cl);
+void __closure_wake_up(struct closure_waitlist *list);
+bool closure_wait(struct closure_waitlist *list, struct closure *cl);
+void closure_sync(struct closure *cl);
+
+#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
+
+void closure_debug_init(void);
+void closure_debug_create(struct closure *cl);
+void closure_debug_destroy(struct closure *cl);
+
+#else
+
+static inline void closure_debug_init(void) {}
+static inline void closure_debug_create(struct closure *cl) {}
+static inline void closure_debug_destroy(struct closure *cl) {}
+
+#endif
+
+static inline void closure_set_ip(struct closure *cl)
+{
+#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
+ cl->ip = _THIS_IP_;
+#endif
+}
+
+static inline void closure_set_ret_ip(struct closure *cl)
+{
+#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
+ cl->ip = _RET_IP_;
+#endif
+}
+
+static inline void closure_set_waiting(struct closure *cl, unsigned long f)
+{
+#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
+ cl->waiting_on = f;
+#endif
+}
+
+static inline void __closure_end_sleep(struct closure *cl)
+{
+ __set_current_state(TASK_RUNNING);
+
+ if (atomic_read(&cl->remaining) & CLOSURE_SLEEPING)
+ atomic_sub(CLOSURE_SLEEPING, &cl->remaining);
+}
+
+static inline void __closure_start_sleep(struct closure *cl)
+{
+ closure_set_ip(cl);
+ cl->task = current;
+ set_current_state(TASK_UNINTERRUPTIBLE);
+
+ if (!(atomic_read(&cl->remaining) & CLOSURE_SLEEPING))
+ atomic_add(CLOSURE_SLEEPING, &cl->remaining);
+}
+
+static inline void closure_set_stopped(struct closure *cl)
+{
+ atomic_sub(CLOSURE_RUNNING, &cl->remaining);
+}
+
+static inline void set_closure_fn(struct closure *cl, closure_fn *fn,
+ struct workqueue_struct *wq)
+{
+ BUG_ON(object_is_on_stack(cl));
+ closure_set_ip(cl);
+ cl->fn = fn;
+ cl->wq = wq;
+ /* between atomic_dec() in closure_put() */
+ smp_mb__before_atomic();
+}
+
+static inline void closure_queue(struct closure *cl)
+{
+ struct workqueue_struct *wq = cl->wq;
+ if (wq) {
+ INIT_WORK(&cl->work, cl->work.func);
+ BUG_ON(!queue_work(wq, &cl->work));
+ } else
+ cl->fn(cl);
+}
+
+/**
+ * closure_get - increment a closure's refcount
+ */
+static inline void closure_get(struct closure *cl)
+{
+#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
+ BUG_ON((atomic_inc_return(&cl->remaining) &
+ CLOSURE_REMAINING_MASK) <= 1);
+#else
+ atomic_inc(&cl->remaining);
+#endif
+}
+
+/**
+ * closure_init - Initialize a closure, setting the refcount to 1
+ * @cl: closure to initialize
+ * @parent: parent of the new closure. cl will take a refcount on it for its
+ * lifetime; may be NULL.
+ */
+static inline void closure_init(struct closure *cl, struct closure *parent)
+{
+ memset(cl, 0, sizeof(struct closure));
+ cl->parent = parent;
+ if (parent)
+ closure_get(parent);
+
+ atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER);
+
+ closure_debug_create(cl);
+ closure_set_ip(cl);
+}
+
+static inline void closure_init_stack(struct closure *cl)
+{
+ memset(cl, 0, sizeof(struct closure));
+ atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER|CLOSURE_STACK);
+}
+
+/**
+ * closure_wake_up - wake up all closures on a wait list.
+ */
+static inline void closure_wake_up(struct closure_waitlist *list)
+{
+ smp_mb();
+ __closure_wake_up(list);
+}
+
+/**
+ * continue_at - jump to another function with barrier
+ *
+ * After @cl is no longer waiting on anything (i.e. all outstanding refs have
+ * been dropped with closure_put()), it will resume execution at @fn running out
+ * of @wq (or, if @wq is NULL, @fn will be called by closure_put() directly).
+ *
+ * NOTE: This macro expands to a return in the calling function!
+ *
+ * This is because after calling continue_at() you no longer have a ref on @cl,
+ * and whatever @cl owns may be freed out from under you - a running closure fn
+ * has a ref on its own closure which continue_at() drops.
+ */
+#define continue_at(_cl, _fn, _wq) \
+do { \
+ set_closure_fn(_cl, _fn, _wq); \
+ closure_sub(_cl, CLOSURE_RUNNING + 1); \
+ return; \
+} while (0)
+
+/**
+ * closure_return - finish execution of a closure
+ *
+ * This is used to indicate that @cl is finished: when all outstanding refs on
+ * @cl have been dropped @cl's ref on its parent closure (as passed to
+ * closure_init()) will be dropped, if one was specified - thus this can be
+ * thought of as returning to the parent closure.
+ */
+#define closure_return(_cl) continue_at((_cl), NULL, NULL)
+
+/**
+ * continue_at_nobarrier - jump to another function without barrier
+ *
+ * Causes @fn to be executed out of @cl, in @wq context (or called directly if
+ * @wq is NULL).
+ *
+ * NOTE: like continue_at(), this macro expands to a return in the caller!
+ *
+ * The ref the caller of continue_at_nobarrier() had on @cl is now owned by @fn,
+ * thus it's not safe to touch anything protected by @cl after a
+ * continue_at_nobarrier().
+ */
+#define continue_at_nobarrier(_cl, _fn, _wq) \
+do { \
+ set_closure_fn(_cl, _fn, _wq); \
+ closure_queue(_cl); \
+ return; \
+} while (0)
+
+/**
+ * closure_return - finish execution of a closure, with destructor
+ *
+ * Works like closure_return(), except @destructor will be called when all
+ * outstanding refs on @cl have been dropped; @destructor may be used to safely
+ * free the memory occupied by @cl, and it is called with the ref on the parent
+ * closure still held - so @destructor could safely return an item to a
+ * freelist protected by @cl's parent.
+ */
+#define closure_return_with_destructor(_cl, _destructor) \
+do { \
+ set_closure_fn(_cl, _destructor, NULL); \
+ closure_sub(_cl, CLOSURE_RUNNING - CLOSURE_DESTRUCTOR + 1); \
+ return; \
+} while (0)
+
+/**
+ * closure_call - execute @fn out of a new, uninitialized closure
+ *
+ * Typically used when running out of one closure, and we want to run @fn
+ * asynchronously out of a new closure - @parent will then wait for @cl to
+ * finish.
+ */
+static inline void closure_call(struct closure *cl, closure_fn fn,
+ struct workqueue_struct *wq,
+ struct closure *parent)
+{
+ closure_init(cl, parent);
+ continue_at_nobarrier(cl, fn, wq);
+}
+
+#endif /* _LINUX_CLOSURE_H */
diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c
new file mode 100644
index 000000000..8b1f1d5c1
--- /dev/null
+++ b/drivers/md/bcache/debug.c
@@ -0,0 +1,252 @@
+/*
+ * Assorted bcache debug code
+ *
+ * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
+ * Copyright 2012 Google, Inc.
+ */
+
+#include "bcache.h"
+#include "btree.h"
+#include "debug.h"
+#include "extents.h"
+
+#include <linux/console.h>
+#include <linux/debugfs.h>
+#include <linux/module.h>
+#include <linux/random.h>
+#include <linux/seq_file.h>
+
+static struct dentry *debug;
+
+#ifdef CONFIG_BCACHE_DEBUG
+
+#define for_each_written_bset(b, start, i) \
+ for (i = (start); \
+ (void *) i < (void *) (start) + (KEY_SIZE(&b->key) << 9) &&\
+ i->seq == (start)->seq; \
+ i = (void *) i + set_blocks(i, block_bytes(b->c)) * \
+ block_bytes(b->c))
+
+void bch_btree_verify(struct btree *b)
+{
+ struct btree *v = b->c->verify_data;
+ struct bset *ondisk, *sorted, *inmemory;
+ struct bio *bio;
+
+ if (!b->c->verify || !b->c->verify_ondisk)
+ return;
+
+ down(&b->io_mutex);
+ mutex_lock(&b->c->verify_lock);
+
+ ondisk = b->c->verify_ondisk;
+ sorted = b->c->verify_data->keys.set->data;
+ inmemory = b->keys.set->data;
+
+ bkey_copy(&v->key, &b->key);
+ v->written = 0;
+ v->level = b->level;
+ v->keys.ops = b->keys.ops;
+
+ bio = bch_bbio_alloc(b->c);
+ bio->bi_bdev = PTR_CACHE(b->c, &b->key, 0)->bdev;
+ bio->bi_iter.bi_sector = PTR_OFFSET(&b->key, 0);
+ bio->bi_iter.bi_size = KEY_SIZE(&v->key) << 9;
+ bch_bio_map(bio, sorted);
+
+ submit_bio_wait(REQ_META|READ_SYNC, bio);
+ bch_bbio_free(bio, b->c);
+
+ memcpy(ondisk, sorted, KEY_SIZE(&v->key) << 9);
+
+ bch_btree_node_read_done(v);
+ sorted = v->keys.set->data;
+
+ if (inmemory->keys != sorted->keys ||
+ memcmp(inmemory->start,
+ sorted->start,
+ (void *) bset_bkey_last(inmemory) - (void *) inmemory->start)) {
+ struct bset *i;
+ unsigned j;
+
+ console_lock();
+
+ printk(KERN_ERR "*** in memory:\n");
+ bch_dump_bset(&b->keys, inmemory, 0);
+
+ printk(KERN_ERR "*** read back in:\n");
+ bch_dump_bset(&v->keys, sorted, 0);
+
+ for_each_written_bset(b, ondisk, i) {
+ unsigned block = ((void *) i - (void *) ondisk) /
+ block_bytes(b->c);
+
+ printk(KERN_ERR "*** on disk block %u:\n", block);
+ bch_dump_bset(&b->keys, i, block);
+ }
+
+ printk(KERN_ERR "*** block %zu not written\n",
+ ((void *) i - (void *) ondisk) / block_bytes(b->c));
+
+ for (j = 0; j < inmemory->keys; j++)
+ if (inmemory->d[j] != sorted->d[j])
+ break;
+
+ printk(KERN_ERR "b->written %u\n", b->written);
+
+ console_unlock();
+ panic("verify failed at %u\n", j);
+ }
+
+ mutex_unlock(&b->c->verify_lock);
+ up(&b->io_mutex);
+}
+
+void bch_data_verify(struct cached_dev *dc, struct bio *bio)
+{
+ char name[BDEVNAME_SIZE];
+ struct bio *check;
+ struct bio_vec bv, *bv2;
+ struct bvec_iter iter;
+ int i;
+
+ check = bio_clone(bio, GFP_NOIO);
+ if (!check)
+ return;
+
+ if (bio_alloc_pages(check, GFP_NOIO))
+ goto out_put;
+
+ submit_bio_wait(READ_SYNC, check);
+
+ bio_for_each_segment(bv, bio, iter) {
+ void *p1 = kmap_atomic(bv.bv_page);
+ void *p2 = page_address(check->bi_io_vec[iter.bi_idx].bv_page);
+
+ cache_set_err_on(memcmp(p1 + bv.bv_offset,
+ p2 + bv.bv_offset,
+ bv.bv_len),
+ dc->disk.c,
+ "verify failed at dev %s sector %llu",
+ bdevname(dc->bdev, name),
+ (uint64_t) bio->bi_iter.bi_sector);
+
+ kunmap_atomic(p1);
+ }
+
+ bio_for_each_segment_all(bv2, check, i)
+ __free_page(bv2->bv_page);
+out_put:
+ bio_put(check);
+}
+
+#endif
+
+#ifdef CONFIG_DEBUG_FS
+
+/* XXX: cache set refcounting */
+
+struct dump_iterator {
+ char buf[PAGE_SIZE];
+ size_t bytes;
+ struct cache_set *c;
+ struct keybuf keys;
+};
+
+static bool dump_pred(struct keybuf *buf, struct bkey *k)
+{
+ return true;
+}
+
+static ssize_t bch_dump_read(struct file *file, char __user *buf,
+ size_t size, loff_t *ppos)
+{
+ struct dump_iterator *i = file->private_data;
+ ssize_t ret = 0;
+ char kbuf[80];
+
+ while (size) {
+ struct keybuf_key *w;
+ unsigned bytes = min(i->bytes, size);
+
+ int err = copy_to_user(buf, i->buf, bytes);
+ if (err)
+ return err;
+
+ ret += bytes;
+ buf += bytes;
+ size -= bytes;
+ i->bytes -= bytes;
+ memmove(i->buf, i->buf + bytes, i->bytes);
+
+ if (i->bytes)
+ break;
+
+ w = bch_keybuf_next_rescan(i->c, &i->keys, &MAX_KEY, dump_pred);
+ if (!w)
+ break;
+
+ bch_extent_to_text(kbuf, sizeof(kbuf), &w->key);
+ i->bytes = snprintf(i->buf, PAGE_SIZE, "%s\n", kbuf);
+ bch_keybuf_del(&i->keys, w);
+ }
+
+ return ret;
+}
+
+static int bch_dump_open(struct inode *inode, struct file *file)
+{
+ struct cache_set *c = inode->i_private;
+ struct dump_iterator *i;
+
+ i = kzalloc(sizeof(struct dump_iterator), GFP_KERNEL);
+ if (!i)
+ return -ENOMEM;
+
+ file->private_data = i;
+ i->c = c;
+ bch_keybuf_init(&i->keys);
+ i->keys.last_scanned = KEY(0, 0, 0);
+
+ return 0;
+}
+
+static int bch_dump_release(struct inode *inode, struct file *file)
+{
+ kfree(file->private_data);
+ return 0;
+}
+
+static const struct file_operations cache_set_debug_ops = {
+ .owner = THIS_MODULE,
+ .open = bch_dump_open,
+ .read = bch_dump_read,
+ .release = bch_dump_release
+};
+
+void bch_debug_init_cache_set(struct cache_set *c)
+{
+ if (!IS_ERR_OR_NULL(debug)) {
+ char name[50];
+ snprintf(name, 50, "bcache-%pU", c->sb.set_uuid);
+
+ c->debug = debugfs_create_file(name, 0400, debug, c,
+ &cache_set_debug_ops);
+ }
+}
+
+#endif
+
+void bch_debug_exit(void)
+{
+ if (!IS_ERR_OR_NULL(debug))
+ debugfs_remove_recursive(debug);
+}
+
+int __init bch_debug_init(struct kobject *kobj)
+{
+ int ret = 0;
+
+ debug = debugfs_create_dir("bcache", NULL);
+ return ret;
+}
diff --git a/drivers/md/bcache/debug.h b/drivers/md/bcache/debug.h
new file mode 100644
index 000000000..1f63c195d
--- /dev/null
+++ b/drivers/md/bcache/debug.h
@@ -0,0 +1,34 @@
+#ifndef _BCACHE_DEBUG_H
+#define _BCACHE_DEBUG_H
+
+struct bio;
+struct cached_dev;
+struct cache_set;
+
+#ifdef CONFIG_BCACHE_DEBUG
+
+void bch_btree_verify(struct btree *);
+void bch_data_verify(struct cached_dev *, struct bio *);
+
+#define expensive_debug_checks(c) ((c)->expensive_debug_checks)
+#define key_merging_disabled(c) ((c)->key_merging_disabled)
+#define bypass_torture_test(d) ((d)->bypass_torture_test)
+
+#else /* DEBUG */
+
+static inline void bch_btree_verify(struct btree *b) {}
+static inline void bch_data_verify(struct cached_dev *dc, struct bio *bio) {}
+
+#define expensive_debug_checks(c) 0
+#define key_merging_disabled(c) 0
+#define bypass_torture_test(d) 0
+
+#endif
+
+#ifdef CONFIG_DEBUG_FS
+void bch_debug_init_cache_set(struct cache_set *);
+#else
+static inline void bch_debug_init_cache_set(struct cache_set *c) {}
+#endif
+
+#endif
diff --git a/drivers/md/bcache/extents.c b/drivers/md/bcache/extents.c
new file mode 100644
index 000000000..243de0bf1
--- /dev/null
+++ b/drivers/md/bcache/extents.c
@@ -0,0 +1,625 @@
+/*
+ * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com>
+ *
+ * Uses a block device as cache for other block devices; optimized for SSDs.
+ * All allocation is done in buckets, which should match the erase block size
+ * of the device.
+ *
+ * Buckets containing cached data are kept on a heap sorted by priority;
+ * bucket priority is increased on cache hit, and periodically all the buckets
+ * on the heap have their priority scaled down. This currently is just used as
+ * an LRU but in the future should allow for more intelligent heuristics.
+ *
+ * Buckets have an 8 bit counter; freeing is accomplished by incrementing the
+ * counter. Garbage collection is used to remove stale pointers.
+ *
+ * Indexing is done via a btree; nodes are not necessarily fully sorted, rather
+ * as keys are inserted we only sort the pages that have not yet been written.
+ * When garbage collection is run, we resort the entire node.
+ *
+ * All configuration is done via sysfs; see Documentation/bcache.txt.
+ */
+
+#include "bcache.h"
+#include "btree.h"
+#include "debug.h"
+#include "extents.h"
+#include "writeback.h"
+
+static void sort_key_next(struct btree_iter *iter,
+ struct btree_iter_set *i)
+{
+ i->k = bkey_next(i->k);
+
+ if (i->k == i->end)
+ *i = iter->data[--iter->used];
+}
+
+static bool bch_key_sort_cmp(struct btree_iter_set l,
+ struct btree_iter_set r)
+{
+ int64_t c = bkey_cmp(l.k, r.k);
+
+ return c ? c > 0 : l.k < r.k;
+}
+
+static bool __ptr_invalid(struct cache_set *c, const struct bkey *k)
+{
+ unsigned i;
+
+ for (i = 0; i < KEY_PTRS(k); i++)
+ if (ptr_available(c, k, i)) {
+ struct cache *ca = PTR_CACHE(c, k, i);
+ size_t bucket = PTR_BUCKET_NR(c, k, i);
+ size_t r = bucket_remainder(c, PTR_OFFSET(k, i));
+
+ if (KEY_SIZE(k) + r > c->sb.bucket_size ||
+ bucket < ca->sb.first_bucket ||
+ bucket >= ca->sb.nbuckets)
+ return true;
+ }
+
+ return false;
+}
+
+/* Common among btree and extent ptrs */
+
+static const char *bch_ptr_status(struct cache_set *c, const struct bkey *k)
+{
+ unsigned i;
+
+ for (i = 0; i < KEY_PTRS(k); i++)
+ if (ptr_available(c, k, i)) {
+ struct cache *ca = PTR_CACHE(c, k, i);
+ size_t bucket = PTR_BUCKET_NR(c, k, i);
+ size_t r = bucket_remainder(c, PTR_OFFSET(k, i));
+
+ if (KEY_SIZE(k) + r > c->sb.bucket_size)
+ return "bad, length too big";
+ if (bucket < ca->sb.first_bucket)
+ return "bad, short offset";
+ if (bucket >= ca->sb.nbuckets)
+ return "bad, offset past end of device";
+ if (ptr_stale(c, k, i))
+ return "stale";
+ }
+
+ if (!bkey_cmp(k, &ZERO_KEY))
+ return "bad, null key";
+ if (!KEY_PTRS(k))
+ return "bad, no pointers";
+ if (!KEY_SIZE(k))
+ return "zeroed key";
+ return "";
+}
+
+void bch_extent_to_text(char *buf, size_t size, const struct bkey *k)
+{
+ unsigned i = 0;
+ char *out = buf, *end = buf + size;
+
+#define p(...) (out += scnprintf(out, end - out, __VA_ARGS__))
+
+ p("%llu:%llu len %llu -> [", KEY_INODE(k), KEY_START(k), KEY_SIZE(k));
+
+ for (i = 0; i < KEY_PTRS(k); i++) {
+ if (i)
+ p(", ");
+
+ if (PTR_DEV(k, i) == PTR_CHECK_DEV)
+ p("check dev");
+ else
+ p("%llu:%llu gen %llu", PTR_DEV(k, i),
+ PTR_OFFSET(k, i), PTR_GEN(k, i));
+ }
+
+ p("]");
+
+ if (KEY_DIRTY(k))
+ p(" dirty");
+ if (KEY_CSUM(k))
+ p(" cs%llu %llx", KEY_CSUM(k), k->ptr[1]);
+#undef p
+}
+
+static void bch_bkey_dump(struct btree_keys *keys, const struct bkey *k)
+{
+ struct btree *b = container_of(keys, struct btree, keys);
+ unsigned j;
+ char buf[80];
+
+ bch_extent_to_text(buf, sizeof(buf), k);
+ printk(" %s", buf);
+
+ for (j = 0; j < KEY_PTRS(k); j++) {
+ size_t n = PTR_BUCKET_NR(b->c, k, j);
+ printk(" bucket %zu", n);
+
+ if (n >= b->c->sb.first_bucket && n < b->c->sb.nbuckets)
+ printk(" prio %i",
+ PTR_BUCKET(b->c, k, j)->prio);
+ }
+
+ printk(" %s\n", bch_ptr_status(b->c, k));
+}
+
+/* Btree ptrs */
+
+bool __bch_btree_ptr_invalid(struct cache_set *c, const struct bkey *k)
+{
+ char buf[80];
+
+ if (!KEY_PTRS(k) || !KEY_SIZE(k) || KEY_DIRTY(k))
+ goto bad;
+
+ if (__ptr_invalid(c, k))
+ goto bad;
+
+ return false;
+bad:
+ bch_extent_to_text(buf, sizeof(buf), k);
+ cache_bug(c, "spotted btree ptr %s: %s", buf, bch_ptr_status(c, k));
+ return true;
+}
+
+static bool bch_btree_ptr_invalid(struct btree_keys *bk, const struct bkey *k)
+{
+ struct btree *b = container_of(bk, struct btree, keys);
+ return __bch_btree_ptr_invalid(b->c, k);
+}
+
+static bool btree_ptr_bad_expensive(struct btree *b, const struct bkey *k)
+{
+ unsigned i;
+ char buf[80];
+ struct bucket *g;
+
+ if (mutex_trylock(&b->c->bucket_lock)) {
+ for (i = 0; i < KEY_PTRS(k); i++)
+ if (ptr_available(b->c, k, i)) {
+ g = PTR_BUCKET(b->c, k, i);
+
+ if (KEY_DIRTY(k) ||
+ g->prio != BTREE_PRIO ||
+ (b->c->gc_mark_valid &&
+ GC_MARK(g) != GC_MARK_METADATA))
+ goto err;
+ }
+
+ mutex_unlock(&b->c->bucket_lock);
+ }
+
+ return false;
+err:
+ mutex_unlock(&b->c->bucket_lock);
+ bch_extent_to_text(buf, sizeof(buf), k);
+ btree_bug(b,
+"inconsistent btree pointer %s: bucket %zi pin %i prio %i gen %i last_gc %i mark %llu",
+ buf, PTR_BUCKET_NR(b->c, k, i), atomic_read(&g->pin),
+ g->prio, g->gen, g->last_gc, GC_MARK(g));
+ return true;
+}
+
+static bool bch_btree_ptr_bad(struct btree_keys *bk, const struct bkey *k)
+{
+ struct btree *b = container_of(bk, struct btree, keys);
+ unsigned i;
+
+ if (!bkey_cmp(k, &ZERO_KEY) ||
+ !KEY_PTRS(k) ||
+ bch_ptr_invalid(bk, k))
+ return true;
+
+ for (i = 0; i < KEY_PTRS(k); i++)
+ if (!ptr_available(b->c, k, i) ||
+ ptr_stale(b->c, k, i))
+ return true;
+
+ if (expensive_debug_checks(b->c) &&
+ btree_ptr_bad_expensive(b, k))
+ return true;
+
+ return false;
+}
+
+static bool bch_btree_ptr_insert_fixup(struct btree_keys *bk,
+ struct bkey *insert,
+ struct btree_iter *iter,
+ struct bkey *replace_key)
+{
+ struct btree *b = container_of(bk, struct btree, keys);
+
+ if (!KEY_OFFSET(insert))
+ btree_current_write(b)->prio_blocked++;
+
+ return false;
+}
+
+const struct btree_keys_ops bch_btree_keys_ops = {
+ .sort_cmp = bch_key_sort_cmp,
+ .insert_fixup = bch_btree_ptr_insert_fixup,
+ .key_invalid = bch_btree_ptr_invalid,
+ .key_bad = bch_btree_ptr_bad,
+ .key_to_text = bch_extent_to_text,
+ .key_dump = bch_bkey_dump,
+};
+
+/* Extents */
+
+/*
+ * Returns true if l > r - unless l == r, in which case returns true if l is
+ * older than r.
+ *
+ * Necessary for btree_sort_fixup() - if there are multiple keys that compare
+ * equal in different sets, we have to process them newest to oldest.
+ */
+static bool bch_extent_sort_cmp(struct btree_iter_set l,
+ struct btree_iter_set r)
+{
+ int64_t c = bkey_cmp(&START_KEY(l.k), &START_KEY(r.k));
+
+ return c ? c > 0 : l.k < r.k;
+}
+
+static struct bkey *bch_extent_sort_fixup(struct btree_iter *iter,
+ struct bkey *tmp)
+{
+ while (iter->used > 1) {
+ struct btree_iter_set *top = iter->data, *i = top + 1;
+
+ if (iter->used > 2 &&
+ bch_extent_sort_cmp(i[0], i[1]))
+ i++;
+
+ if (bkey_cmp(top->k, &START_KEY(i->k)) <= 0)
+ break;
+
+ if (!KEY_SIZE(i->k)) {
+ sort_key_next(iter, i);
+ heap_sift(iter, i - top, bch_extent_sort_cmp);
+ continue;
+ }
+
+ if (top->k > i->k) {
+ if (bkey_cmp(top->k, i->k) >= 0)
+ sort_key_next(iter, i);
+ else
+ bch_cut_front(top->k, i->k);
+
+ heap_sift(iter, i - top, bch_extent_sort_cmp);
+ } else {
+ /* can't happen because of comparison func */
+ BUG_ON(!bkey_cmp(&START_KEY(top->k), &START_KEY(i->k)));
+
+ if (bkey_cmp(i->k, top->k) < 0) {
+ bkey_copy(tmp, top->k);
+
+ bch_cut_back(&START_KEY(i->k), tmp);
+ bch_cut_front(i->k, top->k);
+ heap_sift(iter, 0, bch_extent_sort_cmp);
+
+ return tmp;
+ } else {
+ bch_cut_back(&START_KEY(i->k), top->k);
+ }
+ }
+ }
+
+ return NULL;
+}
+
+static void bch_subtract_dirty(struct bkey *k,
+ struct cache_set *c,
+ uint64_t offset,
+ int sectors)
+{
+ if (KEY_DIRTY(k))
+ bcache_dev_sectors_dirty_add(c, KEY_INODE(k),
+ offset, -sectors);
+}
+
+static bool bch_extent_insert_fixup(struct btree_keys *b,
+ struct bkey *insert,
+ struct btree_iter *iter,
+ struct bkey *replace_key)
+{
+ struct cache_set *c = container_of(b, struct btree, keys)->c;
+
+ uint64_t old_offset;
+ unsigned old_size, sectors_found = 0;
+
+ BUG_ON(!KEY_OFFSET(insert));
+ BUG_ON(!KEY_SIZE(insert));
+
+ while (1) {
+ struct bkey *k = bch_btree_iter_next(iter);
+ if (!k)
+ break;
+
+ if (bkey_cmp(&START_KEY(k), insert) >= 0) {
+ if (KEY_SIZE(k))
+ break;
+ else
+ continue;
+ }
+
+ if (bkey_cmp(k, &START_KEY(insert)) <= 0)
+ continue;
+
+ old_offset = KEY_START(k);
+ old_size = KEY_SIZE(k);
+
+ /*
+ * We might overlap with 0 size extents; we can't skip these
+ * because if they're in the set we're inserting to we have to
+ * adjust them so they don't overlap with the key we're
+ * inserting. But we don't want to check them for replace
+ * operations.
+ */
+
+ if (replace_key && KEY_SIZE(k)) {
+ /*
+ * k might have been split since we inserted/found the
+ * key we're replacing
+ */
+ unsigned i;
+ uint64_t offset = KEY_START(k) -
+ KEY_START(replace_key);
+
+ /* But it must be a subset of the replace key */
+ if (KEY_START(k) < KEY_START(replace_key) ||
+ KEY_OFFSET(k) > KEY_OFFSET(replace_key))
+ goto check_failed;
+
+ /* We didn't find a key that we were supposed to */
+ if (KEY_START(k) > KEY_START(insert) + sectors_found)
+ goto check_failed;
+
+ if (!bch_bkey_equal_header(k, replace_key))
+ goto check_failed;
+
+ /* skip past gen */
+ offset <<= 8;
+
+ BUG_ON(!KEY_PTRS(replace_key));
+
+ for (i = 0; i < KEY_PTRS(replace_key); i++)
+ if (k->ptr[i] != replace_key->ptr[i] + offset)
+ goto check_failed;
+
+ sectors_found = KEY_OFFSET(k) - KEY_START(insert);
+ }
+
+ if (bkey_cmp(insert, k) < 0 &&
+ bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0) {
+ /*
+ * We overlapped in the middle of an existing key: that
+ * means we have to split the old key. But we have to do
+ * slightly different things depending on whether the
+ * old key has been written out yet.
+ */
+
+ struct bkey *top;
+
+ bch_subtract_dirty(k, c, KEY_START(insert),
+ KEY_SIZE(insert));
+
+ if (bkey_written(b, k)) {
+ /*
+ * We insert a new key to cover the top of the
+ * old key, and the old key is modified in place
+ * to represent the bottom split.
+ *
+ * It's completely arbitrary whether the new key
+ * is the top or the bottom, but it has to match
+ * up with what btree_sort_fixup() does - it
+ * doesn't check for this kind of overlap, it
+ * depends on us inserting a new key for the top
+ * here.
+ */
+ top = bch_bset_search(b, bset_tree_last(b),
+ insert);
+ bch_bset_insert(b, top, k);
+ } else {
+ BKEY_PADDED(key) temp;
+ bkey_copy(&temp.key, k);
+ bch_bset_insert(b, k, &temp.key);
+ top = bkey_next(k);
+ }
+
+ bch_cut_front(insert, top);
+ bch_cut_back(&START_KEY(insert), k);
+ bch_bset_fix_invalidated_key(b, k);
+ goto out;
+ }
+
+ if (bkey_cmp(insert, k) < 0) {
+ bch_cut_front(insert, k);
+ } else {
+ if (bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0)
+ old_offset = KEY_START(insert);
+
+ if (bkey_written(b, k) &&
+ bkey_cmp(&START_KEY(insert), &START_KEY(k)) <= 0) {
+ /*
+ * Completely overwrote, so we don't have to
+ * invalidate the binary search tree
+ */
+ bch_cut_front(k, k);
+ } else {
+ __bch_cut_back(&START_KEY(insert), k);
+ bch_bset_fix_invalidated_key(b, k);
+ }
+ }
+
+ bch_subtract_dirty(k, c, old_offset, old_size - KEY_SIZE(k));
+ }
+
+check_failed:
+ if (replace_key) {
+ if (!sectors_found) {
+ return true;
+ } else if (sectors_found < KEY_SIZE(insert)) {
+ SET_KEY_OFFSET(insert, KEY_OFFSET(insert) -
+ (KEY_SIZE(insert) - sectors_found));
+ SET_KEY_SIZE(insert, sectors_found);
+ }
+ }
+out:
+ if (KEY_DIRTY(insert))
+ bcache_dev_sectors_dirty_add(c, KEY_INODE(insert),
+ KEY_START(insert),
+ KEY_SIZE(insert));
+
+ return false;
+}
+
+bool __bch_extent_invalid(struct cache_set *c, const struct bkey *k)
+{
+ char buf[80];
+
+ if (!KEY_SIZE(k))
+ return true;
+
+ if (KEY_SIZE(k) > KEY_OFFSET(k))
+ goto bad;
+
+ if (__ptr_invalid(c, k))
+ goto bad;
+
+ return false;
+bad:
+ bch_extent_to_text(buf, sizeof(buf), k);
+ cache_bug(c, "spotted extent %s: %s", buf, bch_ptr_status(c, k));
+ return true;
+}
+
+static bool bch_extent_invalid(struct btree_keys *bk, const struct bkey *k)
+{
+ struct btree *b = container_of(bk, struct btree, keys);
+ return __bch_extent_invalid(b->c, k);
+}
+
+static bool bch_extent_bad_expensive(struct btree *b, const struct bkey *k,
+ unsigned ptr)
+{
+ struct bucket *g = PTR_BUCKET(b->c, k, ptr);
+ char buf[80];
+
+ if (mutex_trylock(&b->c->bucket_lock)) {
+ if (b->c->gc_mark_valid &&
+ (!GC_MARK(g) ||
+ GC_MARK(g) == GC_MARK_METADATA ||
+ (GC_MARK(g) != GC_MARK_DIRTY && KEY_DIRTY(k))))
+ goto err;
+
+ if (g->prio == BTREE_PRIO)
+ goto err;
+
+ mutex_unlock(&b->c->bucket_lock);
+ }
+
+ return false;
+err:
+ mutex_unlock(&b->c->bucket_lock);
+ bch_extent_to_text(buf, sizeof(buf), k);
+ btree_bug(b,
+"inconsistent extent pointer %s:\nbucket %zu pin %i prio %i gen %i last_gc %i mark %llu",
+ buf, PTR_BUCKET_NR(b->c, k, ptr), atomic_read(&g->pin),
+ g->prio, g->gen, g->last_gc, GC_MARK(g));
+ return true;
+}
+
+static bool bch_extent_bad(struct btree_keys *bk, const struct bkey *k)
+{
+ struct btree *b = container_of(bk, struct btree, keys);
+ struct bucket *g;
+ unsigned i, stale;
+
+ if (!KEY_PTRS(k) ||
+ bch_extent_invalid(bk, k))
+ return true;
+
+ for (i = 0; i < KEY_PTRS(k); i++)
+ if (!ptr_available(b->c, k, i))
+ return true;
+
+ if (!expensive_debug_checks(b->c) && KEY_DIRTY(k))
+ return false;
+
+ for (i = 0; i < KEY_PTRS(k); i++) {
+ g = PTR_BUCKET(b->c, k, i);
+ stale = ptr_stale(b->c, k, i);
+
+ btree_bug_on(stale > 96, b,
+ "key too stale: %i, need_gc %u",
+ stale, b->c->need_gc);
+
+ btree_bug_on(stale && KEY_DIRTY(k) && KEY_SIZE(k),
+ b, "stale dirty pointer");
+
+ if (stale)
+ return true;
+
+ if (expensive_debug_checks(b->c) &&
+ bch_extent_bad_expensive(b, k, i))
+ return true;
+ }
+
+ return false;
+}
+
+static uint64_t merge_chksums(struct bkey *l, struct bkey *r)
+{
+ return (l->ptr[KEY_PTRS(l)] + r->ptr[KEY_PTRS(r)]) &
+ ~((uint64_t)1 << 63);
+}
+
+static bool bch_extent_merge(struct btree_keys *bk, struct bkey *l, struct bkey *r)
+{
+ struct btree *b = container_of(bk, struct btree, keys);
+ unsigned i;
+
+ if (key_merging_disabled(b->c))
+ return false;
+
+ for (i = 0; i < KEY_PTRS(l); i++)
+ if (l->ptr[i] + PTR(0, KEY_SIZE(l), 0) != r->ptr[i] ||
+ PTR_BUCKET_NR(b->c, l, i) != PTR_BUCKET_NR(b->c, r, i))
+ return false;
+
+ /* Keys with no pointers aren't restricted to one bucket and could
+ * overflow KEY_SIZE
+ */
+ if (KEY_SIZE(l) + KEY_SIZE(r) > USHRT_MAX) {
+ SET_KEY_OFFSET(l, KEY_OFFSET(l) + USHRT_MAX - KEY_SIZE(l));
+ SET_KEY_SIZE(l, USHRT_MAX);
+
+ bch_cut_front(l, r);
+ return false;
+ }
+
+ if (KEY_CSUM(l)) {
+ if (KEY_CSUM(r))
+ l->ptr[KEY_PTRS(l)] = merge_chksums(l, r);
+ else
+ SET_KEY_CSUM(l, 0);
+ }
+
+ SET_KEY_OFFSET(l, KEY_OFFSET(l) + KEY_SIZE(r));
+ SET_KEY_SIZE(l, KEY_SIZE(l) + KEY_SIZE(r));
+
+ return true;
+}
+
+const struct btree_keys_ops bch_extent_keys_ops = {
+ .sort_cmp = bch_extent_sort_cmp,
+ .sort_fixup = bch_extent_sort_fixup,
+ .insert_fixup = bch_extent_insert_fixup,
+ .key_invalid = bch_extent_invalid,
+ .key_bad = bch_extent_bad,
+ .key_merge = bch_extent_merge,
+ .key_to_text = bch_extent_to_text,
+ .key_dump = bch_bkey_dump,
+ .is_extents = true,
+};
diff --git a/drivers/md/bcache/extents.h b/drivers/md/bcache/extents.h
new file mode 100644
index 000000000..e2ed54054
--- /dev/null
+++ b/drivers/md/bcache/extents.h
@@ -0,0 +1,14 @@
+#ifndef _BCACHE_EXTENTS_H
+#define _BCACHE_EXTENTS_H
+
+extern const struct btree_keys_ops bch_btree_keys_ops;
+extern const struct btree_keys_ops bch_extent_keys_ops;
+
+struct bkey;
+struct cache_set;
+
+void bch_extent_to_text(char *, size_t, const struct bkey *);
+bool __bch_btree_ptr_invalid(struct cache_set *, const struct bkey *);
+bool __bch_extent_invalid(struct cache_set *, const struct bkey *);
+
+#endif /* _BCACHE_EXTENTS_H */
diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c
new file mode 100644
index 000000000..fa028fa82
--- /dev/null
+++ b/drivers/md/bcache/io.c
@@ -0,0 +1,243 @@
+/*
+ * Some low level IO code, and hacks for various block layer limitations
+ *
+ * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
+ * Copyright 2012 Google, Inc.
+ */
+
+#include "bcache.h"
+#include "bset.h"
+#include "debug.h"
+
+#include <linux/blkdev.h>
+
+static unsigned bch_bio_max_sectors(struct bio *bio)
+{
+ struct request_queue *q = bdev_get_queue(bio->bi_bdev);
+ struct bio_vec bv;
+ struct bvec_iter iter;
+ unsigned ret = 0, seg = 0;
+
+ if (bio->bi_rw & REQ_DISCARD)
+ return min(bio_sectors(bio), q->limits.max_discard_sectors);
+
+ bio_for_each_segment(bv, bio, iter) {
+ struct bvec_merge_data bvm = {
+ .bi_bdev = bio->bi_bdev,
+ .bi_sector = bio->bi_iter.bi_sector,
+ .bi_size = ret << 9,
+ .bi_rw = bio->bi_rw,
+ };
+
+ if (seg == min_t(unsigned, BIO_MAX_PAGES,
+ queue_max_segments(q)))
+ break;
+
+ if (q->merge_bvec_fn &&
+ q->merge_bvec_fn(q, &bvm, &bv) < (int) bv.bv_len)
+ break;
+
+ seg++;
+ ret += bv.bv_len >> 9;
+ }
+
+ ret = min(ret, queue_max_sectors(q));
+
+ WARN_ON(!ret);
+ ret = max_t(int, ret, bio_iovec(bio).bv_len >> 9);
+
+ return ret;
+}
+
+static void bch_bio_submit_split_done(struct closure *cl)
+{
+ struct bio_split_hook *s = container_of(cl, struct bio_split_hook, cl);
+
+ s->bio->bi_end_io = s->bi_end_io;
+ s->bio->bi_private = s->bi_private;
+ bio_endio_nodec(s->bio, 0);
+
+ closure_debug_destroy(&s->cl);
+ mempool_free(s, s->p->bio_split_hook);
+}
+
+static void bch_bio_submit_split_endio(struct bio *bio, int error)
+{
+ struct closure *cl = bio->bi_private;
+ struct bio_split_hook *s = container_of(cl, struct bio_split_hook, cl);
+
+ if (error)
+ clear_bit(BIO_UPTODATE, &s->bio->bi_flags);
+
+ bio_put(bio);
+ closure_put(cl);
+}
+
+void bch_generic_make_request(struct bio *bio, struct bio_split_pool *p)
+{
+ struct bio_split_hook *s;
+ struct bio *n;
+
+ if (!bio_has_data(bio) && !(bio->bi_rw & REQ_DISCARD))
+ goto submit;
+
+ if (bio_sectors(bio) <= bch_bio_max_sectors(bio))
+ goto submit;
+
+ s = mempool_alloc(p->bio_split_hook, GFP_NOIO);
+ closure_init(&s->cl, NULL);
+
+ s->bio = bio;
+ s->p = p;
+ s->bi_end_io = bio->bi_end_io;
+ s->bi_private = bio->bi_private;
+ bio_get(bio);
+
+ do {
+ n = bio_next_split(bio, bch_bio_max_sectors(bio),
+ GFP_NOIO, s->p->bio_split);
+
+ n->bi_end_io = bch_bio_submit_split_endio;
+ n->bi_private = &s->cl;
+
+ closure_get(&s->cl);
+ generic_make_request(n);
+ } while (n != bio);
+
+ continue_at(&s->cl, bch_bio_submit_split_done, NULL);
+submit:
+ generic_make_request(bio);
+}
+
+/* Bios with headers */
+
+void bch_bbio_free(struct bio *bio, struct cache_set *c)
+{
+ struct bbio *b = container_of(bio, struct bbio, bio);
+ mempool_free(b, c->bio_meta);
+}
+
+struct bio *bch_bbio_alloc(struct cache_set *c)
+{
+ struct bbio *b = mempool_alloc(c->bio_meta, GFP_NOIO);
+ struct bio *bio = &b->bio;
+
+ bio_init(bio);
+ bio->bi_flags |= BIO_POOL_NONE << BIO_POOL_OFFSET;
+ bio->bi_max_vecs = bucket_pages(c);
+ bio->bi_io_vec = bio->bi_inline_vecs;
+
+ return bio;
+}
+
+void __bch_submit_bbio(struct bio *bio, struct cache_set *c)
+{
+ struct bbio *b = container_of(bio, struct bbio, bio);
+
+ bio->bi_iter.bi_sector = PTR_OFFSET(&b->key, 0);
+ bio->bi_bdev = PTR_CACHE(c, &b->key, 0)->bdev;
+
+ b->submit_time_us = local_clock_us();
+ closure_bio_submit(bio, bio->bi_private, PTR_CACHE(c, &b->key, 0));
+}
+
+void bch_submit_bbio(struct bio *bio, struct cache_set *c,
+ struct bkey *k, unsigned ptr)
+{
+ struct bbio *b = container_of(bio, struct bbio, bio);
+ bch_bkey_copy_single_ptr(&b->key, k, ptr);
+ __bch_submit_bbio(bio, c);
+}
+
+/* IO errors */
+
+void bch_count_io_errors(struct cache *ca, int error, const char *m)
+{
+ /*
+ * The halflife of an error is:
+ * log2(1/2)/log2(127/128) * refresh ~= 88 * refresh
+ */
+
+ if (ca->set->error_decay) {
+ unsigned count = atomic_inc_return(&ca->io_count);
+
+ while (count > ca->set->error_decay) {
+ unsigned errors;
+ unsigned old = count;
+ unsigned new = count - ca->set->error_decay;
+
+ /*
+ * First we subtract refresh from count; each time we
+ * succesfully do so, we rescale the errors once:
+ */
+
+ count = atomic_cmpxchg(&ca->io_count, old, new);
+
+ if (count == old) {
+ count = new;
+
+ errors = atomic_read(&ca->io_errors);
+ do {
+ old = errors;
+ new = ((uint64_t) errors * 127) / 128;
+ errors = atomic_cmpxchg(&ca->io_errors,
+ old, new);
+ } while (old != errors);
+ }
+ }
+ }
+
+ if (error) {
+ char buf[BDEVNAME_SIZE];
+ unsigned errors = atomic_add_return(1 << IO_ERROR_SHIFT,
+ &ca->io_errors);
+ errors >>= IO_ERROR_SHIFT;
+
+ if (errors < ca->set->error_limit)
+ pr_err("%s: IO error on %s, recovering",
+ bdevname(ca->bdev, buf), m);
+ else
+ bch_cache_set_error(ca->set,
+ "%s: too many IO errors %s",
+ bdevname(ca->bdev, buf), m);
+ }
+}
+
+void bch_bbio_count_io_errors(struct cache_set *c, struct bio *bio,
+ int error, const char *m)
+{
+ struct bbio *b = container_of(bio, struct bbio, bio);
+ struct cache *ca = PTR_CACHE(c, &b->key, 0);
+
+ unsigned threshold = bio->bi_rw & REQ_WRITE
+ ? c->congested_write_threshold_us
+ : c->congested_read_threshold_us;
+
+ if (threshold) {
+ unsigned t = local_clock_us();
+
+ int us = t - b->submit_time_us;
+ int congested = atomic_read(&c->congested);
+
+ if (us > (int) threshold) {
+ int ms = us / 1024;
+ c->congested_last_us = t;
+
+ ms = min(ms, CONGESTED_MAX + congested);
+ atomic_sub(ms, &c->congested);
+ } else if (congested < 0)
+ atomic_inc(&c->congested);
+ }
+
+ bch_count_io_errors(ca, error, m);
+}
+
+void bch_bbio_endio(struct cache_set *c, struct bio *bio,
+ int error, const char *m)
+{
+ struct closure *cl = bio->bi_private;
+
+ bch_bbio_count_io_errors(c, bio, error, m);
+ bio_put(bio);
+ closure_put(cl);
+}
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
new file mode 100644
index 000000000..fe080ad0e
--- /dev/null
+++ b/drivers/md/bcache/journal.c
@@ -0,0 +1,821 @@
+/*
+ * bcache journalling code, for btree insertions
+ *
+ * Copyright 2012 Google, Inc.
+ */
+
+#include "bcache.h"
+#include "btree.h"
+#include "debug.h"
+#include "extents.h"
+
+#include <trace/events/bcache.h>
+
+/*
+ * Journal replay/recovery:
+ *
+ * This code is all driven from run_cache_set(); we first read the journal
+ * entries, do some other stuff, then we mark all the keys in the journal
+ * entries (same as garbage collection would), then we replay them - reinserting
+ * them into the cache in precisely the same order as they appear in the
+ * journal.
+ *
+ * We only journal keys that go in leaf nodes, which simplifies things quite a
+ * bit.
+ */
+
+static void journal_read_endio(struct bio *bio, int error)
+{
+ struct closure *cl = bio->bi_private;
+ closure_put(cl);
+}
+
+static int journal_read_bucket(struct cache *ca, struct list_head *list,
+ unsigned bucket_index)
+{
+ struct journal_device *ja = &ca->journal;
+ struct bio *bio = &ja->bio;
+
+ struct journal_replay *i;
+ struct jset *j, *data = ca->set->journal.w[0].data;
+ struct closure cl;
+ unsigned len, left, offset = 0;
+ int ret = 0;
+ sector_t bucket = bucket_to_sector(ca->set, ca->sb.d[bucket_index]);
+
+ closure_init_stack(&cl);
+
+ pr_debug("reading %u", bucket_index);
+
+ while (offset < ca->sb.bucket_size) {
+reread: left = ca->sb.bucket_size - offset;
+ len = min_t(unsigned, left, PAGE_SECTORS << JSET_BITS);
+
+ bio_reset(bio);
+ bio->bi_iter.bi_sector = bucket + offset;
+ bio->bi_bdev = ca->bdev;
+ bio->bi_rw = READ;
+ bio->bi_iter.bi_size = len << 9;
+
+ bio->bi_end_io = journal_read_endio;
+ bio->bi_private = &cl;
+ bch_bio_map(bio, data);
+
+ closure_bio_submit(bio, &cl, ca);
+ closure_sync(&cl);
+
+ /* This function could be simpler now since we no longer write
+ * journal entries that overlap bucket boundaries; this means
+ * the start of a bucket will always have a valid journal entry
+ * if it has any journal entries at all.
+ */
+
+ j = data;
+ while (len) {
+ struct list_head *where;
+ size_t blocks, bytes = set_bytes(j);
+
+ if (j->magic != jset_magic(&ca->sb)) {
+ pr_debug("%u: bad magic", bucket_index);
+ return ret;
+ }
+
+ if (bytes > left << 9 ||
+ bytes > PAGE_SIZE << JSET_BITS) {
+ pr_info("%u: too big, %zu bytes, offset %u",
+ bucket_index, bytes, offset);
+ return ret;
+ }
+
+ if (bytes > len << 9)
+ goto reread;
+
+ if (j->csum != csum_set(j)) {
+ pr_info("%u: bad csum, %zu bytes, offset %u",
+ bucket_index, bytes, offset);
+ return ret;
+ }
+
+ blocks = set_blocks(j, block_bytes(ca->set));
+
+ while (!list_empty(list)) {
+ i = list_first_entry(list,
+ struct journal_replay, list);
+ if (i->j.seq >= j->last_seq)
+ break;
+ list_del(&i->list);
+ kfree(i);
+ }
+
+ list_for_each_entry_reverse(i, list, list) {
+ if (j->seq == i->j.seq)
+ goto next_set;
+
+ if (j->seq < i->j.last_seq)
+ goto next_set;
+
+ if (j->seq > i->j.seq) {
+ where = &i->list;
+ goto add;
+ }
+ }
+
+ where = list;
+add:
+ i = kmalloc(offsetof(struct journal_replay, j) +
+ bytes, GFP_KERNEL);
+ if (!i)
+ return -ENOMEM;
+ memcpy(&i->j, j, bytes);
+ list_add(&i->list, where);
+ ret = 1;
+
+ ja->seq[bucket_index] = j->seq;
+next_set:
+ offset += blocks * ca->sb.block_size;
+ len -= blocks * ca->sb.block_size;
+ j = ((void *) j) + blocks * block_bytes(ca);
+ }
+ }
+
+ return ret;
+}
+
+int bch_journal_read(struct cache_set *c, struct list_head *list)
+{
+#define read_bucket(b) \
+ ({ \
+ int ret = journal_read_bucket(ca, list, b); \
+ __set_bit(b, bitmap); \
+ if (ret < 0) \
+ return ret; \
+ ret; \
+ })
+
+ struct cache *ca;
+ unsigned iter;
+
+ for_each_cache(ca, c, iter) {
+ struct journal_device *ja = &ca->journal;
+ unsigned long bitmap[SB_JOURNAL_BUCKETS / BITS_PER_LONG];
+ unsigned i, l, r, m;
+ uint64_t seq;
+
+ bitmap_zero(bitmap, SB_JOURNAL_BUCKETS);
+ pr_debug("%u journal buckets", ca->sb.njournal_buckets);
+
+ /*
+ * Read journal buckets ordered by golden ratio hash to quickly
+ * find a sequence of buckets with valid journal entries
+ */
+ for (i = 0; i < ca->sb.njournal_buckets; i++) {
+ l = (i * 2654435769U) % ca->sb.njournal_buckets;
+
+ if (test_bit(l, bitmap))
+ break;
+
+ if (read_bucket(l))
+ goto bsearch;
+ }
+
+ /*
+ * If that fails, check all the buckets we haven't checked
+ * already
+ */
+ pr_debug("falling back to linear search");
+
+ for (l = find_first_zero_bit(bitmap, ca->sb.njournal_buckets);
+ l < ca->sb.njournal_buckets;
+ l = find_next_zero_bit(bitmap, ca->sb.njournal_buckets, l + 1))
+ if (read_bucket(l))
+ goto bsearch;
+
+ /* no journal entries on this device? */
+ if (l == ca->sb.njournal_buckets)
+ continue;
+bsearch:
+ BUG_ON(list_empty(list));
+
+ /* Binary search */
+ m = l;
+ r = find_next_bit(bitmap, ca->sb.njournal_buckets, l + 1);
+ pr_debug("starting binary search, l %u r %u", l, r);
+
+ while (l + 1 < r) {
+ seq = list_entry(list->prev, struct journal_replay,
+ list)->j.seq;
+
+ m = (l + r) >> 1;
+ read_bucket(m);
+
+ if (seq != list_entry(list->prev, struct journal_replay,
+ list)->j.seq)
+ l = m;
+ else
+ r = m;
+ }
+
+ /*
+ * Read buckets in reverse order until we stop finding more
+ * journal entries
+ */
+ pr_debug("finishing up: m %u njournal_buckets %u",
+ m, ca->sb.njournal_buckets);
+ l = m;
+
+ while (1) {
+ if (!l--)
+ l = ca->sb.njournal_buckets - 1;
+
+ if (l == m)
+ break;
+
+ if (test_bit(l, bitmap))
+ continue;
+
+ if (!read_bucket(l))
+ break;
+ }
+
+ seq = 0;
+
+ for (i = 0; i < ca->sb.njournal_buckets; i++)
+ if (ja->seq[i] > seq) {
+ seq = ja->seq[i];
+ /*
+ * When journal_reclaim() goes to allocate for
+ * the first time, it'll use the bucket after
+ * ja->cur_idx
+ */
+ ja->cur_idx = i;
+ ja->last_idx = ja->discard_idx = (i + 1) %
+ ca->sb.njournal_buckets;
+
+ }
+ }
+
+ if (!list_empty(list))
+ c->journal.seq = list_entry(list->prev,
+ struct journal_replay,
+ list)->j.seq;
+
+ return 0;
+#undef read_bucket
+}
+
+void bch_journal_mark(struct cache_set *c, struct list_head *list)
+{
+ atomic_t p = { 0 };
+ struct bkey *k;
+ struct journal_replay *i;
+ struct journal *j = &c->journal;
+ uint64_t last = j->seq;
+
+ /*
+ * journal.pin should never fill up - we never write a journal
+ * entry when it would fill up. But if for some reason it does, we
+ * iterate over the list in reverse order so that we can just skip that
+ * refcount instead of bugging.
+ */
+
+ list_for_each_entry_reverse(i, list, list) {
+ BUG_ON(last < i->j.seq);
+ i->pin = NULL;
+
+ while (last-- != i->j.seq)
+ if (fifo_free(&j->pin) > 1) {
+ fifo_push_front(&j->pin, p);
+ atomic_set(&fifo_front(&j->pin), 0);
+ }
+
+ if (fifo_free(&j->pin) > 1) {
+ fifo_push_front(&j->pin, p);
+ i->pin = &fifo_front(&j->pin);
+ atomic_set(i->pin, 1);
+ }
+
+ for (k = i->j.start;
+ k < bset_bkey_last(&i->j);
+ k = bkey_next(k))
+ if (!__bch_extent_invalid(c, k)) {
+ unsigned j;
+
+ for (j = 0; j < KEY_PTRS(k); j++)
+ if (ptr_available(c, k, j))
+ atomic_inc(&PTR_BUCKET(c, k, j)->pin);
+
+ bch_initial_mark_key(c, 0, k);
+ }
+ }
+}
+
+int bch_journal_replay(struct cache_set *s, struct list_head *list)
+{
+ int ret = 0, keys = 0, entries = 0;
+ struct bkey *k;
+ struct journal_replay *i =
+ list_entry(list->prev, struct journal_replay, list);
+
+ uint64_t start = i->j.last_seq, end = i->j.seq, n = start;
+ struct keylist keylist;
+
+ list_for_each_entry(i, list, list) {
+ BUG_ON(i->pin && atomic_read(i->pin) != 1);
+
+ cache_set_err_on(n != i->j.seq, s,
+"bcache: journal entries %llu-%llu missing! (replaying %llu-%llu)",
+ n, i->j.seq - 1, start, end);
+
+ for (k = i->j.start;
+ k < bset_bkey_last(&i->j);
+ k = bkey_next(k)) {
+ trace_bcache_journal_replay_key(k);
+
+ bch_keylist_init_single(&keylist, k);
+
+ ret = bch_btree_insert(s, &keylist, i->pin, NULL);
+ if (ret)
+ goto err;
+
+ BUG_ON(!bch_keylist_empty(&keylist));
+ keys++;
+
+ cond_resched();
+ }
+
+ if (i->pin)
+ atomic_dec(i->pin);
+ n = i->j.seq + 1;
+ entries++;
+ }
+
+ pr_info("journal replay done, %i keys in %i entries, seq %llu",
+ keys, entries, end);
+err:
+ while (!list_empty(list)) {
+ i = list_first_entry(list, struct journal_replay, list);
+ list_del(&i->list);
+ kfree(i);
+ }
+
+ return ret;
+}
+
+/* Journalling */
+
+static void btree_flush_write(struct cache_set *c)
+{
+ /*
+ * Try to find the btree node with that references the oldest journal
+ * entry, best is our current candidate and is locked if non NULL:
+ */
+ struct btree *b, *best;
+ unsigned i;
+retry:
+ best = NULL;
+
+ for_each_cached_btree(b, c, i)
+ if (btree_current_write(b)->journal) {
+ if (!best)
+ best = b;
+ else if (journal_pin_cmp(c,
+ btree_current_write(best)->journal,
+ btree_current_write(b)->journal)) {
+ best = b;
+ }
+ }
+
+ b = best;
+ if (b) {
+ mutex_lock(&b->write_lock);
+ if (!btree_current_write(b)->journal) {
+ mutex_unlock(&b->write_lock);
+ /* We raced */
+ goto retry;
+ }
+
+ __bch_btree_node_write(b, NULL);
+ mutex_unlock(&b->write_lock);
+ }
+}
+
+#define last_seq(j) ((j)->seq - fifo_used(&(j)->pin) + 1)
+
+static void journal_discard_endio(struct bio *bio, int error)
+{
+ struct journal_device *ja =
+ container_of(bio, struct journal_device, discard_bio);
+ struct cache *ca = container_of(ja, struct cache, journal);
+
+ atomic_set(&ja->discard_in_flight, DISCARD_DONE);
+
+ closure_wake_up(&ca->set->journal.wait);
+ closure_put(&ca->set->cl);
+}
+
+static void journal_discard_work(struct work_struct *work)
+{
+ struct journal_device *ja =
+ container_of(work, struct journal_device, discard_work);
+
+ submit_bio(0, &ja->discard_bio);
+}
+
+static void do_journal_discard(struct cache *ca)
+{
+ struct journal_device *ja = &ca->journal;
+ struct bio *bio = &ja->discard_bio;
+
+ if (!ca->discard) {
+ ja->discard_idx = ja->last_idx;
+ return;
+ }
+
+ switch (atomic_read(&ja->discard_in_flight)) {
+ case DISCARD_IN_FLIGHT:
+ return;
+
+ case DISCARD_DONE:
+ ja->discard_idx = (ja->discard_idx + 1) %
+ ca->sb.njournal_buckets;
+
+ atomic_set(&ja->discard_in_flight, DISCARD_READY);
+ /* fallthrough */
+
+ case DISCARD_READY:
+ if (ja->discard_idx == ja->last_idx)
+ return;
+
+ atomic_set(&ja->discard_in_flight, DISCARD_IN_FLIGHT);
+
+ bio_init(bio);
+ bio->bi_iter.bi_sector = bucket_to_sector(ca->set,
+ ca->sb.d[ja->discard_idx]);
+ bio->bi_bdev = ca->bdev;
+ bio->bi_rw = REQ_WRITE|REQ_DISCARD;
+ bio->bi_max_vecs = 1;
+ bio->bi_io_vec = bio->bi_inline_vecs;
+ bio->bi_iter.bi_size = bucket_bytes(ca);
+ bio->bi_end_io = journal_discard_endio;
+
+ closure_get(&ca->set->cl);
+ INIT_WORK(&ja->discard_work, journal_discard_work);
+ schedule_work(&ja->discard_work);
+ }
+}
+
+static void journal_reclaim(struct cache_set *c)
+{
+ struct bkey *k = &c->journal.key;
+ struct cache *ca;
+ uint64_t last_seq;
+ unsigned iter, n = 0;
+ atomic_t p;
+
+ while (!atomic_read(&fifo_front(&c->journal.pin)))
+ fifo_pop(&c->journal.pin, p);
+
+ last_seq = last_seq(&c->journal);
+
+ /* Update last_idx */
+
+ for_each_cache(ca, c, iter) {
+ struct journal_device *ja = &ca->journal;
+
+ while (ja->last_idx != ja->cur_idx &&
+ ja->seq[ja->last_idx] < last_seq)
+ ja->last_idx = (ja->last_idx + 1) %
+ ca->sb.njournal_buckets;
+ }
+
+ for_each_cache(ca, c, iter)
+ do_journal_discard(ca);
+
+ if (c->journal.blocks_free)
+ goto out;
+
+ /*
+ * Allocate:
+ * XXX: Sort by free journal space
+ */
+
+ for_each_cache(ca, c, iter) {
+ struct journal_device *ja = &ca->journal;
+ unsigned next = (ja->cur_idx + 1) % ca->sb.njournal_buckets;
+
+ /* No space available on this device */
+ if (next == ja->discard_idx)
+ continue;
+
+ ja->cur_idx = next;
+ k->ptr[n++] = PTR(0,
+ bucket_to_sector(c, ca->sb.d[ja->cur_idx]),
+ ca->sb.nr_this_dev);
+ }
+
+ bkey_init(k);
+ SET_KEY_PTRS(k, n);
+
+ if (n)
+ c->journal.blocks_free = c->sb.bucket_size >> c->block_bits;
+out:
+ if (!journal_full(&c->journal))
+ __closure_wake_up(&c->journal.wait);
+}
+
+void bch_journal_next(struct journal *j)
+{
+ atomic_t p = { 1 };
+
+ j->cur = (j->cur == j->w)
+ ? &j->w[1]
+ : &j->w[0];
+
+ /*
+ * The fifo_push() needs to happen at the same time as j->seq is
+ * incremented for last_seq() to be calculated correctly
+ */
+ BUG_ON(!fifo_push(&j->pin, p));
+ atomic_set(&fifo_back(&j->pin), 1);
+
+ j->cur->data->seq = ++j->seq;
+ j->cur->dirty = false;
+ j->cur->need_write = false;
+ j->cur->data->keys = 0;
+
+ if (fifo_full(&j->pin))
+ pr_debug("journal_pin full (%zu)", fifo_used(&j->pin));
+}
+
+static void journal_write_endio(struct bio *bio, int error)
+{
+ struct journal_write *w = bio->bi_private;
+
+ cache_set_err_on(error, w->c, "journal io error");
+ closure_put(&w->c->journal.io);
+}
+
+static void journal_write(struct closure *);
+
+static void journal_write_done(struct closure *cl)
+{
+ struct journal *j = container_of(cl, struct journal, io);
+ struct journal_write *w = (j->cur == j->w)
+ ? &j->w[1]
+ : &j->w[0];
+
+ __closure_wake_up(&w->wait);
+ continue_at_nobarrier(cl, journal_write, system_wq);
+}
+
+static void journal_write_unlock(struct closure *cl)
+{
+ struct cache_set *c = container_of(cl, struct cache_set, journal.io);
+
+ c->journal.io_in_flight = 0;
+ spin_unlock(&c->journal.lock);
+}
+
+static void journal_write_unlocked(struct closure *cl)
+ __releases(c->journal.lock)
+{
+ struct cache_set *c = container_of(cl, struct cache_set, journal.io);
+ struct cache *ca;
+ struct journal_write *w = c->journal.cur;
+ struct bkey *k = &c->journal.key;
+ unsigned i, sectors = set_blocks(w->data, block_bytes(c)) *
+ c->sb.block_size;
+
+ struct bio *bio;
+ struct bio_list list;
+ bio_list_init(&list);
+
+ if (!w->need_write) {
+ closure_return_with_destructor(cl, journal_write_unlock);
+ } else if (journal_full(&c->journal)) {
+ journal_reclaim(c);
+ spin_unlock(&c->journal.lock);
+
+ btree_flush_write(c);
+ continue_at(cl, journal_write, system_wq);
+ }
+
+ c->journal.blocks_free -= set_blocks(w->data, block_bytes(c));
+
+ w->data->btree_level = c->root->level;
+
+ bkey_copy(&w->data->btree_root, &c->root->key);
+ bkey_copy(&w->data->uuid_bucket, &c->uuid_bucket);
+
+ for_each_cache(ca, c, i)
+ w->data->prio_bucket[ca->sb.nr_this_dev] = ca->prio_buckets[0];
+
+ w->data->magic = jset_magic(&c->sb);
+ w->data->version = BCACHE_JSET_VERSION;
+ w->data->last_seq = last_seq(&c->journal);
+ w->data->csum = csum_set(w->data);
+
+ for (i = 0; i < KEY_PTRS(k); i++) {
+ ca = PTR_CACHE(c, k, i);
+ bio = &ca->journal.bio;
+
+ atomic_long_add(sectors, &ca->meta_sectors_written);
+
+ bio_reset(bio);
+ bio->bi_iter.bi_sector = PTR_OFFSET(k, i);
+ bio->bi_bdev = ca->bdev;
+ bio->bi_rw = REQ_WRITE|REQ_SYNC|REQ_META|REQ_FLUSH|REQ_FUA;
+ bio->bi_iter.bi_size = sectors << 9;
+
+ bio->bi_end_io = journal_write_endio;
+ bio->bi_private = w;
+ bch_bio_map(bio, w->data);
+
+ trace_bcache_journal_write(bio);
+ bio_list_add(&list, bio);
+
+ SET_PTR_OFFSET(k, i, PTR_OFFSET(k, i) + sectors);
+
+ ca->journal.seq[ca->journal.cur_idx] = w->data->seq;
+ }
+
+ atomic_dec_bug(&fifo_back(&c->journal.pin));
+ bch_journal_next(&c->journal);
+ journal_reclaim(c);
+
+ spin_unlock(&c->journal.lock);
+
+ while ((bio = bio_list_pop(&list)))
+ closure_bio_submit(bio, cl, c->cache[0]);
+
+ continue_at(cl, journal_write_done, NULL);
+}
+
+static void journal_write(struct closure *cl)
+{
+ struct cache_set *c = container_of(cl, struct cache_set, journal.io);
+
+ spin_lock(&c->journal.lock);
+ journal_write_unlocked(cl);
+}
+
+static void journal_try_write(struct cache_set *c)
+ __releases(c->journal.lock)
+{
+ struct closure *cl = &c->journal.io;
+ struct journal_write *w = c->journal.cur;
+
+ w->need_write = true;
+
+ if (!c->journal.io_in_flight) {
+ c->journal.io_in_flight = 1;
+ closure_call(cl, journal_write_unlocked, NULL, &c->cl);
+ } else {
+ spin_unlock(&c->journal.lock);
+ }
+}
+
+static struct journal_write *journal_wait_for_write(struct cache_set *c,
+ unsigned nkeys)
+{
+ size_t sectors;
+ struct closure cl;
+ bool wait = false;
+
+ closure_init_stack(&cl);
+
+ spin_lock(&c->journal.lock);
+
+ while (1) {
+ struct journal_write *w = c->journal.cur;
+
+ sectors = __set_blocks(w->data, w->data->keys + nkeys,
+ block_bytes(c)) * c->sb.block_size;
+
+ if (sectors <= min_t(size_t,
+ c->journal.blocks_free * c->sb.block_size,
+ PAGE_SECTORS << JSET_BITS))
+ return w;
+
+ if (wait)
+ closure_wait(&c->journal.wait, &cl);
+
+ if (!journal_full(&c->journal)) {
+ if (wait)
+ trace_bcache_journal_entry_full(c);
+
+ /*
+ * XXX: If we were inserting so many keys that they
+ * won't fit in an _empty_ journal write, we'll
+ * deadlock. For now, handle this in
+ * bch_keylist_realloc() - but something to think about.
+ */
+ BUG_ON(!w->data->keys);
+
+ journal_try_write(c); /* unlocks */
+ } else {
+ if (wait)
+ trace_bcache_journal_full(c);
+
+ journal_reclaim(c);
+ spin_unlock(&c->journal.lock);
+
+ btree_flush_write(c);
+ }
+
+ closure_sync(&cl);
+ spin_lock(&c->journal.lock);
+ wait = true;
+ }
+}
+
+static void journal_write_work(struct work_struct *work)
+{
+ struct cache_set *c = container_of(to_delayed_work(work),
+ struct cache_set,
+ journal.work);
+ spin_lock(&c->journal.lock);
+ if (c->journal.cur->dirty)
+ journal_try_write(c);
+ else
+ spin_unlock(&c->journal.lock);
+}
+
+/*
+ * Entry point to the journalling code - bio_insert() and btree_invalidate()
+ * pass bch_journal() a list of keys to be journalled, and then
+ * bch_journal() hands those same keys off to btree_insert_async()
+ */
+
+atomic_t *bch_journal(struct cache_set *c,
+ struct keylist *keys,
+ struct closure *parent)
+{
+ struct journal_write *w;
+ atomic_t *ret;
+
+ if (!CACHE_SYNC(&c->sb))
+ return NULL;
+
+ w = journal_wait_for_write(c, bch_keylist_nkeys(keys));
+
+ memcpy(bset_bkey_last(w->data), keys->keys, bch_keylist_bytes(keys));
+ w->data->keys += bch_keylist_nkeys(keys);
+
+ ret = &fifo_back(&c->journal.pin);
+ atomic_inc(ret);
+
+ if (parent) {
+ closure_wait(&w->wait, parent);
+ journal_try_write(c);
+ } else if (!w->dirty) {
+ w->dirty = true;
+ schedule_delayed_work(&c->journal.work,
+ msecs_to_jiffies(c->journal_delay_ms));
+ spin_unlock(&c->journal.lock);
+ } else {
+ spin_unlock(&c->journal.lock);
+ }
+
+
+ return ret;
+}
+
+void bch_journal_meta(struct cache_set *c, struct closure *cl)
+{
+ struct keylist keys;
+ atomic_t *ref;
+
+ bch_keylist_init(&keys);
+
+ ref = bch_journal(c, &keys, cl);
+ if (ref)
+ atomic_dec_bug(ref);
+}
+
+void bch_journal_free(struct cache_set *c)
+{
+ free_pages((unsigned long) c->journal.w[1].data, JSET_BITS);
+ free_pages((unsigned long) c->journal.w[0].data, JSET_BITS);
+ free_fifo(&c->journal.pin);
+}
+
+int bch_journal_alloc(struct cache_set *c)
+{
+ struct journal *j = &c->journal;
+
+ spin_lock_init(&j->lock);
+ INIT_DELAYED_WORK(&j->work, journal_write_work);
+
+ c->journal_delay_ms = 100;
+
+ j->w[0].c = c;
+ j->w[1].c = c;
+
+ if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) ||
+ !(j->w[0].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS)) ||
+ !(j->w[1].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS)))
+ return -ENOMEM;
+
+ return 0;
+}
diff --git a/drivers/md/bcache/journal.h b/drivers/md/bcache/journal.h
new file mode 100644
index 000000000..e3c39457a
--- /dev/null
+++ b/drivers/md/bcache/journal.h
@@ -0,0 +1,179 @@
+#ifndef _BCACHE_JOURNAL_H
+#define _BCACHE_JOURNAL_H
+
+/*
+ * THE JOURNAL:
+ *
+ * The journal is treated as a circular buffer of buckets - a journal entry
+ * never spans two buckets. This means (not implemented yet) we can resize the
+ * journal at runtime, and will be needed for bcache on raw flash support.
+ *
+ * Journal entries contain a list of keys, ordered by the time they were
+ * inserted; thus journal replay just has to reinsert the keys.
+ *
+ * We also keep some things in the journal header that are logically part of the
+ * superblock - all the things that are frequently updated. This is for future
+ * bcache on raw flash support; the superblock (which will become another
+ * journal) can't be moved or wear leveled, so it contains just enough
+ * information to find the main journal, and the superblock only has to be
+ * rewritten when we want to move/wear level the main journal.
+ *
+ * Currently, we don't journal BTREE_REPLACE operations - this will hopefully be
+ * fixed eventually. This isn't a bug - BTREE_REPLACE is used for insertions
+ * from cache misses, which don't have to be journaled, and for writeback and
+ * moving gc we work around it by flushing the btree to disk before updating the
+ * gc information. But it is a potential issue with incremental garbage
+ * collection, and it's fragile.
+ *
+ * OPEN JOURNAL ENTRIES:
+ *
+ * Each journal entry contains, in the header, the sequence number of the last
+ * journal entry still open - i.e. that has keys that haven't been flushed to
+ * disk in the btree.
+ *
+ * We track this by maintaining a refcount for every open journal entry, in a
+ * fifo; each entry in the fifo corresponds to a particular journal
+ * entry/sequence number. When the refcount at the tail of the fifo goes to
+ * zero, we pop it off - thus, the size of the fifo tells us the number of open
+ * journal entries
+ *
+ * We take a refcount on a journal entry when we add some keys to a journal
+ * entry that we're going to insert (held by struct btree_op), and then when we
+ * insert those keys into the btree the btree write we're setting up takes a
+ * copy of that refcount (held by struct btree_write). That refcount is dropped
+ * when the btree write completes.
+ *
+ * A struct btree_write can only hold a refcount on a single journal entry, but
+ * might contain keys for many journal entries - we handle this by making sure
+ * it always has a refcount on the _oldest_ journal entry of all the journal
+ * entries it has keys for.
+ *
+ * JOURNAL RECLAIM:
+ *
+ * As mentioned previously, our fifo of refcounts tells us the number of open
+ * journal entries; from that and the current journal sequence number we compute
+ * last_seq - the oldest journal entry we still need. We write last_seq in each
+ * journal entry, and we also have to keep track of where it exists on disk so
+ * we don't overwrite it when we loop around the journal.
+ *
+ * To do that we track, for each journal bucket, the sequence number of the
+ * newest journal entry it contains - if we don't need that journal entry we
+ * don't need anything in that bucket anymore. From that we track the last
+ * journal bucket we still need; all this is tracked in struct journal_device
+ * and updated by journal_reclaim().
+ *
+ * JOURNAL FILLING UP:
+ *
+ * There are two ways the journal could fill up; either we could run out of
+ * space to write to, or we could have too many open journal entries and run out
+ * of room in the fifo of refcounts. Since those refcounts are decremented
+ * without any locking we can't safely resize that fifo, so we handle it the
+ * same way.
+ *
+ * If the journal fills up, we start flushing dirty btree nodes until we can
+ * allocate space for a journal write again - preferentially flushing btree
+ * nodes that are pinning the oldest journal entries first.
+ */
+
+/*
+ * Only used for holding the journal entries we read in btree_journal_read()
+ * during cache_registration
+ */
+struct journal_replay {
+ struct list_head list;
+ atomic_t *pin;
+ struct jset j;
+};
+
+/*
+ * We put two of these in struct journal; we used them for writes to the
+ * journal that are being staged or in flight.
+ */
+struct journal_write {
+ struct jset *data;
+#define JSET_BITS 3
+
+ struct cache_set *c;
+ struct closure_waitlist wait;
+ bool dirty;
+ bool need_write;
+};
+
+/* Embedded in struct cache_set */
+struct journal {
+ spinlock_t lock;
+ /* used when waiting because the journal was full */
+ struct closure_waitlist wait;
+ struct closure io;
+ int io_in_flight;
+ struct delayed_work work;
+
+ /* Number of blocks free in the bucket(s) we're currently writing to */
+ unsigned blocks_free;
+ uint64_t seq;
+ DECLARE_FIFO(atomic_t, pin);
+
+ BKEY_PADDED(key);
+
+ struct journal_write w[2], *cur;
+};
+
+/*
+ * Embedded in struct cache. First three fields refer to the array of journal
+ * buckets, in cache_sb.
+ */
+struct journal_device {
+ /*
+ * For each journal bucket, contains the max sequence number of the
+ * journal writes it contains - so we know when a bucket can be reused.
+ */
+ uint64_t seq[SB_JOURNAL_BUCKETS];
+
+ /* Journal bucket we're currently writing to */
+ unsigned cur_idx;
+
+ /* Last journal bucket that still contains an open journal entry */
+ unsigned last_idx;
+
+ /* Next journal bucket to be discarded */
+ unsigned discard_idx;
+
+#define DISCARD_READY 0
+#define DISCARD_IN_FLIGHT 1
+#define DISCARD_DONE 2
+ /* 1 - discard in flight, -1 - discard completed */
+ atomic_t discard_in_flight;
+
+ struct work_struct discard_work;
+ struct bio discard_bio;
+ struct bio_vec discard_bv;
+
+ /* Bio for journal reads/writes to this device */
+ struct bio bio;
+ struct bio_vec bv[8];
+};
+
+#define journal_pin_cmp(c, l, r) \
+ (fifo_idx(&(c)->journal.pin, (l)) > fifo_idx(&(c)->journal.pin, (r)))
+
+#define JOURNAL_PIN 20000
+
+#define journal_full(j) \
+ (!(j)->blocks_free || fifo_free(&(j)->pin) <= 1)
+
+struct closure;
+struct cache_set;
+struct btree_op;
+struct keylist;
+
+atomic_t *bch_journal(struct cache_set *, struct keylist *, struct closure *);
+void bch_journal_next(struct journal *);
+void bch_journal_mark(struct cache_set *, struct list_head *);
+void bch_journal_meta(struct cache_set *, struct closure *);
+int bch_journal_read(struct cache_set *, struct list_head *);
+int bch_journal_replay(struct cache_set *, struct list_head *);
+
+void bch_journal_free(struct cache_set *);
+int bch_journal_alloc(struct cache_set *);
+
+#endif /* _BCACHE_JOURNAL_H */
diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c
new file mode 100644
index 000000000..cd7490311
--- /dev/null
+++ b/drivers/md/bcache/movinggc.c
@@ -0,0 +1,256 @@
+/*
+ * Moving/copying garbage collector
+ *
+ * Copyright 2012 Google, Inc.
+ */
+
+#include "bcache.h"
+#include "btree.h"
+#include "debug.h"
+#include "request.h"
+
+#include <trace/events/bcache.h>
+
+struct moving_io {
+ struct closure cl;
+ struct keybuf_key *w;
+ struct data_insert_op op;
+ struct bbio bio;
+};
+
+static bool moving_pred(struct keybuf *buf, struct bkey *k)
+{
+ struct cache_set *c = container_of(buf, struct cache_set,
+ moving_gc_keys);
+ unsigned i;
+
+ for (i = 0; i < KEY_PTRS(k); i++)
+ if (ptr_available(c, k, i) &&
+ GC_MOVE(PTR_BUCKET(c, k, i)))
+ return true;
+
+ return false;
+}
+
+/* Moving GC - IO loop */
+
+static void moving_io_destructor(struct closure *cl)
+{
+ struct moving_io *io = container_of(cl, struct moving_io, cl);
+ kfree(io);
+}
+
+static void write_moving_finish(struct closure *cl)
+{
+ struct moving_io *io = container_of(cl, struct moving_io, cl);
+ struct bio *bio = &io->bio.bio;
+ struct bio_vec *bv;
+ int i;
+
+ bio_for_each_segment_all(bv, bio, i)
+ __free_page(bv->bv_page);
+
+ if (io->op.replace_collision)
+ trace_bcache_gc_copy_collision(&io->w->key);
+
+ bch_keybuf_del(&io->op.c->moving_gc_keys, io->w);
+
+ up(&io->op.c->moving_in_flight);
+
+ closure_return_with_destructor(cl, moving_io_destructor);
+}
+
+static void read_moving_endio(struct bio *bio, int error)
+{
+ struct bbio *b = container_of(bio, struct bbio, bio);
+ struct moving_io *io = container_of(bio->bi_private,
+ struct moving_io, cl);
+
+ if (error)
+ io->op.error = error;
+ else if (!KEY_DIRTY(&b->key) &&
+ ptr_stale(io->op.c, &b->key, 0)) {
+ io->op.error = -EINTR;
+ }
+
+ bch_bbio_endio(io->op.c, bio, error, "reading data to move");
+}
+
+static void moving_init(struct moving_io *io)
+{
+ struct bio *bio = &io->bio.bio;
+
+ bio_init(bio);
+ bio_get(bio);
+ bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
+
+ bio->bi_iter.bi_size = KEY_SIZE(&io->w->key) << 9;
+ bio->bi_max_vecs = DIV_ROUND_UP(KEY_SIZE(&io->w->key),
+ PAGE_SECTORS);
+ bio->bi_private = &io->cl;
+ bio->bi_io_vec = bio->bi_inline_vecs;
+ bch_bio_map(bio, NULL);
+}
+
+static void write_moving(struct closure *cl)
+{
+ struct moving_io *io = container_of(cl, struct moving_io, cl);
+ struct data_insert_op *op = &io->op;
+
+ if (!op->error) {
+ moving_init(io);
+
+ io->bio.bio.bi_iter.bi_sector = KEY_START(&io->w->key);
+ op->write_prio = 1;
+ op->bio = &io->bio.bio;
+
+ op->writeback = KEY_DIRTY(&io->w->key);
+ op->csum = KEY_CSUM(&io->w->key);
+
+ bkey_copy(&op->replace_key, &io->w->key);
+ op->replace = true;
+
+ closure_call(&op->cl, bch_data_insert, NULL, cl);
+ }
+
+ continue_at(cl, write_moving_finish, op->wq);
+}
+
+static void read_moving_submit(struct closure *cl)
+{
+ struct moving_io *io = container_of(cl, struct moving_io, cl);
+ struct bio *bio = &io->bio.bio;
+
+ bch_submit_bbio(bio, io->op.c, &io->w->key, 0);
+
+ continue_at(cl, write_moving, io->op.wq);
+}
+
+static void read_moving(struct cache_set *c)
+{
+ struct keybuf_key *w;
+ struct moving_io *io;
+ struct bio *bio;
+ struct closure cl;
+
+ closure_init_stack(&cl);
+
+ /* XXX: if we error, background writeback could stall indefinitely */
+
+ while (!test_bit(CACHE_SET_STOPPING, &c->flags)) {
+ w = bch_keybuf_next_rescan(c, &c->moving_gc_keys,
+ &MAX_KEY, moving_pred);
+ if (!w)
+ break;
+
+ if (ptr_stale(c, &w->key, 0)) {
+ bch_keybuf_del(&c->moving_gc_keys, w);
+ continue;
+ }
+
+ io = kzalloc(sizeof(struct moving_io) + sizeof(struct bio_vec)
+ * DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS),
+ GFP_KERNEL);
+ if (!io)
+ goto err;
+
+ w->private = io;
+ io->w = w;
+ io->op.inode = KEY_INODE(&w->key);
+ io->op.c = c;
+ io->op.wq = c->moving_gc_wq;
+
+ moving_init(io);
+ bio = &io->bio.bio;
+
+ bio->bi_rw = READ;
+ bio->bi_end_io = read_moving_endio;
+
+ if (bio_alloc_pages(bio, GFP_KERNEL))
+ goto err;
+
+ trace_bcache_gc_copy(&w->key);
+
+ down(&c->moving_in_flight);
+ closure_call(&io->cl, read_moving_submit, NULL, &cl);
+ }
+
+ if (0) {
+err: if (!IS_ERR_OR_NULL(w->private))
+ kfree(w->private);
+
+ bch_keybuf_del(&c->moving_gc_keys, w);
+ }
+
+ closure_sync(&cl);
+}
+
+static bool bucket_cmp(struct bucket *l, struct bucket *r)
+{
+ return GC_SECTORS_USED(l) < GC_SECTORS_USED(r);
+}
+
+static unsigned bucket_heap_top(struct cache *ca)
+{
+ struct bucket *b;
+ return (b = heap_peek(&ca->heap)) ? GC_SECTORS_USED(b) : 0;
+}
+
+void bch_moving_gc(struct cache_set *c)
+{
+ struct cache *ca;
+ struct bucket *b;
+ unsigned i;
+
+ if (!c->copy_gc_enabled)
+ return;
+
+ mutex_lock(&c->bucket_lock);
+
+ for_each_cache(ca, c, i) {
+ unsigned sectors_to_move = 0;
+ unsigned reserve_sectors = ca->sb.bucket_size *
+ fifo_used(&ca->free[RESERVE_MOVINGGC]);
+
+ ca->heap.used = 0;
+
+ for_each_bucket(b, ca) {
+ if (GC_MARK(b) == GC_MARK_METADATA ||
+ !GC_SECTORS_USED(b) ||
+ GC_SECTORS_USED(b) == ca->sb.bucket_size ||
+ atomic_read(&b->pin))
+ continue;
+
+ if (!heap_full(&ca->heap)) {
+ sectors_to_move += GC_SECTORS_USED(b);
+ heap_add(&ca->heap, b, bucket_cmp);
+ } else if (bucket_cmp(b, heap_peek(&ca->heap))) {
+ sectors_to_move -= bucket_heap_top(ca);
+ sectors_to_move += GC_SECTORS_USED(b);
+
+ ca->heap.data[0] = b;
+ heap_sift(&ca->heap, 0, bucket_cmp);
+ }
+ }
+
+ while (sectors_to_move > reserve_sectors) {
+ heap_pop(&ca->heap, b, bucket_cmp);
+ sectors_to_move -= GC_SECTORS_USED(b);
+ }
+
+ while (heap_pop(&ca->heap, b, bucket_cmp))
+ SET_GC_MOVE(b, 1);
+ }
+
+ mutex_unlock(&c->bucket_lock);
+
+ c->moving_gc_keys.last_scanned = ZERO_KEY;
+
+ read_moving(c);
+}
+
+void bch_moving_init_cache_set(struct cache_set *c)
+{
+ bch_keybuf_init(&c->moving_gc_keys);
+ sema_init(&c->moving_in_flight, 64);
+}
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
new file mode 100644
index 000000000..ab43faddb
--- /dev/null
+++ b/drivers/md/bcache/request.c
@@ -0,0 +1,1149 @@
+/*
+ * Main bcache entry point - handle a read or a write request and decide what to
+ * do with it; the make_request functions are called by the block layer.
+ *
+ * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
+ * Copyright 2012 Google, Inc.
+ */
+
+#include "bcache.h"
+#include "btree.h"
+#include "debug.h"
+#include "request.h"
+#include "writeback.h"
+
+#include <linux/module.h>
+#include <linux/hash.h>
+#include <linux/random.h>
+
+#include <trace/events/bcache.h>
+
+#define CUTOFF_CACHE_ADD 95
+#define CUTOFF_CACHE_READA 90
+
+struct kmem_cache *bch_search_cache;
+
+static void bch_data_insert_start(struct closure *);
+
+static unsigned cache_mode(struct cached_dev *dc, struct bio *bio)
+{
+ return BDEV_CACHE_MODE(&dc->sb);
+}
+
+static bool verify(struct cached_dev *dc, struct bio *bio)
+{
+ return dc->verify;
+}
+
+static void bio_csum(struct bio *bio, struct bkey *k)
+{
+ struct bio_vec bv;
+ struct bvec_iter iter;
+ uint64_t csum = 0;
+
+ bio_for_each_segment(bv, bio, iter) {
+ void *d = kmap(bv.bv_page) + bv.bv_offset;
+ csum = bch_crc64_update(csum, d, bv.bv_len);
+ kunmap(bv.bv_page);
+ }
+
+ k->ptr[KEY_PTRS(k)] = csum & (~0ULL >> 1);
+}
+
+/* Insert data into cache */
+
+static void bch_data_insert_keys(struct closure *cl)
+{
+ struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
+ atomic_t *journal_ref = NULL;
+ struct bkey *replace_key = op->replace ? &op->replace_key : NULL;
+ int ret;
+
+ /*
+ * If we're looping, might already be waiting on
+ * another journal write - can't wait on more than one journal write at
+ * a time
+ *
+ * XXX: this looks wrong
+ */
+#if 0
+ while (atomic_read(&s->cl.remaining) & CLOSURE_WAITING)
+ closure_sync(&s->cl);
+#endif
+
+ if (!op->replace)
+ journal_ref = bch_journal(op->c, &op->insert_keys,
+ op->flush_journal ? cl : NULL);
+
+ ret = bch_btree_insert(op->c, &op->insert_keys,
+ journal_ref, replace_key);
+ if (ret == -ESRCH) {
+ op->replace_collision = true;
+ } else if (ret) {
+ op->error = -ENOMEM;
+ op->insert_data_done = true;
+ }
+
+ if (journal_ref)
+ atomic_dec_bug(journal_ref);
+
+ if (!op->insert_data_done)
+ continue_at(cl, bch_data_insert_start, op->wq);
+
+ bch_keylist_free(&op->insert_keys);
+ closure_return(cl);
+}
+
+static int bch_keylist_realloc(struct keylist *l, unsigned u64s,
+ struct cache_set *c)
+{
+ size_t oldsize = bch_keylist_nkeys(l);
+ size_t newsize = oldsize + u64s;
+
+ /*
+ * The journalling code doesn't handle the case where the keys to insert
+ * is bigger than an empty write: If we just return -ENOMEM here,
+ * bio_insert() and bio_invalidate() will insert the keys created so far
+ * and finish the rest when the keylist is empty.
+ */
+ if (newsize * sizeof(uint64_t) > block_bytes(c) - sizeof(struct jset))
+ return -ENOMEM;
+
+ return __bch_keylist_realloc(l, u64s);
+}
+
+static void bch_data_invalidate(struct closure *cl)
+{
+ struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
+ struct bio *bio = op->bio;
+
+ pr_debug("invalidating %i sectors from %llu",
+ bio_sectors(bio), (uint64_t) bio->bi_iter.bi_sector);
+
+ while (bio_sectors(bio)) {
+ unsigned sectors = min(bio_sectors(bio),
+ 1U << (KEY_SIZE_BITS - 1));
+
+ if (bch_keylist_realloc(&op->insert_keys, 2, op->c))
+ goto out;
+
+ bio->bi_iter.bi_sector += sectors;
+ bio->bi_iter.bi_size -= sectors << 9;
+
+ bch_keylist_add(&op->insert_keys,
+ &KEY(op->inode, bio->bi_iter.bi_sector, sectors));
+ }
+
+ op->insert_data_done = true;
+ bio_put(bio);
+out:
+ continue_at(cl, bch_data_insert_keys, op->wq);
+}
+
+static void bch_data_insert_error(struct closure *cl)
+{
+ struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
+
+ /*
+ * Our data write just errored, which means we've got a bunch of keys to
+ * insert that point to data that wasn't succesfully written.
+ *
+ * We don't have to insert those keys but we still have to invalidate
+ * that region of the cache - so, if we just strip off all the pointers
+ * from the keys we'll accomplish just that.
+ */
+
+ struct bkey *src = op->insert_keys.keys, *dst = op->insert_keys.keys;
+
+ while (src != op->insert_keys.top) {
+ struct bkey *n = bkey_next(src);
+
+ SET_KEY_PTRS(src, 0);
+ memmove(dst, src, bkey_bytes(src));
+
+ dst = bkey_next(dst);
+ src = n;
+ }
+
+ op->insert_keys.top = dst;
+
+ bch_data_insert_keys(cl);
+}
+
+static void bch_data_insert_endio(struct bio *bio, int error)
+{
+ struct closure *cl = bio->bi_private;
+ struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
+
+ if (error) {
+ /* TODO: We could try to recover from this. */
+ if (op->writeback)
+ op->error = error;
+ else if (!op->replace)
+ set_closure_fn(cl, bch_data_insert_error, op->wq);
+ else
+ set_closure_fn(cl, NULL, NULL);
+ }
+
+ bch_bbio_endio(op->c, bio, error, "writing data to cache");
+}
+
+static void bch_data_insert_start(struct closure *cl)
+{
+ struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
+ struct bio *bio = op->bio, *n;
+
+ if (atomic_sub_return(bio_sectors(bio), &op->c->sectors_to_gc) < 0) {
+ set_gc_sectors(op->c);
+ wake_up_gc(op->c);
+ }
+
+ if (op->bypass)
+ return bch_data_invalidate(cl);
+
+ /*
+ * Journal writes are marked REQ_FLUSH; if the original write was a
+ * flush, it'll wait on the journal write.
+ */
+ bio->bi_rw &= ~(REQ_FLUSH|REQ_FUA);
+
+ do {
+ unsigned i;
+ struct bkey *k;
+ struct bio_set *split = op->c->bio_split;
+
+ /* 1 for the device pointer and 1 for the chksum */
+ if (bch_keylist_realloc(&op->insert_keys,
+ 3 + (op->csum ? 1 : 0),
+ op->c))
+ continue_at(cl, bch_data_insert_keys, op->wq);
+
+ k = op->insert_keys.top;
+ bkey_init(k);
+ SET_KEY_INODE(k, op->inode);
+ SET_KEY_OFFSET(k, bio->bi_iter.bi_sector);
+
+ if (!bch_alloc_sectors(op->c, k, bio_sectors(bio),
+ op->write_point, op->write_prio,
+ op->writeback))
+ goto err;
+
+ n = bio_next_split(bio, KEY_SIZE(k), GFP_NOIO, split);
+
+ n->bi_end_io = bch_data_insert_endio;
+ n->bi_private = cl;
+
+ if (op->writeback) {
+ SET_KEY_DIRTY(k, true);
+
+ for (i = 0; i < KEY_PTRS(k); i++)
+ SET_GC_MARK(PTR_BUCKET(op->c, k, i),
+ GC_MARK_DIRTY);
+ }
+
+ SET_KEY_CSUM(k, op->csum);
+ if (KEY_CSUM(k))
+ bio_csum(n, k);
+
+ trace_bcache_cache_insert(k);
+ bch_keylist_push(&op->insert_keys);
+
+ n->bi_rw |= REQ_WRITE;
+ bch_submit_bbio(n, op->c, k, 0);
+ } while (n != bio);
+
+ op->insert_data_done = true;
+ continue_at(cl, bch_data_insert_keys, op->wq);
+err:
+ /* bch_alloc_sectors() blocks if s->writeback = true */
+ BUG_ON(op->writeback);
+
+ /*
+ * But if it's not a writeback write we'd rather just bail out if
+ * there aren't any buckets ready to write to - it might take awhile and
+ * we might be starving btree writes for gc or something.
+ */
+
+ if (!op->replace) {
+ /*
+ * Writethrough write: We can't complete the write until we've
+ * updated the index. But we don't want to delay the write while
+ * we wait for buckets to be freed up, so just invalidate the
+ * rest of the write.
+ */
+ op->bypass = true;
+ return bch_data_invalidate(cl);
+ } else {
+ /*
+ * From a cache miss, we can just insert the keys for the data
+ * we have written or bail out if we didn't do anything.
+ */
+ op->insert_data_done = true;
+ bio_put(bio);
+
+ if (!bch_keylist_empty(&op->insert_keys))
+ continue_at(cl, bch_data_insert_keys, op->wq);
+ else
+ closure_return(cl);
+ }
+}
+
+/**
+ * bch_data_insert - stick some data in the cache
+ *
+ * This is the starting point for any data to end up in a cache device; it could
+ * be from a normal write, or a writeback write, or a write to a flash only
+ * volume - it's also used by the moving garbage collector to compact data in
+ * mostly empty buckets.
+ *
+ * It first writes the data to the cache, creating a list of keys to be inserted
+ * (if the data had to be fragmented there will be multiple keys); after the
+ * data is written it calls bch_journal, and after the keys have been added to
+ * the next journal write they're inserted into the btree.
+ *
+ * It inserts the data in s->cache_bio; bi_sector is used for the key offset,
+ * and op->inode is used for the key inode.
+ *
+ * If s->bypass is true, instead of inserting the data it invalidates the
+ * region of the cache represented by s->cache_bio and op->inode.
+ */
+void bch_data_insert(struct closure *cl)
+{
+ struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
+
+ trace_bcache_write(op->c, op->inode, op->bio,
+ op->writeback, op->bypass);
+
+ bch_keylist_init(&op->insert_keys);
+ bio_get(op->bio);
+ bch_data_insert_start(cl);
+}
+
+/* Congested? */
+
+unsigned bch_get_congested(struct cache_set *c)
+{
+ int i;
+ long rand;
+
+ if (!c->congested_read_threshold_us &&
+ !c->congested_write_threshold_us)
+ return 0;
+
+ i = (local_clock_us() - c->congested_last_us) / 1024;
+ if (i < 0)
+ return 0;
+
+ i += atomic_read(&c->congested);
+ if (i >= 0)
+ return 0;
+
+ i += CONGESTED_MAX;
+
+ if (i > 0)
+ i = fract_exp_two(i, 6);
+
+ rand = get_random_int();
+ i -= bitmap_weight(&rand, BITS_PER_LONG);
+
+ return i > 0 ? i : 1;
+}
+
+static void add_sequential(struct task_struct *t)
+{
+ ewma_add(t->sequential_io_avg,
+ t->sequential_io, 8, 0);
+
+ t->sequential_io = 0;
+}
+
+static struct hlist_head *iohash(struct cached_dev *dc, uint64_t k)
+{
+ return &dc->io_hash[hash_64(k, RECENT_IO_BITS)];
+}
+
+static bool check_should_bypass(struct cached_dev *dc, struct bio *bio)
+{
+ struct cache_set *c = dc->disk.c;
+ unsigned mode = cache_mode(dc, bio);
+ unsigned sectors, congested = bch_get_congested(c);
+ struct task_struct *task = current;
+ struct io *i;
+
+ if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) ||
+ c->gc_stats.in_use > CUTOFF_CACHE_ADD ||
+ (bio->bi_rw & REQ_DISCARD))
+ goto skip;
+
+ if (mode == CACHE_MODE_NONE ||
+ (mode == CACHE_MODE_WRITEAROUND &&
+ (bio->bi_rw & REQ_WRITE)))
+ goto skip;
+
+ if (bio->bi_iter.bi_sector & (c->sb.block_size - 1) ||
+ bio_sectors(bio) & (c->sb.block_size - 1)) {
+ pr_debug("skipping unaligned io");
+ goto skip;
+ }
+
+ if (bypass_torture_test(dc)) {
+ if ((get_random_int() & 3) == 3)
+ goto skip;
+ else
+ goto rescale;
+ }
+
+ if (!congested && !dc->sequential_cutoff)
+ goto rescale;
+
+ if (!congested &&
+ mode == CACHE_MODE_WRITEBACK &&
+ (bio->bi_rw & REQ_WRITE) &&
+ (bio->bi_rw & REQ_SYNC))
+ goto rescale;
+
+ spin_lock(&dc->io_lock);
+
+ hlist_for_each_entry(i, iohash(dc, bio->bi_iter.bi_sector), hash)
+ if (i->last == bio->bi_iter.bi_sector &&
+ time_before(jiffies, i->jiffies))
+ goto found;
+
+ i = list_first_entry(&dc->io_lru, struct io, lru);
+
+ add_sequential(task);
+ i->sequential = 0;
+found:
+ if (i->sequential + bio->bi_iter.bi_size > i->sequential)
+ i->sequential += bio->bi_iter.bi_size;
+
+ i->last = bio_end_sector(bio);
+ i->jiffies = jiffies + msecs_to_jiffies(5000);
+ task->sequential_io = i->sequential;
+
+ hlist_del(&i->hash);
+ hlist_add_head(&i->hash, iohash(dc, i->last));
+ list_move_tail(&i->lru, &dc->io_lru);
+
+ spin_unlock(&dc->io_lock);
+
+ sectors = max(task->sequential_io,
+ task->sequential_io_avg) >> 9;
+
+ if (dc->sequential_cutoff &&
+ sectors >= dc->sequential_cutoff >> 9) {
+ trace_bcache_bypass_sequential(bio);
+ goto skip;
+ }
+
+ if (congested && sectors >= congested) {
+ trace_bcache_bypass_congested(bio);
+ goto skip;
+ }
+
+rescale:
+ bch_rescale_priorities(c, bio_sectors(bio));
+ return false;
+skip:
+ bch_mark_sectors_bypassed(c, dc, bio_sectors(bio));
+ return true;
+}
+
+/* Cache lookup */
+
+struct search {
+ /* Stack frame for bio_complete */
+ struct closure cl;
+
+ struct bbio bio;
+ struct bio *orig_bio;
+ struct bio *cache_miss;
+ struct bcache_device *d;
+
+ unsigned insert_bio_sectors;
+ unsigned recoverable:1;
+ unsigned write:1;
+ unsigned read_dirty_data:1;
+
+ unsigned long start_time;
+
+ struct btree_op op;
+ struct data_insert_op iop;
+};
+
+static void bch_cache_read_endio(struct bio *bio, int error)
+{
+ struct bbio *b = container_of(bio, struct bbio, bio);
+ struct closure *cl = bio->bi_private;
+ struct search *s = container_of(cl, struct search, cl);
+
+ /*
+ * If the bucket was reused while our bio was in flight, we might have
+ * read the wrong data. Set s->error but not error so it doesn't get
+ * counted against the cache device, but we'll still reread the data
+ * from the backing device.
+ */
+
+ if (error)
+ s->iop.error = error;
+ else if (!KEY_DIRTY(&b->key) &&
+ ptr_stale(s->iop.c, &b->key, 0)) {
+ atomic_long_inc(&s->iop.c->cache_read_races);
+ s->iop.error = -EINTR;
+ }
+
+ bch_bbio_endio(s->iop.c, bio, error, "reading from cache");
+}
+
+/*
+ * Read from a single key, handling the initial cache miss if the key starts in
+ * the middle of the bio
+ */
+static int cache_lookup_fn(struct btree_op *op, struct btree *b, struct bkey *k)
+{
+ struct search *s = container_of(op, struct search, op);
+ struct bio *n, *bio = &s->bio.bio;
+ struct bkey *bio_key;
+ unsigned ptr;
+
+ if (bkey_cmp(k, &KEY(s->iop.inode, bio->bi_iter.bi_sector, 0)) <= 0)
+ return MAP_CONTINUE;
+
+ if (KEY_INODE(k) != s->iop.inode ||
+ KEY_START(k) > bio->bi_iter.bi_sector) {
+ unsigned bio_sectors = bio_sectors(bio);
+ unsigned sectors = KEY_INODE(k) == s->iop.inode
+ ? min_t(uint64_t, INT_MAX,
+ KEY_START(k) - bio->bi_iter.bi_sector)
+ : INT_MAX;
+
+ int ret = s->d->cache_miss(b, s, bio, sectors);
+ if (ret != MAP_CONTINUE)
+ return ret;
+
+ /* if this was a complete miss we shouldn't get here */
+ BUG_ON(bio_sectors <= sectors);
+ }
+
+ if (!KEY_SIZE(k))
+ return MAP_CONTINUE;
+
+ /* XXX: figure out best pointer - for multiple cache devices */
+ ptr = 0;
+
+ PTR_BUCKET(b->c, k, ptr)->prio = INITIAL_PRIO;
+
+ if (KEY_DIRTY(k))
+ s->read_dirty_data = true;
+
+ n = bio_next_split(bio, min_t(uint64_t, INT_MAX,
+ KEY_OFFSET(k) - bio->bi_iter.bi_sector),
+ GFP_NOIO, s->d->bio_split);
+
+ bio_key = &container_of(n, struct bbio, bio)->key;
+ bch_bkey_copy_single_ptr(bio_key, k, ptr);
+
+ bch_cut_front(&KEY(s->iop.inode, n->bi_iter.bi_sector, 0), bio_key);
+ bch_cut_back(&KEY(s->iop.inode, bio_end_sector(n), 0), bio_key);
+
+ n->bi_end_io = bch_cache_read_endio;
+ n->bi_private = &s->cl;
+
+ /*
+ * The bucket we're reading from might be reused while our bio
+ * is in flight, and we could then end up reading the wrong
+ * data.
+ *
+ * We guard against this by checking (in cache_read_endio()) if
+ * the pointer is stale again; if so, we treat it as an error
+ * and reread from the backing device (but we don't pass that
+ * error up anywhere).
+ */
+
+ __bch_submit_bbio(n, b->c);
+ return n == bio ? MAP_DONE : MAP_CONTINUE;
+}
+
+static void cache_lookup(struct closure *cl)
+{
+ struct search *s = container_of(cl, struct search, iop.cl);
+ struct bio *bio = &s->bio.bio;
+ int ret;
+
+ bch_btree_op_init(&s->op, -1);
+
+ ret = bch_btree_map_keys(&s->op, s->iop.c,
+ &KEY(s->iop.inode, bio->bi_iter.bi_sector, 0),
+ cache_lookup_fn, MAP_END_KEY);
+ if (ret == -EAGAIN)
+ continue_at(cl, cache_lookup, bcache_wq);
+
+ closure_return(cl);
+}
+
+/* Common code for the make_request functions */
+
+static void request_endio(struct bio *bio, int error)
+{
+ struct closure *cl = bio->bi_private;
+
+ if (error) {
+ struct search *s = container_of(cl, struct search, cl);
+ s->iop.error = error;
+ /* Only cache read errors are recoverable */
+ s->recoverable = false;
+ }
+
+ bio_put(bio);
+ closure_put(cl);
+}
+
+static void bio_complete(struct search *s)
+{
+ if (s->orig_bio) {
+ generic_end_io_acct(bio_data_dir(s->orig_bio),
+ &s->d->disk->part0, s->start_time);
+
+ trace_bcache_request_end(s->d, s->orig_bio);
+ bio_endio(s->orig_bio, s->iop.error);
+ s->orig_bio = NULL;
+ }
+}
+
+static void do_bio_hook(struct search *s, struct bio *orig_bio)
+{
+ struct bio *bio = &s->bio.bio;
+
+ bio_init(bio);
+ __bio_clone_fast(bio, orig_bio);
+ bio->bi_end_io = request_endio;
+ bio->bi_private = &s->cl;
+
+ atomic_set(&bio->bi_cnt, 3);
+}
+
+static void search_free(struct closure *cl)
+{
+ struct search *s = container_of(cl, struct search, cl);
+ bio_complete(s);
+
+ if (s->iop.bio)
+ bio_put(s->iop.bio);
+
+ closure_debug_destroy(cl);
+ mempool_free(s, s->d->c->search);
+}
+
+static inline struct search *search_alloc(struct bio *bio,
+ struct bcache_device *d)
+{
+ struct search *s;
+
+ s = mempool_alloc(d->c->search, GFP_NOIO);
+
+ closure_init(&s->cl, NULL);
+ do_bio_hook(s, bio);
+
+ s->orig_bio = bio;
+ s->cache_miss = NULL;
+ s->d = d;
+ s->recoverable = 1;
+ s->write = (bio->bi_rw & REQ_WRITE) != 0;
+ s->read_dirty_data = 0;
+ s->start_time = jiffies;
+
+ s->iop.c = d->c;
+ s->iop.bio = NULL;
+ s->iop.inode = d->id;
+ s->iop.write_point = hash_long((unsigned long) current, 16);
+ s->iop.write_prio = 0;
+ s->iop.error = 0;
+ s->iop.flags = 0;
+ s->iop.flush_journal = (bio->bi_rw & (REQ_FLUSH|REQ_FUA)) != 0;
+ s->iop.wq = bcache_wq;
+
+ return s;
+}
+
+/* Cached devices */
+
+static void cached_dev_bio_complete(struct closure *cl)
+{
+ struct search *s = container_of(cl, struct search, cl);
+ struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
+
+ search_free(cl);
+ cached_dev_put(dc);
+}
+
+/* Process reads */
+
+static void cached_dev_cache_miss_done(struct closure *cl)
+{
+ struct search *s = container_of(cl, struct search, cl);
+
+ if (s->iop.replace_collision)
+ bch_mark_cache_miss_collision(s->iop.c, s->d);
+
+ if (s->iop.bio) {
+ int i;
+ struct bio_vec *bv;
+
+ bio_for_each_segment_all(bv, s->iop.bio, i)
+ __free_page(bv->bv_page);
+ }
+
+ cached_dev_bio_complete(cl);
+}
+
+static void cached_dev_read_error(struct closure *cl)
+{
+ struct search *s = container_of(cl, struct search, cl);
+ struct bio *bio = &s->bio.bio;
+
+ if (s->recoverable) {
+ /* Retry from the backing device: */
+ trace_bcache_read_retry(s->orig_bio);
+
+ s->iop.error = 0;
+ do_bio_hook(s, s->orig_bio);
+
+ /* XXX: invalidate cache */
+
+ closure_bio_submit(bio, cl, s->d);
+ }
+
+ continue_at(cl, cached_dev_cache_miss_done, NULL);
+}
+
+static void cached_dev_read_done(struct closure *cl)
+{
+ struct search *s = container_of(cl, struct search, cl);
+ struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
+
+ /*
+ * We had a cache miss; cache_bio now contains data ready to be inserted
+ * into the cache.
+ *
+ * First, we copy the data we just read from cache_bio's bounce buffers
+ * to the buffers the original bio pointed to:
+ */
+
+ if (s->iop.bio) {
+ bio_reset(s->iop.bio);
+ s->iop.bio->bi_iter.bi_sector = s->cache_miss->bi_iter.bi_sector;
+ s->iop.bio->bi_bdev = s->cache_miss->bi_bdev;
+ s->iop.bio->bi_iter.bi_size = s->insert_bio_sectors << 9;
+ bch_bio_map(s->iop.bio, NULL);
+
+ bio_copy_data(s->cache_miss, s->iop.bio);
+
+ bio_put(s->cache_miss);
+ s->cache_miss = NULL;
+ }
+
+ if (verify(dc, &s->bio.bio) && s->recoverable && !s->read_dirty_data)
+ bch_data_verify(dc, s->orig_bio);
+
+ bio_complete(s);
+
+ if (s->iop.bio &&
+ !test_bit(CACHE_SET_STOPPING, &s->iop.c->flags)) {
+ BUG_ON(!s->iop.replace);
+ closure_call(&s->iop.cl, bch_data_insert, NULL, cl);
+ }
+
+ continue_at(cl, cached_dev_cache_miss_done, NULL);
+}
+
+static void cached_dev_read_done_bh(struct closure *cl)
+{
+ struct search *s = container_of(cl, struct search, cl);
+ struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
+
+ bch_mark_cache_accounting(s->iop.c, s->d,
+ !s->cache_miss, s->iop.bypass);
+ trace_bcache_read(s->orig_bio, !s->cache_miss, s->iop.bypass);
+
+ if (s->iop.error)
+ continue_at_nobarrier(cl, cached_dev_read_error, bcache_wq);
+ else if (s->iop.bio || verify(dc, &s->bio.bio))
+ continue_at_nobarrier(cl, cached_dev_read_done, bcache_wq);
+ else
+ continue_at_nobarrier(cl, cached_dev_bio_complete, NULL);
+}
+
+static int cached_dev_cache_miss(struct btree *b, struct search *s,
+ struct bio *bio, unsigned sectors)
+{
+ int ret = MAP_CONTINUE;
+ unsigned reada = 0;
+ struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
+ struct bio *miss, *cache_bio;
+
+ if (s->cache_miss || s->iop.bypass) {
+ miss = bio_next_split(bio, sectors, GFP_NOIO, s->d->bio_split);
+ ret = miss == bio ? MAP_DONE : MAP_CONTINUE;
+ goto out_submit;
+ }
+
+ if (!(bio->bi_rw & REQ_RAHEAD) &&
+ !(bio->bi_rw & REQ_META) &&
+ s->iop.c->gc_stats.in_use < CUTOFF_CACHE_READA)
+ reada = min_t(sector_t, dc->readahead >> 9,
+ bdev_sectors(bio->bi_bdev) - bio_end_sector(bio));
+
+ s->insert_bio_sectors = min(sectors, bio_sectors(bio) + reada);
+
+ s->iop.replace_key = KEY(s->iop.inode,
+ bio->bi_iter.bi_sector + s->insert_bio_sectors,
+ s->insert_bio_sectors);
+
+ ret = bch_btree_insert_check_key(b, &s->op, &s->iop.replace_key);
+ if (ret)
+ return ret;
+
+ s->iop.replace = true;
+
+ miss = bio_next_split(bio, sectors, GFP_NOIO, s->d->bio_split);
+
+ /* btree_search_recurse()'s btree iterator is no good anymore */
+ ret = miss == bio ? MAP_DONE : -EINTR;
+
+ cache_bio = bio_alloc_bioset(GFP_NOWAIT,
+ DIV_ROUND_UP(s->insert_bio_sectors, PAGE_SECTORS),
+ dc->disk.bio_split);
+ if (!cache_bio)
+ goto out_submit;
+
+ cache_bio->bi_iter.bi_sector = miss->bi_iter.bi_sector;
+ cache_bio->bi_bdev = miss->bi_bdev;
+ cache_bio->bi_iter.bi_size = s->insert_bio_sectors << 9;
+
+ cache_bio->bi_end_io = request_endio;
+ cache_bio->bi_private = &s->cl;
+
+ bch_bio_map(cache_bio, NULL);
+ if (bio_alloc_pages(cache_bio, __GFP_NOWARN|GFP_NOIO))
+ goto out_put;
+
+ if (reada)
+ bch_mark_cache_readahead(s->iop.c, s->d);
+
+ s->cache_miss = miss;
+ s->iop.bio = cache_bio;
+ bio_get(cache_bio);
+ closure_bio_submit(cache_bio, &s->cl, s->d);
+
+ return ret;
+out_put:
+ bio_put(cache_bio);
+out_submit:
+ miss->bi_end_io = request_endio;
+ miss->bi_private = &s->cl;
+ closure_bio_submit(miss, &s->cl, s->d);
+ return ret;
+}
+
+static void cached_dev_read(struct cached_dev *dc, struct search *s)
+{
+ struct closure *cl = &s->cl;
+
+ closure_call(&s->iop.cl, cache_lookup, NULL, cl);
+ continue_at(cl, cached_dev_read_done_bh, NULL);
+}
+
+/* Process writes */
+
+static void cached_dev_write_complete(struct closure *cl)
+{
+ struct search *s = container_of(cl, struct search, cl);
+ struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
+
+ up_read_non_owner(&dc->writeback_lock);
+ cached_dev_bio_complete(cl);
+}
+
+static void cached_dev_write(struct cached_dev *dc, struct search *s)
+{
+ struct closure *cl = &s->cl;
+ struct bio *bio = &s->bio.bio;
+ struct bkey start = KEY(dc->disk.id, bio->bi_iter.bi_sector, 0);
+ struct bkey end = KEY(dc->disk.id, bio_end_sector(bio), 0);
+
+ bch_keybuf_check_overlapping(&s->iop.c->moving_gc_keys, &start, &end);
+
+ down_read_non_owner(&dc->writeback_lock);
+ if (bch_keybuf_check_overlapping(&dc->writeback_keys, &start, &end)) {
+ /*
+ * We overlap with some dirty data undergoing background
+ * writeback, force this write to writeback
+ */
+ s->iop.bypass = false;
+ s->iop.writeback = true;
+ }
+
+ /*
+ * Discards aren't _required_ to do anything, so skipping if
+ * check_overlapping returned true is ok
+ *
+ * But check_overlapping drops dirty keys for which io hasn't started,
+ * so we still want to call it.
+ */
+ if (bio->bi_rw & REQ_DISCARD)
+ s->iop.bypass = true;
+
+ if (should_writeback(dc, s->orig_bio,
+ cache_mode(dc, bio),
+ s->iop.bypass)) {
+ s->iop.bypass = false;
+ s->iop.writeback = true;
+ }
+
+ if (s->iop.bypass) {
+ s->iop.bio = s->orig_bio;
+ bio_get(s->iop.bio);
+
+ if (!(bio->bi_rw & REQ_DISCARD) ||
+ blk_queue_discard(bdev_get_queue(dc->bdev)))
+ closure_bio_submit(bio, cl, s->d);
+ } else if (s->iop.writeback) {
+ bch_writeback_add(dc);
+ s->iop.bio = bio;
+
+ if (bio->bi_rw & REQ_FLUSH) {
+ /* Also need to send a flush to the backing device */
+ struct bio *flush = bio_alloc_bioset(GFP_NOIO, 0,
+ dc->disk.bio_split);
+
+ flush->bi_rw = WRITE_FLUSH;
+ flush->bi_bdev = bio->bi_bdev;
+ flush->bi_end_io = request_endio;
+ flush->bi_private = cl;
+
+ closure_bio_submit(flush, cl, s->d);
+ }
+ } else {
+ s->iop.bio = bio_clone_fast(bio, GFP_NOIO, dc->disk.bio_split);
+
+ closure_bio_submit(bio, cl, s->d);
+ }
+
+ closure_call(&s->iop.cl, bch_data_insert, NULL, cl);
+ continue_at(cl, cached_dev_write_complete, NULL);
+}
+
+static void cached_dev_nodata(struct closure *cl)
+{
+ struct search *s = container_of(cl, struct search, cl);
+ struct bio *bio = &s->bio.bio;
+
+ if (s->iop.flush_journal)
+ bch_journal_meta(s->iop.c, cl);
+
+ /* If it's a flush, we send the flush to the backing device too */
+ closure_bio_submit(bio, cl, s->d);
+
+ continue_at(cl, cached_dev_bio_complete, NULL);
+}
+
+/* Cached devices - read & write stuff */
+
+static void cached_dev_make_request(struct request_queue *q, struct bio *bio)
+{
+ struct search *s;
+ struct bcache_device *d = bio->bi_bdev->bd_disk->private_data;
+ struct cached_dev *dc = container_of(d, struct cached_dev, disk);
+ int rw = bio_data_dir(bio);
+
+ generic_start_io_acct(rw, bio_sectors(bio), &d->disk->part0);
+
+ bio->bi_bdev = dc->bdev;
+ bio->bi_iter.bi_sector += dc->sb.data_offset;
+
+ if (cached_dev_get(dc)) {
+ s = search_alloc(bio, d);
+ trace_bcache_request_start(s->d, bio);
+
+ if (!bio->bi_iter.bi_size) {
+ /*
+ * can't call bch_journal_meta from under
+ * generic_make_request
+ */
+ continue_at_nobarrier(&s->cl,
+ cached_dev_nodata,
+ bcache_wq);
+ } else {
+ s->iop.bypass = check_should_bypass(dc, bio);
+
+ if (rw)
+ cached_dev_write(dc, s);
+ else
+ cached_dev_read(dc, s);
+ }
+ } else {
+ if ((bio->bi_rw & REQ_DISCARD) &&
+ !blk_queue_discard(bdev_get_queue(dc->bdev)))
+ bio_endio(bio, 0);
+ else
+ bch_generic_make_request(bio, &d->bio_split_hook);
+ }
+}
+
+static int cached_dev_ioctl(struct bcache_device *d, fmode_t mode,
+ unsigned int cmd, unsigned long arg)
+{
+ struct cached_dev *dc = container_of(d, struct cached_dev, disk);
+ return __blkdev_driver_ioctl(dc->bdev, mode, cmd, arg);
+}
+
+static int cached_dev_congested(void *data, int bits)
+{
+ struct bcache_device *d = data;
+ struct cached_dev *dc = container_of(d, struct cached_dev, disk);
+ struct request_queue *q = bdev_get_queue(dc->bdev);
+ int ret = 0;
+
+ if (bdi_congested(&q->backing_dev_info, bits))
+ return 1;
+
+ if (cached_dev_get(dc)) {
+ unsigned i;
+ struct cache *ca;
+
+ for_each_cache(ca, d->c, i) {
+ q = bdev_get_queue(ca->bdev);
+ ret |= bdi_congested(&q->backing_dev_info, bits);
+ }
+
+ cached_dev_put(dc);
+ }
+
+ return ret;
+}
+
+void bch_cached_dev_request_init(struct cached_dev *dc)
+{
+ struct gendisk *g = dc->disk.disk;
+
+ g->queue->make_request_fn = cached_dev_make_request;
+ g->queue->backing_dev_info.congested_fn = cached_dev_congested;
+ dc->disk.cache_miss = cached_dev_cache_miss;
+ dc->disk.ioctl = cached_dev_ioctl;
+}
+
+/* Flash backed devices */
+
+static int flash_dev_cache_miss(struct btree *b, struct search *s,
+ struct bio *bio, unsigned sectors)
+{
+ unsigned bytes = min(sectors, bio_sectors(bio)) << 9;
+
+ swap(bio->bi_iter.bi_size, bytes);
+ zero_fill_bio(bio);
+ swap(bio->bi_iter.bi_size, bytes);
+
+ bio_advance(bio, bytes);
+
+ if (!bio->bi_iter.bi_size)
+ return MAP_DONE;
+
+ return MAP_CONTINUE;
+}
+
+static void flash_dev_nodata(struct closure *cl)
+{
+ struct search *s = container_of(cl, struct search, cl);
+
+ if (s->iop.flush_journal)
+ bch_journal_meta(s->iop.c, cl);
+
+ continue_at(cl, search_free, NULL);
+}
+
+static void flash_dev_make_request(struct request_queue *q, struct bio *bio)
+{
+ struct search *s;
+ struct closure *cl;
+ struct bcache_device *d = bio->bi_bdev->bd_disk->private_data;
+ int rw = bio_data_dir(bio);
+
+ generic_start_io_acct(rw, bio_sectors(bio), &d->disk->part0);
+
+ s = search_alloc(bio, d);
+ cl = &s->cl;
+ bio = &s->bio.bio;
+
+ trace_bcache_request_start(s->d, bio);
+
+ if (!bio->bi_iter.bi_size) {
+ /*
+ * can't call bch_journal_meta from under
+ * generic_make_request
+ */
+ continue_at_nobarrier(&s->cl,
+ flash_dev_nodata,
+ bcache_wq);
+ } else if (rw) {
+ bch_keybuf_check_overlapping(&s->iop.c->moving_gc_keys,
+ &KEY(d->id, bio->bi_iter.bi_sector, 0),
+ &KEY(d->id, bio_end_sector(bio), 0));
+
+ s->iop.bypass = (bio->bi_rw & REQ_DISCARD) != 0;
+ s->iop.writeback = true;
+ s->iop.bio = bio;
+
+ closure_call(&s->iop.cl, bch_data_insert, NULL, cl);
+ } else {
+ closure_call(&s->iop.cl, cache_lookup, NULL, cl);
+ }
+
+ continue_at(cl, search_free, NULL);
+}
+
+static int flash_dev_ioctl(struct bcache_device *d, fmode_t mode,
+ unsigned int cmd, unsigned long arg)
+{
+ return -ENOTTY;
+}
+
+static int flash_dev_congested(void *data, int bits)
+{
+ struct bcache_device *d = data;
+ struct request_queue *q;
+ struct cache *ca;
+ unsigned i;
+ int ret = 0;
+
+ for_each_cache(ca, d->c, i) {
+ q = bdev_get_queue(ca->bdev);
+ ret |= bdi_congested(&q->backing_dev_info, bits);
+ }
+
+ return ret;
+}
+
+void bch_flash_dev_request_init(struct bcache_device *d)
+{
+ struct gendisk *g = d->disk;
+
+ g->queue->make_request_fn = flash_dev_make_request;
+ g->queue->backing_dev_info.congested_fn = flash_dev_congested;
+ d->cache_miss = flash_dev_cache_miss;
+ d->ioctl = flash_dev_ioctl;
+}
+
+void bch_request_exit(void)
+{
+ if (bch_search_cache)
+ kmem_cache_destroy(bch_search_cache);
+}
+
+int __init bch_request_init(void)
+{
+ bch_search_cache = KMEM_CACHE(search, 0);
+ if (!bch_search_cache)
+ return -ENOMEM;
+
+ return 0;
+}
diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h
new file mode 100644
index 000000000..1ff36875c
--- /dev/null
+++ b/drivers/md/bcache/request.h
@@ -0,0 +1,43 @@
+#ifndef _BCACHE_REQUEST_H_
+#define _BCACHE_REQUEST_H_
+
+struct data_insert_op {
+ struct closure cl;
+ struct cache_set *c;
+ struct bio *bio;
+ struct workqueue_struct *wq;
+
+ unsigned inode;
+ uint16_t write_point;
+ uint16_t write_prio;
+ short error;
+
+ union {
+ uint16_t flags;
+
+ struct {
+ unsigned bypass:1;
+ unsigned writeback:1;
+ unsigned flush_journal:1;
+ unsigned csum:1;
+
+ unsigned replace:1;
+ unsigned replace_collision:1;
+
+ unsigned insert_data_done:1;
+ };
+ };
+
+ struct keylist insert_keys;
+ BKEY_PADDED(replace_key);
+};
+
+unsigned bch_get_congested(struct cache_set *);
+void bch_data_insert(struct closure *cl);
+
+void bch_cached_dev_request_init(struct cached_dev *dc);
+void bch_flash_dev_request_init(struct bcache_device *d);
+
+extern struct kmem_cache *bch_search_cache, *bch_passthrough_cache;
+
+#endif /* _BCACHE_REQUEST_H_ */
diff --git a/drivers/md/bcache/stats.c b/drivers/md/bcache/stats.c
new file mode 100644
index 000000000..0ca072c20
--- /dev/null
+++ b/drivers/md/bcache/stats.c
@@ -0,0 +1,241 @@
+/*
+ * bcache stats code
+ *
+ * Copyright 2012 Google, Inc.
+ */
+
+#include "bcache.h"
+#include "stats.h"
+#include "btree.h"
+#include "sysfs.h"
+
+/*
+ * We keep absolute totals of various statistics, and addionally a set of three
+ * rolling averages.
+ *
+ * Every so often, a timer goes off and rescales the rolling averages.
+ * accounting_rescale[] is how many times the timer has to go off before we
+ * rescale each set of numbers; that gets us half lives of 5 minutes, one hour,
+ * and one day.
+ *
+ * accounting_delay is how often the timer goes off - 22 times in 5 minutes,
+ * and accounting_weight is what we use to rescale:
+ *
+ * pow(31 / 32, 22) ~= 1/2
+ *
+ * So that we don't have to increment each set of numbers every time we (say)
+ * get a cache hit, we increment a single atomic_t in acc->collector, and when
+ * the rescale function runs it resets the atomic counter to 0 and adds its
+ * old value to each of the exported numbers.
+ *
+ * To reduce rounding error, the numbers in struct cache_stats are all
+ * stored left shifted by 16, and scaled back in the sysfs show() function.
+ */
+
+static const unsigned DAY_RESCALE = 288;
+static const unsigned HOUR_RESCALE = 12;
+static const unsigned FIVE_MINUTE_RESCALE = 1;
+static const unsigned accounting_delay = (HZ * 300) / 22;
+static const unsigned accounting_weight = 32;
+
+/* sysfs reading/writing */
+
+read_attribute(cache_hits);
+read_attribute(cache_misses);
+read_attribute(cache_bypass_hits);
+read_attribute(cache_bypass_misses);
+read_attribute(cache_hit_ratio);
+read_attribute(cache_readaheads);
+read_attribute(cache_miss_collisions);
+read_attribute(bypassed);
+
+SHOW(bch_stats)
+{
+ struct cache_stats *s =
+ container_of(kobj, struct cache_stats, kobj);
+#define var(stat) (s->stat >> 16)
+ var_print(cache_hits);
+ var_print(cache_misses);
+ var_print(cache_bypass_hits);
+ var_print(cache_bypass_misses);
+
+ sysfs_print(cache_hit_ratio,
+ DIV_SAFE(var(cache_hits) * 100,
+ var(cache_hits) + var(cache_misses)));
+
+ var_print(cache_readaheads);
+ var_print(cache_miss_collisions);
+ sysfs_hprint(bypassed, var(sectors_bypassed) << 9);
+#undef var
+ return 0;
+}
+
+STORE(bch_stats)
+{
+ return size;
+}
+
+static void bch_stats_release(struct kobject *k)
+{
+}
+
+static struct attribute *bch_stats_files[] = {
+ &sysfs_cache_hits,
+ &sysfs_cache_misses,
+ &sysfs_cache_bypass_hits,
+ &sysfs_cache_bypass_misses,
+ &sysfs_cache_hit_ratio,
+ &sysfs_cache_readaheads,
+ &sysfs_cache_miss_collisions,
+ &sysfs_bypassed,
+ NULL
+};
+static KTYPE(bch_stats);
+
+int bch_cache_accounting_add_kobjs(struct cache_accounting *acc,
+ struct kobject *parent)
+{
+ int ret = kobject_add(&acc->total.kobj, parent,
+ "stats_total");
+ ret = ret ?: kobject_add(&acc->five_minute.kobj, parent,
+ "stats_five_minute");
+ ret = ret ?: kobject_add(&acc->hour.kobj, parent,
+ "stats_hour");
+ ret = ret ?: kobject_add(&acc->day.kobj, parent,
+ "stats_day");
+ return ret;
+}
+
+void bch_cache_accounting_clear(struct cache_accounting *acc)
+{
+ memset(&acc->total.cache_hits,
+ 0,
+ sizeof(unsigned long) * 7);
+}
+
+void bch_cache_accounting_destroy(struct cache_accounting *acc)
+{
+ kobject_put(&acc->total.kobj);
+ kobject_put(&acc->five_minute.kobj);
+ kobject_put(&acc->hour.kobj);
+ kobject_put(&acc->day.kobj);
+
+ atomic_set(&acc->closing, 1);
+ if (del_timer_sync(&acc->timer))
+ closure_return(&acc->cl);
+}
+
+/* EWMA scaling */
+
+static void scale_stat(unsigned long *stat)
+{
+ *stat = ewma_add(*stat, 0, accounting_weight, 0);
+}
+
+static void scale_stats(struct cache_stats *stats, unsigned long rescale_at)
+{
+ if (++stats->rescale == rescale_at) {
+ stats->rescale = 0;
+ scale_stat(&stats->cache_hits);
+ scale_stat(&stats->cache_misses);
+ scale_stat(&stats->cache_bypass_hits);
+ scale_stat(&stats->cache_bypass_misses);
+ scale_stat(&stats->cache_readaheads);
+ scale_stat(&stats->cache_miss_collisions);
+ scale_stat(&stats->sectors_bypassed);
+ }
+}
+
+static void scale_accounting(unsigned long data)
+{
+ struct cache_accounting *acc = (struct cache_accounting *) data;
+
+#define move_stat(name) do { \
+ unsigned t = atomic_xchg(&acc->collector.name, 0); \
+ t <<= 16; \
+ acc->five_minute.name += t; \
+ acc->hour.name += t; \
+ acc->day.name += t; \
+ acc->total.name += t; \
+} while (0)
+
+ move_stat(cache_hits);
+ move_stat(cache_misses);
+ move_stat(cache_bypass_hits);
+ move_stat(cache_bypass_misses);
+ move_stat(cache_readaheads);
+ move_stat(cache_miss_collisions);
+ move_stat(sectors_bypassed);
+
+ scale_stats(&acc->total, 0);
+ scale_stats(&acc->day, DAY_RESCALE);
+ scale_stats(&acc->hour, HOUR_RESCALE);
+ scale_stats(&acc->five_minute, FIVE_MINUTE_RESCALE);
+
+ acc->timer.expires += accounting_delay;
+
+ if (!atomic_read(&acc->closing))
+ add_timer(&acc->timer);
+ else
+ closure_return(&acc->cl);
+}
+
+static void mark_cache_stats(struct cache_stat_collector *stats,
+ bool hit, bool bypass)
+{
+ if (!bypass)
+ if (hit)
+ atomic_inc(&stats->cache_hits);
+ else
+ atomic_inc(&stats->cache_misses);
+ else
+ if (hit)
+ atomic_inc(&stats->cache_bypass_hits);
+ else
+ atomic_inc(&stats->cache_bypass_misses);
+}
+
+void bch_mark_cache_accounting(struct cache_set *c, struct bcache_device *d,
+ bool hit, bool bypass)
+{
+ struct cached_dev *dc = container_of(d, struct cached_dev, disk);
+ mark_cache_stats(&dc->accounting.collector, hit, bypass);
+ mark_cache_stats(&c->accounting.collector, hit, bypass);
+}
+
+void bch_mark_cache_readahead(struct cache_set *c, struct bcache_device *d)
+{
+ struct cached_dev *dc = container_of(d, struct cached_dev, disk);
+ atomic_inc(&dc->accounting.collector.cache_readaheads);
+ atomic_inc(&c->accounting.collector.cache_readaheads);
+}
+
+void bch_mark_cache_miss_collision(struct cache_set *c, struct bcache_device *d)
+{
+ struct cached_dev *dc = container_of(d, struct cached_dev, disk);
+ atomic_inc(&dc->accounting.collector.cache_miss_collisions);
+ atomic_inc(&c->accounting.collector.cache_miss_collisions);
+}
+
+void bch_mark_sectors_bypassed(struct cache_set *c, struct cached_dev *dc,
+ int sectors)
+{
+ atomic_add(sectors, &dc->accounting.collector.sectors_bypassed);
+ atomic_add(sectors, &c->accounting.collector.sectors_bypassed);
+}
+
+void bch_cache_accounting_init(struct cache_accounting *acc,
+ struct closure *parent)
+{
+ kobject_init(&acc->total.kobj, &bch_stats_ktype);
+ kobject_init(&acc->five_minute.kobj, &bch_stats_ktype);
+ kobject_init(&acc->hour.kobj, &bch_stats_ktype);
+ kobject_init(&acc->day.kobj, &bch_stats_ktype);
+
+ closure_init(&acc->cl, parent);
+ init_timer(&acc->timer);
+ acc->timer.expires = jiffies + accounting_delay;
+ acc->timer.data = (unsigned long) acc;
+ acc->timer.function = scale_accounting;
+ add_timer(&acc->timer);
+}
diff --git a/drivers/md/bcache/stats.h b/drivers/md/bcache/stats.h
new file mode 100644
index 000000000..adbff141c
--- /dev/null
+++ b/drivers/md/bcache/stats.h
@@ -0,0 +1,61 @@
+#ifndef _BCACHE_STATS_H_
+#define _BCACHE_STATS_H_
+
+struct cache_stat_collector {
+ atomic_t cache_hits;
+ atomic_t cache_misses;
+ atomic_t cache_bypass_hits;
+ atomic_t cache_bypass_misses;
+ atomic_t cache_readaheads;
+ atomic_t cache_miss_collisions;
+ atomic_t sectors_bypassed;
+};
+
+struct cache_stats {
+ struct kobject kobj;
+
+ unsigned long cache_hits;
+ unsigned long cache_misses;
+ unsigned long cache_bypass_hits;
+ unsigned long cache_bypass_misses;
+ unsigned long cache_readaheads;
+ unsigned long cache_miss_collisions;
+ unsigned long sectors_bypassed;
+
+ unsigned rescale;
+};
+
+struct cache_accounting {
+ struct closure cl;
+ struct timer_list timer;
+ atomic_t closing;
+
+ struct cache_stat_collector collector;
+
+ struct cache_stats total;
+ struct cache_stats five_minute;
+ struct cache_stats hour;
+ struct cache_stats day;
+};
+
+struct cache_set;
+struct cached_dev;
+struct bcache_device;
+
+void bch_cache_accounting_init(struct cache_accounting *acc,
+ struct closure *parent);
+
+int bch_cache_accounting_add_kobjs(struct cache_accounting *acc,
+ struct kobject *parent);
+
+void bch_cache_accounting_clear(struct cache_accounting *acc);
+
+void bch_cache_accounting_destroy(struct cache_accounting *acc);
+
+void bch_mark_cache_accounting(struct cache_set *, struct bcache_device *,
+ bool, bool);
+void bch_mark_cache_readahead(struct cache_set *, struct bcache_device *);
+void bch_mark_cache_miss_collision(struct cache_set *, struct bcache_device *);
+void bch_mark_sectors_bypassed(struct cache_set *, struct cached_dev *, int);
+
+#endif /* _BCACHE_STATS_H_ */
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
new file mode 100644
index 000000000..4dd2bb716
--- /dev/null
+++ b/drivers/md/bcache/super.c
@@ -0,0 +1,2120 @@
+/*
+ * bcache setup/teardown code, and some metadata io - read a superblock and
+ * figure out what to do with it.
+ *
+ * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
+ * Copyright 2012 Google, Inc.
+ */
+
+#include "bcache.h"
+#include "btree.h"
+#include "debug.h"
+#include "extents.h"
+#include "request.h"
+#include "writeback.h"
+
+#include <linux/blkdev.h>
+#include <linux/buffer_head.h>
+#include <linux/debugfs.h>
+#include <linux/genhd.h>
+#include <linux/idr.h>
+#include <linux/kthread.h>
+#include <linux/module.h>
+#include <linux/random.h>
+#include <linux/reboot.h>
+#include <linux/sysfs.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>");
+
+static const char bcache_magic[] = {
+ 0xc6, 0x85, 0x73, 0xf6, 0x4e, 0x1a, 0x45, 0xca,
+ 0x82, 0x65, 0xf5, 0x7f, 0x48, 0xba, 0x6d, 0x81
+};
+
+static const char invalid_uuid[] = {
+ 0xa0, 0x3e, 0xf8, 0xed, 0x3e, 0xe1, 0xb8, 0x78,
+ 0xc8, 0x50, 0xfc, 0x5e, 0xcb, 0x16, 0xcd, 0x99
+};
+
+/* Default is -1; we skip past it for struct cached_dev's cache mode */
+const char * const bch_cache_modes[] = {
+ "default",
+ "writethrough",
+ "writeback",
+ "writearound",
+ "none",
+ NULL
+};
+
+static struct kobject *bcache_kobj;
+struct mutex bch_register_lock;
+LIST_HEAD(bch_cache_sets);
+static LIST_HEAD(uncached_devices);
+
+static int bcache_major;
+static DEFINE_IDA(bcache_minor);
+static wait_queue_head_t unregister_wait;
+struct workqueue_struct *bcache_wq;
+
+#define BTREE_MAX_PAGES (256 * 1024 / PAGE_SIZE)
+
+static void bio_split_pool_free(struct bio_split_pool *p)
+{
+ if (p->bio_split_hook)
+ mempool_destroy(p->bio_split_hook);
+
+ if (p->bio_split)
+ bioset_free(p->bio_split);
+}
+
+static int bio_split_pool_init(struct bio_split_pool *p)
+{
+ p->bio_split = bioset_create(4, 0);
+ if (!p->bio_split)
+ return -ENOMEM;
+
+ p->bio_split_hook = mempool_create_kmalloc_pool(4,
+ sizeof(struct bio_split_hook));
+ if (!p->bio_split_hook)
+ return -ENOMEM;
+
+ return 0;
+}
+
+/* Superblock */
+
+static const char *read_super(struct cache_sb *sb, struct block_device *bdev,
+ struct page **res)
+{
+ const char *err;
+ struct cache_sb *s;
+ struct buffer_head *bh = __bread(bdev, 1, SB_SIZE);
+ unsigned i;
+
+ if (!bh)
+ return "IO error";
+
+ s = (struct cache_sb *) bh->b_data;
+
+ sb->offset = le64_to_cpu(s->offset);
+ sb->version = le64_to_cpu(s->version);
+
+ memcpy(sb->magic, s->magic, 16);
+ memcpy(sb->uuid, s->uuid, 16);
+ memcpy(sb->set_uuid, s->set_uuid, 16);
+ memcpy(sb->label, s->label, SB_LABEL_SIZE);
+
+ sb->flags = le64_to_cpu(s->flags);
+ sb->seq = le64_to_cpu(s->seq);
+ sb->last_mount = le32_to_cpu(s->last_mount);
+ sb->first_bucket = le16_to_cpu(s->first_bucket);
+ sb->keys = le16_to_cpu(s->keys);
+
+ for (i = 0; i < SB_JOURNAL_BUCKETS; i++)
+ sb->d[i] = le64_to_cpu(s->d[i]);
+
+ pr_debug("read sb version %llu, flags %llu, seq %llu, journal size %u",
+ sb->version, sb->flags, sb->seq, sb->keys);
+
+ err = "Not a bcache superblock";
+ if (sb->offset != SB_SECTOR)
+ goto err;
+
+ if (memcmp(sb->magic, bcache_magic, 16))
+ goto err;
+
+ err = "Too many journal buckets";
+ if (sb->keys > SB_JOURNAL_BUCKETS)
+ goto err;
+
+ err = "Bad checksum";
+ if (s->csum != csum_set(s))
+ goto err;
+
+ err = "Bad UUID";
+ if (bch_is_zero(sb->uuid, 16))
+ goto err;
+
+ sb->block_size = le16_to_cpu(s->block_size);
+
+ err = "Superblock block size smaller than device block size";
+ if (sb->block_size << 9 < bdev_logical_block_size(bdev))
+ goto err;
+
+ switch (sb->version) {
+ case BCACHE_SB_VERSION_BDEV:
+ sb->data_offset = BDEV_DATA_START_DEFAULT;
+ break;
+ case BCACHE_SB_VERSION_BDEV_WITH_OFFSET:
+ sb->data_offset = le64_to_cpu(s->data_offset);
+
+ err = "Bad data offset";
+ if (sb->data_offset < BDEV_DATA_START_DEFAULT)
+ goto err;
+
+ break;
+ case BCACHE_SB_VERSION_CDEV:
+ case BCACHE_SB_VERSION_CDEV_WITH_UUID:
+ sb->nbuckets = le64_to_cpu(s->nbuckets);
+ sb->block_size = le16_to_cpu(s->block_size);
+ sb->bucket_size = le16_to_cpu(s->bucket_size);
+
+ sb->nr_in_set = le16_to_cpu(s->nr_in_set);
+ sb->nr_this_dev = le16_to_cpu(s->nr_this_dev);
+
+ err = "Too many buckets";
+ if (sb->nbuckets > LONG_MAX)
+ goto err;
+
+ err = "Not enough buckets";
+ if (sb->nbuckets < 1 << 7)
+ goto err;
+
+ err = "Bad block/bucket size";
+ if (!is_power_of_2(sb->block_size) ||
+ sb->block_size > PAGE_SECTORS ||
+ !is_power_of_2(sb->bucket_size) ||
+ sb->bucket_size < PAGE_SECTORS)
+ goto err;
+
+ err = "Invalid superblock: device too small";
+ if (get_capacity(bdev->bd_disk) < sb->bucket_size * sb->nbuckets)
+ goto err;
+
+ err = "Bad UUID";
+ if (bch_is_zero(sb->set_uuid, 16))
+ goto err;
+
+ err = "Bad cache device number in set";
+ if (!sb->nr_in_set ||
+ sb->nr_in_set <= sb->nr_this_dev ||
+ sb->nr_in_set > MAX_CACHES_PER_SET)
+ goto err;
+
+ err = "Journal buckets not sequential";
+ for (i = 0; i < sb->keys; i++)
+ if (sb->d[i] != sb->first_bucket + i)
+ goto err;
+
+ err = "Too many journal buckets";
+ if (sb->first_bucket + sb->keys > sb->nbuckets)
+ goto err;
+
+ err = "Invalid superblock: first bucket comes before end of super";
+ if (sb->first_bucket * sb->bucket_size < 16)
+ goto err;
+
+ break;
+ default:
+ err = "Unsupported superblock version";
+ goto err;
+ }
+
+ sb->last_mount = get_seconds();
+ err = NULL;
+
+ get_page(bh->b_page);
+ *res = bh->b_page;
+err:
+ put_bh(bh);
+ return err;
+}
+
+static void write_bdev_super_endio(struct bio *bio, int error)
+{
+ struct cached_dev *dc = bio->bi_private;
+ /* XXX: error checking */
+
+ closure_put(&dc->sb_write);
+}
+
+static void __write_super(struct cache_sb *sb, struct bio *bio)
+{
+ struct cache_sb *out = page_address(bio->bi_io_vec[0].bv_page);
+ unsigned i;
+
+ bio->bi_iter.bi_sector = SB_SECTOR;
+ bio->bi_rw = REQ_SYNC|REQ_META;
+ bio->bi_iter.bi_size = SB_SIZE;
+ bch_bio_map(bio, NULL);
+
+ out->offset = cpu_to_le64(sb->offset);
+ out->version = cpu_to_le64(sb->version);
+
+ memcpy(out->uuid, sb->uuid, 16);
+ memcpy(out->set_uuid, sb->set_uuid, 16);
+ memcpy(out->label, sb->label, SB_LABEL_SIZE);
+
+ out->flags = cpu_to_le64(sb->flags);
+ out->seq = cpu_to_le64(sb->seq);
+
+ out->last_mount = cpu_to_le32(sb->last_mount);
+ out->first_bucket = cpu_to_le16(sb->first_bucket);
+ out->keys = cpu_to_le16(sb->keys);
+
+ for (i = 0; i < sb->keys; i++)
+ out->d[i] = cpu_to_le64(sb->d[i]);
+
+ out->csum = csum_set(out);
+
+ pr_debug("ver %llu, flags %llu, seq %llu",
+ sb->version, sb->flags, sb->seq);
+
+ submit_bio(REQ_WRITE, bio);
+}
+
+static void bch_write_bdev_super_unlock(struct closure *cl)
+{
+ struct cached_dev *dc = container_of(cl, struct cached_dev, sb_write);
+
+ up(&dc->sb_write_mutex);
+}
+
+void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent)
+{
+ struct closure *cl = &dc->sb_write;
+ struct bio *bio = &dc->sb_bio;
+
+ down(&dc->sb_write_mutex);
+ closure_init(cl, parent);
+
+ bio_reset(bio);
+ bio->bi_bdev = dc->bdev;
+ bio->bi_end_io = write_bdev_super_endio;
+ bio->bi_private = dc;
+
+ closure_get(cl);
+ __write_super(&dc->sb, bio);
+
+ closure_return_with_destructor(cl, bch_write_bdev_super_unlock);
+}
+
+static void write_super_endio(struct bio *bio, int error)
+{
+ struct cache *ca = bio->bi_private;
+
+ bch_count_io_errors(ca, error, "writing superblock");
+ closure_put(&ca->set->sb_write);
+}
+
+static void bcache_write_super_unlock(struct closure *cl)
+{
+ struct cache_set *c = container_of(cl, struct cache_set, sb_write);
+
+ up(&c->sb_write_mutex);
+}
+
+void bcache_write_super(struct cache_set *c)
+{
+ struct closure *cl = &c->sb_write;
+ struct cache *ca;
+ unsigned i;
+
+ down(&c->sb_write_mutex);
+ closure_init(cl, &c->cl);
+
+ c->sb.seq++;
+
+ for_each_cache(ca, c, i) {
+ struct bio *bio = &ca->sb_bio;
+
+ ca->sb.version = BCACHE_SB_VERSION_CDEV_WITH_UUID;
+ ca->sb.seq = c->sb.seq;
+ ca->sb.last_mount = c->sb.last_mount;
+
+ SET_CACHE_SYNC(&ca->sb, CACHE_SYNC(&c->sb));
+
+ bio_reset(bio);
+ bio->bi_bdev = ca->bdev;
+ bio->bi_end_io = write_super_endio;
+ bio->bi_private = ca;
+
+ closure_get(cl);
+ __write_super(&ca->sb, bio);
+ }
+
+ closure_return_with_destructor(cl, bcache_write_super_unlock);
+}
+
+/* UUID io */
+
+static void uuid_endio(struct bio *bio, int error)
+{
+ struct closure *cl = bio->bi_private;
+ struct cache_set *c = container_of(cl, struct cache_set, uuid_write);
+
+ cache_set_err_on(error, c, "accessing uuids");
+ bch_bbio_free(bio, c);
+ closure_put(cl);
+}
+
+static void uuid_io_unlock(struct closure *cl)
+{
+ struct cache_set *c = container_of(cl, struct cache_set, uuid_write);
+
+ up(&c->uuid_write_mutex);
+}
+
+static void uuid_io(struct cache_set *c, unsigned long rw,
+ struct bkey *k, struct closure *parent)
+{
+ struct closure *cl = &c->uuid_write;
+ struct uuid_entry *u;
+ unsigned i;
+ char buf[80];
+
+ BUG_ON(!parent);
+ down(&c->uuid_write_mutex);
+ closure_init(cl, parent);
+
+ for (i = 0; i < KEY_PTRS(k); i++) {
+ struct bio *bio = bch_bbio_alloc(c);
+
+ bio->bi_rw = REQ_SYNC|REQ_META|rw;
+ bio->bi_iter.bi_size = KEY_SIZE(k) << 9;
+
+ bio->bi_end_io = uuid_endio;
+ bio->bi_private = cl;
+ bch_bio_map(bio, c->uuids);
+
+ bch_submit_bbio(bio, c, k, i);
+
+ if (!(rw & WRITE))
+ break;
+ }
+
+ bch_extent_to_text(buf, sizeof(buf), k);
+ pr_debug("%s UUIDs at %s", rw & REQ_WRITE ? "wrote" : "read", buf);
+
+ for (u = c->uuids; u < c->uuids + c->nr_uuids; u++)
+ if (!bch_is_zero(u->uuid, 16))
+ pr_debug("Slot %zi: %pU: %s: 1st: %u last: %u inv: %u",
+ u - c->uuids, u->uuid, u->label,
+ u->first_reg, u->last_reg, u->invalidated);
+
+ closure_return_with_destructor(cl, uuid_io_unlock);
+}
+
+static char *uuid_read(struct cache_set *c, struct jset *j, struct closure *cl)
+{
+ struct bkey *k = &j->uuid_bucket;
+
+ if (__bch_btree_ptr_invalid(c, k))
+ return "bad uuid pointer";
+
+ bkey_copy(&c->uuid_bucket, k);
+ uuid_io(c, READ_SYNC, k, cl);
+
+ if (j->version < BCACHE_JSET_VERSION_UUIDv1) {
+ struct uuid_entry_v0 *u0 = (void *) c->uuids;
+ struct uuid_entry *u1 = (void *) c->uuids;
+ int i;
+
+ closure_sync(cl);
+
+ /*
+ * Since the new uuid entry is bigger than the old, we have to
+ * convert starting at the highest memory address and work down
+ * in order to do it in place
+ */
+
+ for (i = c->nr_uuids - 1;
+ i >= 0;
+ --i) {
+ memcpy(u1[i].uuid, u0[i].uuid, 16);
+ memcpy(u1[i].label, u0[i].label, 32);
+
+ u1[i].first_reg = u0[i].first_reg;
+ u1[i].last_reg = u0[i].last_reg;
+ u1[i].invalidated = u0[i].invalidated;
+
+ u1[i].flags = 0;
+ u1[i].sectors = 0;
+ }
+ }
+
+ return NULL;
+}
+
+static int __uuid_write(struct cache_set *c)
+{
+ BKEY_PADDED(key) k;
+ struct closure cl;
+ closure_init_stack(&cl);
+
+ lockdep_assert_held(&bch_register_lock);
+
+ if (bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, true))
+ return 1;
+
+ SET_KEY_SIZE(&k.key, c->sb.bucket_size);
+ uuid_io(c, REQ_WRITE, &k.key, &cl);
+ closure_sync(&cl);
+
+ bkey_copy(&c->uuid_bucket, &k.key);
+ bkey_put(c, &k.key);
+ return 0;
+}
+
+int bch_uuid_write(struct cache_set *c)
+{
+ int ret = __uuid_write(c);
+
+ if (!ret)
+ bch_journal_meta(c, NULL);
+
+ return ret;
+}
+
+static struct uuid_entry *uuid_find(struct cache_set *c, const char *uuid)
+{
+ struct uuid_entry *u;
+
+ for (u = c->uuids;
+ u < c->uuids + c->nr_uuids; u++)
+ if (!memcmp(u->uuid, uuid, 16))
+ return u;
+
+ return NULL;
+}
+
+static struct uuid_entry *uuid_find_empty(struct cache_set *c)
+{
+ static const char zero_uuid[16] = "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0";
+ return uuid_find(c, zero_uuid);
+}
+
+/*
+ * Bucket priorities/gens:
+ *
+ * For each bucket, we store on disk its
+ * 8 bit gen
+ * 16 bit priority
+ *
+ * See alloc.c for an explanation of the gen. The priority is used to implement
+ * lru (and in the future other) cache replacement policies; for most purposes
+ * it's just an opaque integer.
+ *
+ * The gens and the priorities don't have a whole lot to do with each other, and
+ * it's actually the gens that must be written out at specific times - it's no
+ * big deal if the priorities don't get written, if we lose them we just reuse
+ * buckets in suboptimal order.
+ *
+ * On disk they're stored in a packed array, and in as many buckets are required
+ * to fit them all. The buckets we use to store them form a list; the journal
+ * header points to the first bucket, the first bucket points to the second
+ * bucket, et cetera.
+ *
+ * This code is used by the allocation code; periodically (whenever it runs out
+ * of buckets to allocate from) the allocation code will invalidate some
+ * buckets, but it can't use those buckets until their new gens are safely on
+ * disk.
+ */
+
+static void prio_endio(struct bio *bio, int error)
+{
+ struct cache *ca = bio->bi_private;
+
+ cache_set_err_on(error, ca->set, "accessing priorities");
+ bch_bbio_free(bio, ca->set);
+ closure_put(&ca->prio);
+}
+
+static void prio_io(struct cache *ca, uint64_t bucket, unsigned long rw)
+{
+ struct closure *cl = &ca->prio;
+ struct bio *bio = bch_bbio_alloc(ca->set);
+
+ closure_init_stack(cl);
+
+ bio->bi_iter.bi_sector = bucket * ca->sb.bucket_size;
+ bio->bi_bdev = ca->bdev;
+ bio->bi_rw = REQ_SYNC|REQ_META|rw;
+ bio->bi_iter.bi_size = bucket_bytes(ca);
+
+ bio->bi_end_io = prio_endio;
+ bio->bi_private = ca;
+ bch_bio_map(bio, ca->disk_buckets);
+
+ closure_bio_submit(bio, &ca->prio, ca);
+ closure_sync(cl);
+}
+
+void bch_prio_write(struct cache *ca)
+{
+ int i;
+ struct bucket *b;
+ struct closure cl;
+
+ closure_init_stack(&cl);
+
+ lockdep_assert_held(&ca->set->bucket_lock);
+
+ ca->disk_buckets->seq++;
+
+ atomic_long_add(ca->sb.bucket_size * prio_buckets(ca),
+ &ca->meta_sectors_written);
+
+ //pr_debug("free %zu, free_inc %zu, unused %zu", fifo_used(&ca->free),
+ // fifo_used(&ca->free_inc), fifo_used(&ca->unused));
+
+ for (i = prio_buckets(ca) - 1; i >= 0; --i) {
+ long bucket;
+ struct prio_set *p = ca->disk_buckets;
+ struct bucket_disk *d = p->data;
+ struct bucket_disk *end = d + prios_per_bucket(ca);
+
+ for (b = ca->buckets + i * prios_per_bucket(ca);
+ b < ca->buckets + ca->sb.nbuckets && d < end;
+ b++, d++) {
+ d->prio = cpu_to_le16(b->prio);
+ d->gen = b->gen;
+ }
+
+ p->next_bucket = ca->prio_buckets[i + 1];
+ p->magic = pset_magic(&ca->sb);
+ p->csum = bch_crc64(&p->magic, bucket_bytes(ca) - 8);
+
+ bucket = bch_bucket_alloc(ca, RESERVE_PRIO, true);
+ BUG_ON(bucket == -1);
+
+ mutex_unlock(&ca->set->bucket_lock);
+ prio_io(ca, bucket, REQ_WRITE);
+ mutex_lock(&ca->set->bucket_lock);
+
+ ca->prio_buckets[i] = bucket;
+ atomic_dec_bug(&ca->buckets[bucket].pin);
+ }
+
+ mutex_unlock(&ca->set->bucket_lock);
+
+ bch_journal_meta(ca->set, &cl);
+ closure_sync(&cl);
+
+ mutex_lock(&ca->set->bucket_lock);
+
+ /*
+ * Don't want the old priorities to get garbage collected until after we
+ * finish writing the new ones, and they're journalled
+ */
+ for (i = 0; i < prio_buckets(ca); i++) {
+ if (ca->prio_last_buckets[i])
+ __bch_bucket_free(ca,
+ &ca->buckets[ca->prio_last_buckets[i]]);
+
+ ca->prio_last_buckets[i] = ca->prio_buckets[i];
+ }
+}
+
+static void prio_read(struct cache *ca, uint64_t bucket)
+{
+ struct prio_set *p = ca->disk_buckets;
+ struct bucket_disk *d = p->data + prios_per_bucket(ca), *end = d;
+ struct bucket *b;
+ unsigned bucket_nr = 0;
+
+ for (b = ca->buckets;
+ b < ca->buckets + ca->sb.nbuckets;
+ b++, d++) {
+ if (d == end) {
+ ca->prio_buckets[bucket_nr] = bucket;
+ ca->prio_last_buckets[bucket_nr] = bucket;
+ bucket_nr++;
+
+ prio_io(ca, bucket, READ_SYNC);
+
+ if (p->csum != bch_crc64(&p->magic, bucket_bytes(ca) - 8))
+ pr_warn("bad csum reading priorities");
+
+ if (p->magic != pset_magic(&ca->sb))
+ pr_warn("bad magic reading priorities");
+
+ bucket = p->next_bucket;
+ d = p->data;
+ }
+
+ b->prio = le16_to_cpu(d->prio);
+ b->gen = b->last_gc = d->gen;
+ }
+}
+
+/* Bcache device */
+
+static int open_dev(struct block_device *b, fmode_t mode)
+{
+ struct bcache_device *d = b->bd_disk->private_data;
+ if (test_bit(BCACHE_DEV_CLOSING, &d->flags))
+ return -ENXIO;
+
+ closure_get(&d->cl);
+ return 0;
+}
+
+static void release_dev(struct gendisk *b, fmode_t mode)
+{
+ struct bcache_device *d = b->private_data;
+ closure_put(&d->cl);
+}
+
+static int ioctl_dev(struct block_device *b, fmode_t mode,
+ unsigned int cmd, unsigned long arg)
+{
+ struct bcache_device *d = b->bd_disk->private_data;
+ return d->ioctl(d, mode, cmd, arg);
+}
+
+static const struct block_device_operations bcache_ops = {
+ .open = open_dev,
+ .release = release_dev,
+ .ioctl = ioctl_dev,
+ .owner = THIS_MODULE,
+};
+
+void bcache_device_stop(struct bcache_device *d)
+{
+ if (!test_and_set_bit(BCACHE_DEV_CLOSING, &d->flags))
+ closure_queue(&d->cl);
+}
+
+static void bcache_device_unlink(struct bcache_device *d)
+{
+ lockdep_assert_held(&bch_register_lock);
+
+ if (d->c && !test_and_set_bit(BCACHE_DEV_UNLINK_DONE, &d->flags)) {
+ unsigned i;
+ struct cache *ca;
+
+ sysfs_remove_link(&d->c->kobj, d->name);
+ sysfs_remove_link(&d->kobj, "cache");
+
+ for_each_cache(ca, d->c, i)
+ bd_unlink_disk_holder(ca->bdev, d->disk);
+ }
+}
+
+static void bcache_device_link(struct bcache_device *d, struct cache_set *c,
+ const char *name)
+{
+ unsigned i;
+ struct cache *ca;
+
+ for_each_cache(ca, d->c, i)
+ bd_link_disk_holder(ca->bdev, d->disk);
+
+ snprintf(d->name, BCACHEDEVNAME_SIZE,
+ "%s%u", name, d->id);
+
+ WARN(sysfs_create_link(&d->kobj, &c->kobj, "cache") ||
+ sysfs_create_link(&c->kobj, &d->kobj, d->name),
+ "Couldn't create device <-> cache set symlinks");
+}
+
+static void bcache_device_detach(struct bcache_device *d)
+{
+ lockdep_assert_held(&bch_register_lock);
+
+ if (test_bit(BCACHE_DEV_DETACHING, &d->flags)) {
+ struct uuid_entry *u = d->c->uuids + d->id;
+
+ SET_UUID_FLASH_ONLY(u, 0);
+ memcpy(u->uuid, invalid_uuid, 16);
+ u->invalidated = cpu_to_le32(get_seconds());
+ bch_uuid_write(d->c);
+ }
+
+ bcache_device_unlink(d);
+
+ d->c->devices[d->id] = NULL;
+ closure_put(&d->c->caching);
+ d->c = NULL;
+}
+
+static void bcache_device_attach(struct bcache_device *d, struct cache_set *c,
+ unsigned id)
+{
+ d->id = id;
+ d->c = c;
+ c->devices[id] = d;
+
+ closure_get(&c->caching);
+}
+
+static void bcache_device_free(struct bcache_device *d)
+{
+ lockdep_assert_held(&bch_register_lock);
+
+ pr_info("%s stopped", d->disk->disk_name);
+
+ if (d->c)
+ bcache_device_detach(d);
+ if (d->disk && d->disk->flags & GENHD_FL_UP)
+ del_gendisk(d->disk);
+ if (d->disk && d->disk->queue)
+ blk_cleanup_queue(d->disk->queue);
+ if (d->disk) {
+ ida_simple_remove(&bcache_minor, d->disk->first_minor);
+ put_disk(d->disk);
+ }
+
+ bio_split_pool_free(&d->bio_split_hook);
+ if (d->bio_split)
+ bioset_free(d->bio_split);
+ if (is_vmalloc_addr(d->full_dirty_stripes))
+ vfree(d->full_dirty_stripes);
+ else
+ kfree(d->full_dirty_stripes);
+ if (is_vmalloc_addr(d->stripe_sectors_dirty))
+ vfree(d->stripe_sectors_dirty);
+ else
+ kfree(d->stripe_sectors_dirty);
+
+ closure_debug_destroy(&d->cl);
+}
+
+static int bcache_device_init(struct bcache_device *d, unsigned block_size,
+ sector_t sectors)
+{
+ struct request_queue *q;
+ size_t n;
+ int minor;
+
+ if (!d->stripe_size)
+ d->stripe_size = 1 << 31;
+
+ d->nr_stripes = DIV_ROUND_UP_ULL(sectors, d->stripe_size);
+
+ if (!d->nr_stripes ||
+ d->nr_stripes > INT_MAX ||
+ d->nr_stripes > SIZE_MAX / sizeof(atomic_t)) {
+ pr_err("nr_stripes too large");
+ return -ENOMEM;
+ }
+
+ n = d->nr_stripes * sizeof(atomic_t);
+ d->stripe_sectors_dirty = n < PAGE_SIZE << 6
+ ? kzalloc(n, GFP_KERNEL)
+ : vzalloc(n);
+ if (!d->stripe_sectors_dirty)
+ return -ENOMEM;
+
+ n = BITS_TO_LONGS(d->nr_stripes) * sizeof(unsigned long);
+ d->full_dirty_stripes = n < PAGE_SIZE << 6
+ ? kzalloc(n, GFP_KERNEL)
+ : vzalloc(n);
+ if (!d->full_dirty_stripes)
+ return -ENOMEM;
+
+ minor = ida_simple_get(&bcache_minor, 0, MINORMASK + 1, GFP_KERNEL);
+ if (minor < 0)
+ return minor;
+
+ if (!(d->bio_split = bioset_create(4, offsetof(struct bbio, bio))) ||
+ bio_split_pool_init(&d->bio_split_hook) ||
+ !(d->disk = alloc_disk(1))) {
+ ida_simple_remove(&bcache_minor, minor);
+ return -ENOMEM;
+ }
+
+ set_capacity(d->disk, sectors);
+ snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", minor);
+
+ d->disk->major = bcache_major;
+ d->disk->first_minor = minor;
+ d->disk->fops = &bcache_ops;
+ d->disk->private_data = d;
+
+ q = blk_alloc_queue(GFP_KERNEL);
+ if (!q)
+ return -ENOMEM;
+
+ blk_queue_make_request(q, NULL);
+ d->disk->queue = q;
+ q->queuedata = d;
+ q->backing_dev_info.congested_data = d;
+ q->limits.max_hw_sectors = UINT_MAX;
+ q->limits.max_sectors = UINT_MAX;
+ q->limits.max_segment_size = UINT_MAX;
+ q->limits.max_segments = BIO_MAX_PAGES;
+ q->limits.max_discard_sectors = UINT_MAX;
+ q->limits.discard_granularity = 512;
+ q->limits.io_min = block_size;
+ q->limits.logical_block_size = block_size;
+ q->limits.physical_block_size = block_size;
+ set_bit(QUEUE_FLAG_NONROT, &d->disk->queue->queue_flags);
+ clear_bit(QUEUE_FLAG_ADD_RANDOM, &d->disk->queue->queue_flags);
+ set_bit(QUEUE_FLAG_DISCARD, &d->disk->queue->queue_flags);
+
+ blk_queue_flush(q, REQ_FLUSH|REQ_FUA);
+
+ return 0;
+}
+
+/* Cached device */
+
+static void calc_cached_dev_sectors(struct cache_set *c)
+{
+ uint64_t sectors = 0;
+ struct cached_dev *dc;
+
+ list_for_each_entry(dc, &c->cached_devs, list)
+ sectors += bdev_sectors(dc->bdev);
+
+ c->cached_dev_sectors = sectors;
+}
+
+void bch_cached_dev_run(struct cached_dev *dc)
+{
+ struct bcache_device *d = &dc->disk;
+ char buf[SB_LABEL_SIZE + 1];
+ char *env[] = {
+ "DRIVER=bcache",
+ kasprintf(GFP_KERNEL, "CACHED_UUID=%pU", dc->sb.uuid),
+ NULL,
+ NULL,
+ };
+
+ memcpy(buf, dc->sb.label, SB_LABEL_SIZE);
+ buf[SB_LABEL_SIZE] = '\0';
+ env[2] = kasprintf(GFP_KERNEL, "CACHED_LABEL=%s", buf);
+
+ if (atomic_xchg(&dc->running, 1))
+ return;
+
+ if (!d->c &&
+ BDEV_STATE(&dc->sb) != BDEV_STATE_NONE) {
+ struct closure cl;
+ closure_init_stack(&cl);
+
+ SET_BDEV_STATE(&dc->sb, BDEV_STATE_STALE);
+ bch_write_bdev_super(dc, &cl);
+ closure_sync(&cl);
+ }
+
+ add_disk(d->disk);
+ bd_link_disk_holder(dc->bdev, dc->disk.disk);
+ /* won't show up in the uevent file, use udevadm monitor -e instead
+ * only class / kset properties are persistent */
+ kobject_uevent_env(&disk_to_dev(d->disk)->kobj, KOBJ_CHANGE, env);
+ kfree(env[1]);
+ kfree(env[2]);
+
+ if (sysfs_create_link(&d->kobj, &disk_to_dev(d->disk)->kobj, "dev") ||
+ sysfs_create_link(&disk_to_dev(d->disk)->kobj, &d->kobj, "bcache"))
+ pr_debug("error creating sysfs link");
+}
+
+static void cached_dev_detach_finish(struct work_struct *w)
+{
+ struct cached_dev *dc = container_of(w, struct cached_dev, detach);
+ char buf[BDEVNAME_SIZE];
+ struct closure cl;
+ closure_init_stack(&cl);
+
+ BUG_ON(!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags));
+ BUG_ON(atomic_read(&dc->count));
+
+ mutex_lock(&bch_register_lock);
+
+ memset(&dc->sb.set_uuid, 0, 16);
+ SET_BDEV_STATE(&dc->sb, BDEV_STATE_NONE);
+
+ bch_write_bdev_super(dc, &cl);
+ closure_sync(&cl);
+
+ bcache_device_detach(&dc->disk);
+ list_move(&dc->list, &uncached_devices);
+
+ clear_bit(BCACHE_DEV_DETACHING, &dc->disk.flags);
+ clear_bit(BCACHE_DEV_UNLINK_DONE, &dc->disk.flags);
+
+ mutex_unlock(&bch_register_lock);
+
+ pr_info("Caching disabled for %s", bdevname(dc->bdev, buf));
+
+ /* Drop ref we took in cached_dev_detach() */
+ closure_put(&dc->disk.cl);
+}
+
+void bch_cached_dev_detach(struct cached_dev *dc)
+{
+ lockdep_assert_held(&bch_register_lock);
+
+ if (test_bit(BCACHE_DEV_CLOSING, &dc->disk.flags))
+ return;
+
+ if (test_and_set_bit(BCACHE_DEV_DETACHING, &dc->disk.flags))
+ return;
+
+ /*
+ * Block the device from being closed and freed until we're finished
+ * detaching
+ */
+ closure_get(&dc->disk.cl);
+
+ bch_writeback_queue(dc);
+ cached_dev_put(dc);
+}
+
+int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c)
+{
+ uint32_t rtime = cpu_to_le32(get_seconds());
+ struct uuid_entry *u;
+ char buf[BDEVNAME_SIZE];
+
+ bdevname(dc->bdev, buf);
+
+ if (memcmp(dc->sb.set_uuid, c->sb.set_uuid, 16))
+ return -ENOENT;
+
+ if (dc->disk.c) {
+ pr_err("Can't attach %s: already attached", buf);
+ return -EINVAL;
+ }
+
+ if (test_bit(CACHE_SET_STOPPING, &c->flags)) {
+ pr_err("Can't attach %s: shutting down", buf);
+ return -EINVAL;
+ }
+
+ if (dc->sb.block_size < c->sb.block_size) {
+ /* Will die */
+ pr_err("Couldn't attach %s: block size less than set's block size",
+ buf);
+ return -EINVAL;
+ }
+
+ u = uuid_find(c, dc->sb.uuid);
+
+ if (u &&
+ (BDEV_STATE(&dc->sb) == BDEV_STATE_STALE ||
+ BDEV_STATE(&dc->sb) == BDEV_STATE_NONE)) {
+ memcpy(u->uuid, invalid_uuid, 16);
+ u->invalidated = cpu_to_le32(get_seconds());
+ u = NULL;
+ }
+
+ if (!u) {
+ if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) {
+ pr_err("Couldn't find uuid for %s in set", buf);
+ return -ENOENT;
+ }
+
+ u = uuid_find_empty(c);
+ if (!u) {
+ pr_err("Not caching %s, no room for UUID", buf);
+ return -EINVAL;
+ }
+ }
+
+ /* Deadlocks since we're called via sysfs...
+ sysfs_remove_file(&dc->kobj, &sysfs_attach);
+ */
+
+ if (bch_is_zero(u->uuid, 16)) {
+ struct closure cl;
+ closure_init_stack(&cl);
+
+ memcpy(u->uuid, dc->sb.uuid, 16);
+ memcpy(u->label, dc->sb.label, SB_LABEL_SIZE);
+ u->first_reg = u->last_reg = rtime;
+ bch_uuid_write(c);
+
+ memcpy(dc->sb.set_uuid, c->sb.set_uuid, 16);
+ SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN);
+
+ bch_write_bdev_super(dc, &cl);
+ closure_sync(&cl);
+ } else {
+ u->last_reg = rtime;
+ bch_uuid_write(c);
+ }
+
+ bcache_device_attach(&dc->disk, c, u - c->uuids);
+ list_move(&dc->list, &c->cached_devs);
+ calc_cached_dev_sectors(c);
+
+ smp_wmb();
+ /*
+ * dc->c must be set before dc->count != 0 - paired with the mb in
+ * cached_dev_get()
+ */
+ atomic_set(&dc->count, 1);
+
+ if (bch_cached_dev_writeback_start(dc))
+ return -ENOMEM;
+
+ if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) {
+ bch_sectors_dirty_init(dc);
+ atomic_set(&dc->has_dirty, 1);
+ atomic_inc(&dc->count);
+ bch_writeback_queue(dc);
+ }
+
+ bch_cached_dev_run(dc);
+ bcache_device_link(&dc->disk, c, "bdev");
+
+ pr_info("Caching %s as %s on set %pU",
+ bdevname(dc->bdev, buf), dc->disk.disk->disk_name,
+ dc->disk.c->sb.set_uuid);
+ return 0;
+}
+
+void bch_cached_dev_release(struct kobject *kobj)
+{
+ struct cached_dev *dc = container_of(kobj, struct cached_dev,
+ disk.kobj);
+ kfree(dc);
+ module_put(THIS_MODULE);
+}
+
+static void cached_dev_free(struct closure *cl)
+{
+ struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl);
+
+ cancel_delayed_work_sync(&dc->writeback_rate_update);
+ if (!IS_ERR_OR_NULL(dc->writeback_thread))
+ kthread_stop(dc->writeback_thread);
+
+ mutex_lock(&bch_register_lock);
+
+ if (atomic_read(&dc->running))
+ bd_unlink_disk_holder(dc->bdev, dc->disk.disk);
+ bcache_device_free(&dc->disk);
+ list_del(&dc->list);
+
+ mutex_unlock(&bch_register_lock);
+
+ if (!IS_ERR_OR_NULL(dc->bdev))
+ blkdev_put(dc->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
+
+ wake_up(&unregister_wait);
+
+ kobject_put(&dc->disk.kobj);
+}
+
+static void cached_dev_flush(struct closure *cl)
+{
+ struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl);
+ struct bcache_device *d = &dc->disk;
+
+ mutex_lock(&bch_register_lock);
+ bcache_device_unlink(d);
+ mutex_unlock(&bch_register_lock);
+
+ bch_cache_accounting_destroy(&dc->accounting);
+ kobject_del(&d->kobj);
+
+ continue_at(cl, cached_dev_free, system_wq);
+}
+
+static int cached_dev_init(struct cached_dev *dc, unsigned block_size)
+{
+ int ret;
+ struct io *io;
+ struct request_queue *q = bdev_get_queue(dc->bdev);
+
+ __module_get(THIS_MODULE);
+ INIT_LIST_HEAD(&dc->list);
+ closure_init(&dc->disk.cl, NULL);
+ set_closure_fn(&dc->disk.cl, cached_dev_flush, system_wq);
+ kobject_init(&dc->disk.kobj, &bch_cached_dev_ktype);
+ INIT_WORK(&dc->detach, cached_dev_detach_finish);
+ sema_init(&dc->sb_write_mutex, 1);
+ INIT_LIST_HEAD(&dc->io_lru);
+ spin_lock_init(&dc->io_lock);
+ bch_cache_accounting_init(&dc->accounting, &dc->disk.cl);
+
+ dc->sequential_cutoff = 4 << 20;
+
+ for (io = dc->io; io < dc->io + RECENT_IO; io++) {
+ list_add(&io->lru, &dc->io_lru);
+ hlist_add_head(&io->hash, dc->io_hash + RECENT_IO);
+ }
+
+ dc->disk.stripe_size = q->limits.io_opt >> 9;
+
+ if (dc->disk.stripe_size)
+ dc->partial_stripes_expensive =
+ q->limits.raid_partial_stripes_expensive;
+
+ ret = bcache_device_init(&dc->disk, block_size,
+ dc->bdev->bd_part->nr_sects - dc->sb.data_offset);
+ if (ret)
+ return ret;
+
+ set_capacity(dc->disk.disk,
+ dc->bdev->bd_part->nr_sects - dc->sb.data_offset);
+
+ dc->disk.disk->queue->backing_dev_info.ra_pages =
+ max(dc->disk.disk->queue->backing_dev_info.ra_pages,
+ q->backing_dev_info.ra_pages);
+
+ bch_cached_dev_request_init(dc);
+ bch_cached_dev_writeback_init(dc);
+ return 0;
+}
+
+/* Cached device - bcache superblock */
+
+static void register_bdev(struct cache_sb *sb, struct page *sb_page,
+ struct block_device *bdev,
+ struct cached_dev *dc)
+{
+ char name[BDEVNAME_SIZE];
+ const char *err = "cannot allocate memory";
+ struct cache_set *c;
+
+ memcpy(&dc->sb, sb, sizeof(struct cache_sb));
+ dc->bdev = bdev;
+ dc->bdev->bd_holder = dc;
+
+ bio_init(&dc->sb_bio);
+ dc->sb_bio.bi_max_vecs = 1;
+ dc->sb_bio.bi_io_vec = dc->sb_bio.bi_inline_vecs;
+ dc->sb_bio.bi_io_vec[0].bv_page = sb_page;
+ get_page(sb_page);
+
+ if (cached_dev_init(dc, sb->block_size << 9))
+ goto err;
+
+ err = "error creating kobject";
+ if (kobject_add(&dc->disk.kobj, &part_to_dev(bdev->bd_part)->kobj,
+ "bcache"))
+ goto err;
+ if (bch_cache_accounting_add_kobjs(&dc->accounting, &dc->disk.kobj))
+ goto err;
+
+ pr_info("registered backing device %s", bdevname(bdev, name));
+
+ list_add(&dc->list, &uncached_devices);
+ list_for_each_entry(c, &bch_cache_sets, list)
+ bch_cached_dev_attach(dc, c);
+
+ if (BDEV_STATE(&dc->sb) == BDEV_STATE_NONE ||
+ BDEV_STATE(&dc->sb) == BDEV_STATE_STALE)
+ bch_cached_dev_run(dc);
+
+ return;
+err:
+ pr_notice("error opening %s: %s", bdevname(bdev, name), err);
+ bcache_device_stop(&dc->disk);
+}
+
+/* Flash only volumes */
+
+void bch_flash_dev_release(struct kobject *kobj)
+{
+ struct bcache_device *d = container_of(kobj, struct bcache_device,
+ kobj);
+ kfree(d);
+}
+
+static void flash_dev_free(struct closure *cl)
+{
+ struct bcache_device *d = container_of(cl, struct bcache_device, cl);
+ mutex_lock(&bch_register_lock);
+ bcache_device_free(d);
+ mutex_unlock(&bch_register_lock);
+ kobject_put(&d->kobj);
+}
+
+static void flash_dev_flush(struct closure *cl)
+{
+ struct bcache_device *d = container_of(cl, struct bcache_device, cl);
+
+ mutex_lock(&bch_register_lock);
+ bcache_device_unlink(d);
+ mutex_unlock(&bch_register_lock);
+ kobject_del(&d->kobj);
+ continue_at(cl, flash_dev_free, system_wq);
+}
+
+static int flash_dev_run(struct cache_set *c, struct uuid_entry *u)
+{
+ struct bcache_device *d = kzalloc(sizeof(struct bcache_device),
+ GFP_KERNEL);
+ if (!d)
+ return -ENOMEM;
+
+ closure_init(&d->cl, NULL);
+ set_closure_fn(&d->cl, flash_dev_flush, system_wq);
+
+ kobject_init(&d->kobj, &bch_flash_dev_ktype);
+
+ if (bcache_device_init(d, block_bytes(c), u->sectors))
+ goto err;
+
+ bcache_device_attach(d, c, u - c->uuids);
+ bch_flash_dev_request_init(d);
+ add_disk(d->disk);
+
+ if (kobject_add(&d->kobj, &disk_to_dev(d->disk)->kobj, "bcache"))
+ goto err;
+
+ bcache_device_link(d, c, "volume");
+
+ return 0;
+err:
+ kobject_put(&d->kobj);
+ return -ENOMEM;
+}
+
+static int flash_devs_run(struct cache_set *c)
+{
+ int ret = 0;
+ struct uuid_entry *u;
+
+ for (u = c->uuids;
+ u < c->uuids + c->nr_uuids && !ret;
+ u++)
+ if (UUID_FLASH_ONLY(u))
+ ret = flash_dev_run(c, u);
+
+ return ret;
+}
+
+int bch_flash_dev_create(struct cache_set *c, uint64_t size)
+{
+ struct uuid_entry *u;
+
+ if (test_bit(CACHE_SET_STOPPING, &c->flags))
+ return -EINTR;
+
+ if (!test_bit(CACHE_SET_RUNNING, &c->flags))
+ return -EPERM;
+
+ u = uuid_find_empty(c);
+ if (!u) {
+ pr_err("Can't create volume, no room for UUID");
+ return -EINVAL;
+ }
+
+ get_random_bytes(u->uuid, 16);
+ memset(u->label, 0, 32);
+ u->first_reg = u->last_reg = cpu_to_le32(get_seconds());
+
+ SET_UUID_FLASH_ONLY(u, 1);
+ u->sectors = size >> 9;
+
+ bch_uuid_write(c);
+
+ return flash_dev_run(c, u);
+}
+
+/* Cache set */
+
+__printf(2, 3)
+bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...)
+{
+ va_list args;
+
+ if (c->on_error != ON_ERROR_PANIC &&
+ test_bit(CACHE_SET_STOPPING, &c->flags))
+ return false;
+
+ /* XXX: we can be called from atomic context
+ acquire_console_sem();
+ */
+
+ printk(KERN_ERR "bcache: error on %pU: ", c->sb.set_uuid);
+
+ va_start(args, fmt);
+ vprintk(fmt, args);
+ va_end(args);
+
+ printk(", disabling caching\n");
+
+ if (c->on_error == ON_ERROR_PANIC)
+ panic("panic forced after error\n");
+
+ bch_cache_set_unregister(c);
+ return true;
+}
+
+void bch_cache_set_release(struct kobject *kobj)
+{
+ struct cache_set *c = container_of(kobj, struct cache_set, kobj);
+ kfree(c);
+ module_put(THIS_MODULE);
+}
+
+static void cache_set_free(struct closure *cl)
+{
+ struct cache_set *c = container_of(cl, struct cache_set, cl);
+ struct cache *ca;
+ unsigned i;
+
+ if (!IS_ERR_OR_NULL(c->debug))
+ debugfs_remove(c->debug);
+
+ bch_open_buckets_free(c);
+ bch_btree_cache_free(c);
+ bch_journal_free(c);
+
+ for_each_cache(ca, c, i)
+ if (ca) {
+ ca->set = NULL;
+ c->cache[ca->sb.nr_this_dev] = NULL;
+ kobject_put(&ca->kobj);
+ }
+
+ bch_bset_sort_state_free(&c->sort);
+ free_pages((unsigned long) c->uuids, ilog2(bucket_pages(c)));
+
+ if (c->moving_gc_wq)
+ destroy_workqueue(c->moving_gc_wq);
+ if (c->bio_split)
+ bioset_free(c->bio_split);
+ if (c->fill_iter)
+ mempool_destroy(c->fill_iter);
+ if (c->bio_meta)
+ mempool_destroy(c->bio_meta);
+ if (c->search)
+ mempool_destroy(c->search);
+ kfree(c->devices);
+
+ mutex_lock(&bch_register_lock);
+ list_del(&c->list);
+ mutex_unlock(&bch_register_lock);
+
+ pr_info("Cache set %pU unregistered", c->sb.set_uuid);
+ wake_up(&unregister_wait);
+
+ closure_debug_destroy(&c->cl);
+ kobject_put(&c->kobj);
+}
+
+static void cache_set_flush(struct closure *cl)
+{
+ struct cache_set *c = container_of(cl, struct cache_set, caching);
+ struct cache *ca;
+ struct btree *b;
+ unsigned i;
+
+ bch_cache_accounting_destroy(&c->accounting);
+
+ kobject_put(&c->internal);
+ kobject_del(&c->kobj);
+
+ if (c->gc_thread)
+ kthread_stop(c->gc_thread);
+
+ if (!IS_ERR_OR_NULL(c->root))
+ list_add(&c->root->list, &c->btree_cache);
+
+ /* Should skip this if we're unregistering because of an error */
+ list_for_each_entry(b, &c->btree_cache, list) {
+ mutex_lock(&b->write_lock);
+ if (btree_node_dirty(b))
+ __bch_btree_node_write(b, NULL);
+ mutex_unlock(&b->write_lock);
+ }
+
+ for_each_cache(ca, c, i)
+ if (ca->alloc_thread)
+ kthread_stop(ca->alloc_thread);
+
+ if (c->journal.cur) {
+ cancel_delayed_work_sync(&c->journal.work);
+ /* flush last journal entry if needed */
+ c->journal.work.work.func(&c->journal.work.work);
+ }
+
+ closure_return(cl);
+}
+
+static void __cache_set_unregister(struct closure *cl)
+{
+ struct cache_set *c = container_of(cl, struct cache_set, caching);
+ struct cached_dev *dc;
+ size_t i;
+
+ mutex_lock(&bch_register_lock);
+
+ for (i = 0; i < c->nr_uuids; i++)
+ if (c->devices[i]) {
+ if (!UUID_FLASH_ONLY(&c->uuids[i]) &&
+ test_bit(CACHE_SET_UNREGISTERING, &c->flags)) {
+ dc = container_of(c->devices[i],
+ struct cached_dev, disk);
+ bch_cached_dev_detach(dc);
+ } else {
+ bcache_device_stop(c->devices[i]);
+ }
+ }
+
+ mutex_unlock(&bch_register_lock);
+
+ continue_at(cl, cache_set_flush, system_wq);
+}
+
+void bch_cache_set_stop(struct cache_set *c)
+{
+ if (!test_and_set_bit(CACHE_SET_STOPPING, &c->flags))
+ closure_queue(&c->caching);
+}
+
+void bch_cache_set_unregister(struct cache_set *c)
+{
+ set_bit(CACHE_SET_UNREGISTERING, &c->flags);
+ bch_cache_set_stop(c);
+}
+
+#define alloc_bucket_pages(gfp, c) \
+ ((void *) __get_free_pages(__GFP_ZERO|gfp, ilog2(bucket_pages(c))))
+
+struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
+{
+ int iter_size;
+ struct cache_set *c = kzalloc(sizeof(struct cache_set), GFP_KERNEL);
+ if (!c)
+ return NULL;
+
+ __module_get(THIS_MODULE);
+ closure_init(&c->cl, NULL);
+ set_closure_fn(&c->cl, cache_set_free, system_wq);
+
+ closure_init(&c->caching, &c->cl);
+ set_closure_fn(&c->caching, __cache_set_unregister, system_wq);
+
+ /* Maybe create continue_at_noreturn() and use it here? */
+ closure_set_stopped(&c->cl);
+ closure_put(&c->cl);
+
+ kobject_init(&c->kobj, &bch_cache_set_ktype);
+ kobject_init(&c->internal, &bch_cache_set_internal_ktype);
+
+ bch_cache_accounting_init(&c->accounting, &c->cl);
+
+ memcpy(c->sb.set_uuid, sb->set_uuid, 16);
+ c->sb.block_size = sb->block_size;
+ c->sb.bucket_size = sb->bucket_size;
+ c->sb.nr_in_set = sb->nr_in_set;
+ c->sb.last_mount = sb->last_mount;
+ c->bucket_bits = ilog2(sb->bucket_size);
+ c->block_bits = ilog2(sb->block_size);
+ c->nr_uuids = bucket_bytes(c) / sizeof(struct uuid_entry);
+
+ c->btree_pages = bucket_pages(c);
+ if (c->btree_pages > BTREE_MAX_PAGES)
+ c->btree_pages = max_t(int, c->btree_pages / 4,
+ BTREE_MAX_PAGES);
+
+ sema_init(&c->sb_write_mutex, 1);
+ mutex_init(&c->bucket_lock);
+ init_waitqueue_head(&c->btree_cache_wait);
+ init_waitqueue_head(&c->bucket_wait);
+ sema_init(&c->uuid_write_mutex, 1);
+
+ spin_lock_init(&c->btree_gc_time.lock);
+ spin_lock_init(&c->btree_split_time.lock);
+ spin_lock_init(&c->btree_read_time.lock);
+
+ bch_moving_init_cache_set(c);
+
+ INIT_LIST_HEAD(&c->list);
+ INIT_LIST_HEAD(&c->cached_devs);
+ INIT_LIST_HEAD(&c->btree_cache);
+ INIT_LIST_HEAD(&c->btree_cache_freeable);
+ INIT_LIST_HEAD(&c->btree_cache_freed);
+ INIT_LIST_HEAD(&c->data_buckets);
+
+ c->search = mempool_create_slab_pool(32, bch_search_cache);
+ if (!c->search)
+ goto err;
+
+ iter_size = (sb->bucket_size / sb->block_size + 1) *
+ sizeof(struct btree_iter_set);
+
+ if (!(c->devices = kzalloc(c->nr_uuids * sizeof(void *), GFP_KERNEL)) ||
+ !(c->bio_meta = mempool_create_kmalloc_pool(2,
+ sizeof(struct bbio) + sizeof(struct bio_vec) *
+ bucket_pages(c))) ||
+ !(c->fill_iter = mempool_create_kmalloc_pool(1, iter_size)) ||
+ !(c->bio_split = bioset_create(4, offsetof(struct bbio, bio))) ||
+ !(c->uuids = alloc_bucket_pages(GFP_KERNEL, c)) ||
+ !(c->moving_gc_wq = create_workqueue("bcache_gc")) ||
+ bch_journal_alloc(c) ||
+ bch_btree_cache_alloc(c) ||
+ bch_open_buckets_alloc(c) ||
+ bch_bset_sort_state_init(&c->sort, ilog2(c->btree_pages)))
+ goto err;
+
+ c->congested_read_threshold_us = 2000;
+ c->congested_write_threshold_us = 20000;
+ c->error_limit = 8 << IO_ERROR_SHIFT;
+
+ return c;
+err:
+ bch_cache_set_unregister(c);
+ return NULL;
+}
+
+static void run_cache_set(struct cache_set *c)
+{
+ const char *err = "cannot allocate memory";
+ struct cached_dev *dc, *t;
+ struct cache *ca;
+ struct closure cl;
+ unsigned i;
+
+ closure_init_stack(&cl);
+
+ for_each_cache(ca, c, i)
+ c->nbuckets += ca->sb.nbuckets;
+
+ if (CACHE_SYNC(&c->sb)) {
+ LIST_HEAD(journal);
+ struct bkey *k;
+ struct jset *j;
+
+ err = "cannot allocate memory for journal";
+ if (bch_journal_read(c, &journal))
+ goto err;
+
+ pr_debug("btree_journal_read() done");
+
+ err = "no journal entries found";
+ if (list_empty(&journal))
+ goto err;
+
+ j = &list_entry(journal.prev, struct journal_replay, list)->j;
+
+ err = "IO error reading priorities";
+ for_each_cache(ca, c, i)
+ prio_read(ca, j->prio_bucket[ca->sb.nr_this_dev]);
+
+ /*
+ * If prio_read() fails it'll call cache_set_error and we'll
+ * tear everything down right away, but if we perhaps checked
+ * sooner we could avoid journal replay.
+ */
+
+ k = &j->btree_root;
+
+ err = "bad btree root";
+ if (__bch_btree_ptr_invalid(c, k))
+ goto err;
+
+ err = "error reading btree root";
+ c->root = bch_btree_node_get(c, NULL, k, j->btree_level, true, NULL);
+ if (IS_ERR_OR_NULL(c->root))
+ goto err;
+
+ list_del_init(&c->root->list);
+ rw_unlock(true, c->root);
+
+ err = uuid_read(c, j, &cl);
+ if (err)
+ goto err;
+
+ err = "error in recovery";
+ if (bch_btree_check(c))
+ goto err;
+
+ bch_journal_mark(c, &journal);
+ bch_initial_gc_finish(c);
+ pr_debug("btree_check() done");
+
+ /*
+ * bcache_journal_next() can't happen sooner, or
+ * btree_gc_finish() will give spurious errors about last_gc >
+ * gc_gen - this is a hack but oh well.
+ */
+ bch_journal_next(&c->journal);
+
+ err = "error starting allocator thread";
+ for_each_cache(ca, c, i)
+ if (bch_cache_allocator_start(ca))
+ goto err;
+
+ /*
+ * First place it's safe to allocate: btree_check() and
+ * btree_gc_finish() have to run before we have buckets to
+ * allocate, and bch_bucket_alloc_set() might cause a journal
+ * entry to be written so bcache_journal_next() has to be called
+ * first.
+ *
+ * If the uuids were in the old format we have to rewrite them
+ * before the next journal entry is written:
+ */
+ if (j->version < BCACHE_JSET_VERSION_UUID)
+ __uuid_write(c);
+
+ bch_journal_replay(c, &journal);
+ } else {
+ pr_notice("invalidating existing data");
+
+ for_each_cache(ca, c, i) {
+ unsigned j;
+
+ ca->sb.keys = clamp_t(int, ca->sb.nbuckets >> 7,
+ 2, SB_JOURNAL_BUCKETS);
+
+ for (j = 0; j < ca->sb.keys; j++)
+ ca->sb.d[j] = ca->sb.first_bucket + j;
+ }
+
+ bch_initial_gc_finish(c);
+
+ err = "error starting allocator thread";
+ for_each_cache(ca, c, i)
+ if (bch_cache_allocator_start(ca))
+ goto err;
+
+ mutex_lock(&c->bucket_lock);
+ for_each_cache(ca, c, i)
+ bch_prio_write(ca);
+ mutex_unlock(&c->bucket_lock);
+
+ err = "cannot allocate new UUID bucket";
+ if (__uuid_write(c))
+ goto err;
+
+ err = "cannot allocate new btree root";
+ c->root = __bch_btree_node_alloc(c, NULL, 0, true, NULL);
+ if (IS_ERR_OR_NULL(c->root))
+ goto err;
+
+ mutex_lock(&c->root->write_lock);
+ bkey_copy_key(&c->root->key, &MAX_KEY);
+ bch_btree_node_write(c->root, &cl);
+ mutex_unlock(&c->root->write_lock);
+
+ bch_btree_set_root(c->root);
+ rw_unlock(true, c->root);
+
+ /*
+ * We don't want to write the first journal entry until
+ * everything is set up - fortunately journal entries won't be
+ * written until the SET_CACHE_SYNC() here:
+ */
+ SET_CACHE_SYNC(&c->sb, true);
+
+ bch_journal_next(&c->journal);
+ bch_journal_meta(c, &cl);
+ }
+
+ err = "error starting gc thread";
+ if (bch_gc_thread_start(c))
+ goto err;
+
+ closure_sync(&cl);
+ c->sb.last_mount = get_seconds();
+ bcache_write_super(c);
+
+ list_for_each_entry_safe(dc, t, &uncached_devices, list)
+ bch_cached_dev_attach(dc, c);
+
+ flash_devs_run(c);
+
+ set_bit(CACHE_SET_RUNNING, &c->flags);
+ return;
+err:
+ closure_sync(&cl);
+ /* XXX: test this, it's broken */
+ bch_cache_set_error(c, "%s", err);
+}
+
+static bool can_attach_cache(struct cache *ca, struct cache_set *c)
+{
+ return ca->sb.block_size == c->sb.block_size &&
+ ca->sb.bucket_size == c->sb.bucket_size &&
+ ca->sb.nr_in_set == c->sb.nr_in_set;
+}
+
+static const char *register_cache_set(struct cache *ca)
+{
+ char buf[12];
+ const char *err = "cannot allocate memory";
+ struct cache_set *c;
+
+ list_for_each_entry(c, &bch_cache_sets, list)
+ if (!memcmp(c->sb.set_uuid, ca->sb.set_uuid, 16)) {
+ if (c->cache[ca->sb.nr_this_dev])
+ return "duplicate cache set member";
+
+ if (!can_attach_cache(ca, c))
+ return "cache sb does not match set";
+
+ if (!CACHE_SYNC(&ca->sb))
+ SET_CACHE_SYNC(&c->sb, false);
+
+ goto found;
+ }
+
+ c = bch_cache_set_alloc(&ca->sb);
+ if (!c)
+ return err;
+
+ err = "error creating kobject";
+ if (kobject_add(&c->kobj, bcache_kobj, "%pU", c->sb.set_uuid) ||
+ kobject_add(&c->internal, &c->kobj, "internal"))
+ goto err;
+
+ if (bch_cache_accounting_add_kobjs(&c->accounting, &c->kobj))
+ goto err;
+
+ bch_debug_init_cache_set(c);
+
+ list_add(&c->list, &bch_cache_sets);
+found:
+ sprintf(buf, "cache%i", ca->sb.nr_this_dev);
+ if (sysfs_create_link(&ca->kobj, &c->kobj, "set") ||
+ sysfs_create_link(&c->kobj, &ca->kobj, buf))
+ goto err;
+
+ if (ca->sb.seq > c->sb.seq) {
+ c->sb.version = ca->sb.version;
+ memcpy(c->sb.set_uuid, ca->sb.set_uuid, 16);
+ c->sb.flags = ca->sb.flags;
+ c->sb.seq = ca->sb.seq;
+ pr_debug("set version = %llu", c->sb.version);
+ }
+
+ kobject_get(&ca->kobj);
+ ca->set = c;
+ ca->set->cache[ca->sb.nr_this_dev] = ca;
+ c->cache_by_alloc[c->caches_loaded++] = ca;
+
+ if (c->caches_loaded == c->sb.nr_in_set)
+ run_cache_set(c);
+
+ return NULL;
+err:
+ bch_cache_set_unregister(c);
+ return err;
+}
+
+/* Cache device */
+
+void bch_cache_release(struct kobject *kobj)
+{
+ struct cache *ca = container_of(kobj, struct cache, kobj);
+ unsigned i;
+
+ if (ca->set) {
+ BUG_ON(ca->set->cache[ca->sb.nr_this_dev] != ca);
+ ca->set->cache[ca->sb.nr_this_dev] = NULL;
+ }
+
+ bio_split_pool_free(&ca->bio_split_hook);
+
+ free_pages((unsigned long) ca->disk_buckets, ilog2(bucket_pages(ca)));
+ kfree(ca->prio_buckets);
+ vfree(ca->buckets);
+
+ free_heap(&ca->heap);
+ free_fifo(&ca->free_inc);
+
+ for (i = 0; i < RESERVE_NR; i++)
+ free_fifo(&ca->free[i]);
+
+ if (ca->sb_bio.bi_inline_vecs[0].bv_page)
+ put_page(ca->sb_bio.bi_io_vec[0].bv_page);
+
+ if (!IS_ERR_OR_NULL(ca->bdev))
+ blkdev_put(ca->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
+
+ kfree(ca);
+ module_put(THIS_MODULE);
+}
+
+static int cache_alloc(struct cache_sb *sb, struct cache *ca)
+{
+ size_t free;
+ struct bucket *b;
+
+ __module_get(THIS_MODULE);
+ kobject_init(&ca->kobj, &bch_cache_ktype);
+
+ bio_init(&ca->journal.bio);
+ ca->journal.bio.bi_max_vecs = 8;
+ ca->journal.bio.bi_io_vec = ca->journal.bio.bi_inline_vecs;
+
+ free = roundup_pow_of_two(ca->sb.nbuckets) >> 10;
+
+ if (!init_fifo(&ca->free[RESERVE_BTREE], 8, GFP_KERNEL) ||
+ !init_fifo(&ca->free[RESERVE_PRIO], prio_buckets(ca), GFP_KERNEL) ||
+ !init_fifo(&ca->free[RESERVE_MOVINGGC], free, GFP_KERNEL) ||
+ !init_fifo(&ca->free[RESERVE_NONE], free, GFP_KERNEL) ||
+ !init_fifo(&ca->free_inc, free << 2, GFP_KERNEL) ||
+ !init_heap(&ca->heap, free << 3, GFP_KERNEL) ||
+ !(ca->buckets = vzalloc(sizeof(struct bucket) *
+ ca->sb.nbuckets)) ||
+ !(ca->prio_buckets = kzalloc(sizeof(uint64_t) * prio_buckets(ca) *
+ 2, GFP_KERNEL)) ||
+ !(ca->disk_buckets = alloc_bucket_pages(GFP_KERNEL, ca)) ||
+ bio_split_pool_init(&ca->bio_split_hook))
+ return -ENOMEM;
+
+ ca->prio_last_buckets = ca->prio_buckets + prio_buckets(ca);
+
+ for_each_bucket(b, ca)
+ atomic_set(&b->pin, 0);
+
+ return 0;
+}
+
+static void register_cache(struct cache_sb *sb, struct page *sb_page,
+ struct block_device *bdev, struct cache *ca)
+{
+ char name[BDEVNAME_SIZE];
+ const char *err = "cannot allocate memory";
+
+ memcpy(&ca->sb, sb, sizeof(struct cache_sb));
+ ca->bdev = bdev;
+ ca->bdev->bd_holder = ca;
+
+ bio_init(&ca->sb_bio);
+ ca->sb_bio.bi_max_vecs = 1;
+ ca->sb_bio.bi_io_vec = ca->sb_bio.bi_inline_vecs;
+ ca->sb_bio.bi_io_vec[0].bv_page = sb_page;
+ get_page(sb_page);
+
+ if (blk_queue_discard(bdev_get_queue(ca->bdev)))
+ ca->discard = CACHE_DISCARD(&ca->sb);
+
+ if (cache_alloc(sb, ca) != 0)
+ goto err;
+
+ err = "error creating kobject";
+ if (kobject_add(&ca->kobj, &part_to_dev(bdev->bd_part)->kobj, "bcache"))
+ goto err;
+
+ mutex_lock(&bch_register_lock);
+ err = register_cache_set(ca);
+ mutex_unlock(&bch_register_lock);
+
+ if (err)
+ goto err;
+
+ pr_info("registered cache device %s", bdevname(bdev, name));
+out:
+ kobject_put(&ca->kobj);
+ return;
+err:
+ pr_notice("error opening %s: %s", bdevname(bdev, name), err);
+ goto out;
+}
+
+/* Global interfaces/init */
+
+static ssize_t register_bcache(struct kobject *, struct kobj_attribute *,
+ const char *, size_t);
+
+kobj_attribute_write(register, register_bcache);
+kobj_attribute_write(register_quiet, register_bcache);
+
+static bool bch_is_open_backing(struct block_device *bdev) {
+ struct cache_set *c, *tc;
+ struct cached_dev *dc, *t;
+
+ list_for_each_entry_safe(c, tc, &bch_cache_sets, list)
+ list_for_each_entry_safe(dc, t, &c->cached_devs, list)
+ if (dc->bdev == bdev)
+ return true;
+ list_for_each_entry_safe(dc, t, &uncached_devices, list)
+ if (dc->bdev == bdev)
+ return true;
+ return false;
+}
+
+static bool bch_is_open_cache(struct block_device *bdev) {
+ struct cache_set *c, *tc;
+ struct cache *ca;
+ unsigned i;
+
+ list_for_each_entry_safe(c, tc, &bch_cache_sets, list)
+ for_each_cache(ca, c, i)
+ if (ca->bdev == bdev)
+ return true;
+ return false;
+}
+
+static bool bch_is_open(struct block_device *bdev) {
+ return bch_is_open_cache(bdev) || bch_is_open_backing(bdev);
+}
+
+static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
+ const char *buffer, size_t size)
+{
+ ssize_t ret = size;
+ const char *err = "cannot allocate memory";
+ char *path = NULL;
+ struct cache_sb *sb = NULL;
+ struct block_device *bdev = NULL;
+ struct page *sb_page = NULL;
+
+ if (!try_module_get(THIS_MODULE))
+ return -EBUSY;
+
+ if (!(path = kstrndup(buffer, size, GFP_KERNEL)) ||
+ !(sb = kmalloc(sizeof(struct cache_sb), GFP_KERNEL)))
+ goto err;
+
+ err = "failed to open device";
+ bdev = blkdev_get_by_path(strim(path),
+ FMODE_READ|FMODE_WRITE|FMODE_EXCL,
+ sb);
+ if (IS_ERR(bdev)) {
+ if (bdev == ERR_PTR(-EBUSY)) {
+ bdev = lookup_bdev(strim(path));
+ mutex_lock(&bch_register_lock);
+ if (!IS_ERR(bdev) && bch_is_open(bdev))
+ err = "device already registered";
+ else
+ err = "device busy";
+ mutex_unlock(&bch_register_lock);
+ }
+ goto err;
+ }
+
+ err = "failed to set blocksize";
+ if (set_blocksize(bdev, 4096))
+ goto err_close;
+
+ err = read_super(sb, bdev, &sb_page);
+ if (err)
+ goto err_close;
+
+ if (SB_IS_BDEV(sb)) {
+ struct cached_dev *dc = kzalloc(sizeof(*dc), GFP_KERNEL);
+ if (!dc)
+ goto err_close;
+
+ mutex_lock(&bch_register_lock);
+ register_bdev(sb, sb_page, bdev, dc);
+ mutex_unlock(&bch_register_lock);
+ } else {
+ struct cache *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
+ if (!ca)
+ goto err_close;
+
+ register_cache(sb, sb_page, bdev, ca);
+ }
+out:
+ if (sb_page)
+ put_page(sb_page);
+ kfree(sb);
+ kfree(path);
+ module_put(THIS_MODULE);
+ return ret;
+
+err_close:
+ blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
+err:
+ if (attr != &ksysfs_register_quiet)
+ pr_info("error opening %s: %s", path, err);
+ ret = -EINVAL;
+ goto out;
+}
+
+static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x)
+{
+ if (code == SYS_DOWN ||
+ code == SYS_HALT ||
+ code == SYS_POWER_OFF) {
+ DEFINE_WAIT(wait);
+ unsigned long start = jiffies;
+ bool stopped = false;
+
+ struct cache_set *c, *tc;
+ struct cached_dev *dc, *tdc;
+
+ mutex_lock(&bch_register_lock);
+
+ if (list_empty(&bch_cache_sets) &&
+ list_empty(&uncached_devices))
+ goto out;
+
+ pr_info("Stopping all devices:");
+
+ list_for_each_entry_safe(c, tc, &bch_cache_sets, list)
+ bch_cache_set_stop(c);
+
+ list_for_each_entry_safe(dc, tdc, &uncached_devices, list)
+ bcache_device_stop(&dc->disk);
+
+ /* What's a condition variable? */
+ while (1) {
+ long timeout = start + 2 * HZ - jiffies;
+
+ stopped = list_empty(&bch_cache_sets) &&
+ list_empty(&uncached_devices);
+
+ if (timeout < 0 || stopped)
+ break;
+
+ prepare_to_wait(&unregister_wait, &wait,
+ TASK_UNINTERRUPTIBLE);
+
+ mutex_unlock(&bch_register_lock);
+ schedule_timeout(timeout);
+ mutex_lock(&bch_register_lock);
+ }
+
+ finish_wait(&unregister_wait, &wait);
+
+ if (stopped)
+ pr_info("All devices stopped");
+ else
+ pr_notice("Timeout waiting for devices to be closed");
+out:
+ mutex_unlock(&bch_register_lock);
+ }
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block reboot = {
+ .notifier_call = bcache_reboot,
+ .priority = INT_MAX, /* before any real devices */
+};
+
+static void bcache_exit(void)
+{
+ bch_debug_exit();
+ bch_request_exit();
+ if (bcache_kobj)
+ kobject_put(bcache_kobj);
+ if (bcache_wq)
+ destroy_workqueue(bcache_wq);
+ if (bcache_major)
+ unregister_blkdev(bcache_major, "bcache");
+ unregister_reboot_notifier(&reboot);
+}
+
+static int __init bcache_init(void)
+{
+ static const struct attribute *files[] = {
+ &ksysfs_register.attr,
+ &ksysfs_register_quiet.attr,
+ NULL
+ };
+
+ mutex_init(&bch_register_lock);
+ init_waitqueue_head(&unregister_wait);
+ register_reboot_notifier(&reboot);
+ closure_debug_init();
+
+ bcache_major = register_blkdev(0, "bcache");
+ if (bcache_major < 0)
+ return bcache_major;
+
+ if (!(bcache_wq = create_workqueue("bcache")) ||
+ !(bcache_kobj = kobject_create_and_add("bcache", fs_kobj)) ||
+ sysfs_create_files(bcache_kobj, files) ||
+ bch_request_init() ||
+ bch_debug_init(bcache_kobj))
+ goto err;
+
+ return 0;
+err:
+ bcache_exit();
+ return -ENOMEM;
+}
+
+module_exit(bcache_exit);
+module_init(bcache_init);
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
new file mode 100644
index 000000000..b3ff57d61
--- /dev/null
+++ b/drivers/md/bcache/sysfs.c
@@ -0,0 +1,898 @@
+/*
+ * bcache sysfs interfaces
+ *
+ * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
+ * Copyright 2012 Google, Inc.
+ */
+
+#include "bcache.h"
+#include "sysfs.h"
+#include "btree.h"
+#include "request.h"
+#include "writeback.h"
+
+#include <linux/blkdev.h>
+#include <linux/sort.h>
+
+static const char * const cache_replacement_policies[] = {
+ "lru",
+ "fifo",
+ "random",
+ NULL
+};
+
+static const char * const error_actions[] = {
+ "unregister",
+ "panic",
+ NULL
+};
+
+write_attribute(attach);
+write_attribute(detach);
+write_attribute(unregister);
+write_attribute(stop);
+write_attribute(clear_stats);
+write_attribute(trigger_gc);
+write_attribute(prune_cache);
+write_attribute(flash_vol_create);
+
+read_attribute(bucket_size);
+read_attribute(block_size);
+read_attribute(nbuckets);
+read_attribute(tree_depth);
+read_attribute(root_usage_percent);
+read_attribute(priority_stats);
+read_attribute(btree_cache_size);
+read_attribute(btree_cache_max_chain);
+read_attribute(cache_available_percent);
+read_attribute(written);
+read_attribute(btree_written);
+read_attribute(metadata_written);
+read_attribute(active_journal_entries);
+
+sysfs_time_stats_attribute(btree_gc, sec, ms);
+sysfs_time_stats_attribute(btree_split, sec, us);
+sysfs_time_stats_attribute(btree_sort, ms, us);
+sysfs_time_stats_attribute(btree_read, ms, us);
+
+read_attribute(btree_nodes);
+read_attribute(btree_used_percent);
+read_attribute(average_key_size);
+read_attribute(dirty_data);
+read_attribute(bset_tree_stats);
+
+read_attribute(state);
+read_attribute(cache_read_races);
+read_attribute(writeback_keys_done);
+read_attribute(writeback_keys_failed);
+read_attribute(io_errors);
+read_attribute(congested);
+rw_attribute(congested_read_threshold_us);
+rw_attribute(congested_write_threshold_us);
+
+rw_attribute(sequential_cutoff);
+rw_attribute(data_csum);
+rw_attribute(cache_mode);
+rw_attribute(writeback_metadata);
+rw_attribute(writeback_running);
+rw_attribute(writeback_percent);
+rw_attribute(writeback_delay);
+rw_attribute(writeback_rate);
+
+rw_attribute(writeback_rate_update_seconds);
+rw_attribute(writeback_rate_d_term);
+rw_attribute(writeback_rate_p_term_inverse);
+read_attribute(writeback_rate_debug);
+
+read_attribute(stripe_size);
+read_attribute(partial_stripes_expensive);
+
+rw_attribute(synchronous);
+rw_attribute(journal_delay_ms);
+rw_attribute(discard);
+rw_attribute(running);
+rw_attribute(label);
+rw_attribute(readahead);
+rw_attribute(errors);
+rw_attribute(io_error_limit);
+rw_attribute(io_error_halflife);
+rw_attribute(verify);
+rw_attribute(bypass_torture_test);
+rw_attribute(key_merging_disabled);
+rw_attribute(gc_always_rewrite);
+rw_attribute(expensive_debug_checks);
+rw_attribute(cache_replacement_policy);
+rw_attribute(btree_shrinker_disabled);
+rw_attribute(copy_gc_enabled);
+rw_attribute(size);
+
+SHOW(__bch_cached_dev)
+{
+ struct cached_dev *dc = container_of(kobj, struct cached_dev,
+ disk.kobj);
+ const char *states[] = { "no cache", "clean", "dirty", "inconsistent" };
+
+#define var(stat) (dc->stat)
+
+ if (attr == &sysfs_cache_mode)
+ return bch_snprint_string_list(buf, PAGE_SIZE,
+ bch_cache_modes + 1,
+ BDEV_CACHE_MODE(&dc->sb));
+
+ sysfs_printf(data_csum, "%i", dc->disk.data_csum);
+ var_printf(verify, "%i");
+ var_printf(bypass_torture_test, "%i");
+ var_printf(writeback_metadata, "%i");
+ var_printf(writeback_running, "%i");
+ var_print(writeback_delay);
+ var_print(writeback_percent);
+ sysfs_hprint(writeback_rate, dc->writeback_rate.rate << 9);
+
+ var_print(writeback_rate_update_seconds);
+ var_print(writeback_rate_d_term);
+ var_print(writeback_rate_p_term_inverse);
+
+ if (attr == &sysfs_writeback_rate_debug) {
+ char rate[20];
+ char dirty[20];
+ char target[20];
+ char proportional[20];
+ char derivative[20];
+ char change[20];
+ s64 next_io;
+
+ bch_hprint(rate, dc->writeback_rate.rate << 9);
+ bch_hprint(dirty, bcache_dev_sectors_dirty(&dc->disk) << 9);
+ bch_hprint(target, dc->writeback_rate_target << 9);
+ bch_hprint(proportional,dc->writeback_rate_proportional << 9);
+ bch_hprint(derivative, dc->writeback_rate_derivative << 9);
+ bch_hprint(change, dc->writeback_rate_change << 9);
+
+ next_io = div64_s64(dc->writeback_rate.next - local_clock(),
+ NSEC_PER_MSEC);
+
+ return sprintf(buf,
+ "rate:\t\t%s/sec\n"
+ "dirty:\t\t%s\n"
+ "target:\t\t%s\n"
+ "proportional:\t%s\n"
+ "derivative:\t%s\n"
+ "change:\t\t%s/sec\n"
+ "next io:\t%llims\n",
+ rate, dirty, target, proportional,
+ derivative, change, next_io);
+ }
+
+ sysfs_hprint(dirty_data,
+ bcache_dev_sectors_dirty(&dc->disk) << 9);
+
+ sysfs_hprint(stripe_size, dc->disk.stripe_size << 9);
+ var_printf(partial_stripes_expensive, "%u");
+
+ var_hprint(sequential_cutoff);
+ var_hprint(readahead);
+
+ sysfs_print(running, atomic_read(&dc->running));
+ sysfs_print(state, states[BDEV_STATE(&dc->sb)]);
+
+ if (attr == &sysfs_label) {
+ memcpy(buf, dc->sb.label, SB_LABEL_SIZE);
+ buf[SB_LABEL_SIZE + 1] = '\0';
+ strcat(buf, "\n");
+ return strlen(buf);
+ }
+
+#undef var
+ return 0;
+}
+SHOW_LOCKED(bch_cached_dev)
+
+STORE(__cached_dev)
+{
+ struct cached_dev *dc = container_of(kobj, struct cached_dev,
+ disk.kobj);
+ unsigned v = size;
+ struct cache_set *c;
+ struct kobj_uevent_env *env;
+
+#define d_strtoul(var) sysfs_strtoul(var, dc->var)
+#define d_strtoul_nonzero(var) sysfs_strtoul_clamp(var, dc->var, 1, INT_MAX)
+#define d_strtoi_h(var) sysfs_hatoi(var, dc->var)
+
+ sysfs_strtoul(data_csum, dc->disk.data_csum);
+ d_strtoul(verify);
+ d_strtoul(bypass_torture_test);
+ d_strtoul(writeback_metadata);
+ d_strtoul(writeback_running);
+ d_strtoul(writeback_delay);
+
+ sysfs_strtoul_clamp(writeback_percent, dc->writeback_percent, 0, 40);
+
+ sysfs_strtoul_clamp(writeback_rate,
+ dc->writeback_rate.rate, 1, INT_MAX);
+
+ d_strtoul_nonzero(writeback_rate_update_seconds);
+ d_strtoul(writeback_rate_d_term);
+ d_strtoul_nonzero(writeback_rate_p_term_inverse);
+
+ d_strtoi_h(sequential_cutoff);
+ d_strtoi_h(readahead);
+
+ if (attr == &sysfs_clear_stats)
+ bch_cache_accounting_clear(&dc->accounting);
+
+ if (attr == &sysfs_running &&
+ strtoul_or_return(buf))
+ bch_cached_dev_run(dc);
+
+ if (attr == &sysfs_cache_mode) {
+ ssize_t v = bch_read_string_list(buf, bch_cache_modes + 1);
+
+ if (v < 0)
+ return v;
+
+ if ((unsigned) v != BDEV_CACHE_MODE(&dc->sb)) {
+ SET_BDEV_CACHE_MODE(&dc->sb, v);
+ bch_write_bdev_super(dc, NULL);
+ }
+ }
+
+ if (attr == &sysfs_label) {
+ if (size > SB_LABEL_SIZE)
+ return -EINVAL;
+ memcpy(dc->sb.label, buf, size);
+ if (size < SB_LABEL_SIZE)
+ dc->sb.label[size] = '\0';
+ if (size && dc->sb.label[size - 1] == '\n')
+ dc->sb.label[size - 1] = '\0';
+ bch_write_bdev_super(dc, NULL);
+ if (dc->disk.c) {
+ memcpy(dc->disk.c->uuids[dc->disk.id].label,
+ buf, SB_LABEL_SIZE);
+ bch_uuid_write(dc->disk.c);
+ }
+ env = kzalloc(sizeof(struct kobj_uevent_env), GFP_KERNEL);
+ if (!env)
+ return -ENOMEM;
+ add_uevent_var(env, "DRIVER=bcache");
+ add_uevent_var(env, "CACHED_UUID=%pU", dc->sb.uuid),
+ add_uevent_var(env, "CACHED_LABEL=%s", buf);
+ kobject_uevent_env(
+ &disk_to_dev(dc->disk.disk)->kobj, KOBJ_CHANGE, env->envp);
+ kfree(env);
+ }
+
+ if (attr == &sysfs_attach) {
+ if (bch_parse_uuid(buf, dc->sb.set_uuid) < 16)
+ return -EINVAL;
+
+ list_for_each_entry(c, &bch_cache_sets, list) {
+ v = bch_cached_dev_attach(dc, c);
+ if (!v)
+ return size;
+ }
+
+ pr_err("Can't attach %s: cache set not found", buf);
+ size = v;
+ }
+
+ if (attr == &sysfs_detach && dc->disk.c)
+ bch_cached_dev_detach(dc);
+
+ if (attr == &sysfs_stop)
+ bcache_device_stop(&dc->disk);
+
+ return size;
+}
+
+STORE(bch_cached_dev)
+{
+ struct cached_dev *dc = container_of(kobj, struct cached_dev,
+ disk.kobj);
+
+ mutex_lock(&bch_register_lock);
+ size = __cached_dev_store(kobj, attr, buf, size);
+
+ if (attr == &sysfs_writeback_running)
+ bch_writeback_queue(dc);
+
+ if (attr == &sysfs_writeback_percent)
+ schedule_delayed_work(&dc->writeback_rate_update,
+ dc->writeback_rate_update_seconds * HZ);
+
+ mutex_unlock(&bch_register_lock);
+ return size;
+}
+
+static struct attribute *bch_cached_dev_files[] = {
+ &sysfs_attach,
+ &sysfs_detach,
+ &sysfs_stop,
+#if 0
+ &sysfs_data_csum,
+#endif
+ &sysfs_cache_mode,
+ &sysfs_writeback_metadata,
+ &sysfs_writeback_running,
+ &sysfs_writeback_delay,
+ &sysfs_writeback_percent,
+ &sysfs_writeback_rate,
+ &sysfs_writeback_rate_update_seconds,
+ &sysfs_writeback_rate_d_term,
+ &sysfs_writeback_rate_p_term_inverse,
+ &sysfs_writeback_rate_debug,
+ &sysfs_dirty_data,
+ &sysfs_stripe_size,
+ &sysfs_partial_stripes_expensive,
+ &sysfs_sequential_cutoff,
+ &sysfs_clear_stats,
+ &sysfs_running,
+ &sysfs_state,
+ &sysfs_label,
+ &sysfs_readahead,
+#ifdef CONFIG_BCACHE_DEBUG
+ &sysfs_verify,
+ &sysfs_bypass_torture_test,
+#endif
+ NULL
+};
+KTYPE(bch_cached_dev);
+
+SHOW(bch_flash_dev)
+{
+ struct bcache_device *d = container_of(kobj, struct bcache_device,
+ kobj);
+ struct uuid_entry *u = &d->c->uuids[d->id];
+
+ sysfs_printf(data_csum, "%i", d->data_csum);
+ sysfs_hprint(size, u->sectors << 9);
+
+ if (attr == &sysfs_label) {
+ memcpy(buf, u->label, SB_LABEL_SIZE);
+ buf[SB_LABEL_SIZE + 1] = '\0';
+ strcat(buf, "\n");
+ return strlen(buf);
+ }
+
+ return 0;
+}
+
+STORE(__bch_flash_dev)
+{
+ struct bcache_device *d = container_of(kobj, struct bcache_device,
+ kobj);
+ struct uuid_entry *u = &d->c->uuids[d->id];
+
+ sysfs_strtoul(data_csum, d->data_csum);
+
+ if (attr == &sysfs_size) {
+ uint64_t v;
+ strtoi_h_or_return(buf, v);
+
+ u->sectors = v >> 9;
+ bch_uuid_write(d->c);
+ set_capacity(d->disk, u->sectors);
+ }
+
+ if (attr == &sysfs_label) {
+ memcpy(u->label, buf, SB_LABEL_SIZE);
+ bch_uuid_write(d->c);
+ }
+
+ if (attr == &sysfs_unregister) {
+ set_bit(BCACHE_DEV_DETACHING, &d->flags);
+ bcache_device_stop(d);
+ }
+
+ return size;
+}
+STORE_LOCKED(bch_flash_dev)
+
+static struct attribute *bch_flash_dev_files[] = {
+ &sysfs_unregister,
+#if 0
+ &sysfs_data_csum,
+#endif
+ &sysfs_label,
+ &sysfs_size,
+ NULL
+};
+KTYPE(bch_flash_dev);
+
+struct bset_stats_op {
+ struct btree_op op;
+ size_t nodes;
+ struct bset_stats stats;
+};
+
+static int bch_btree_bset_stats(struct btree_op *b_op, struct btree *b)
+{
+ struct bset_stats_op *op = container_of(b_op, struct bset_stats_op, op);
+
+ op->nodes++;
+ bch_btree_keys_stats(&b->keys, &op->stats);
+
+ return MAP_CONTINUE;
+}
+
+static int bch_bset_print_stats(struct cache_set *c, char *buf)
+{
+ struct bset_stats_op op;
+ int ret;
+
+ memset(&op, 0, sizeof(op));
+ bch_btree_op_init(&op.op, -1);
+
+ ret = bch_btree_map_nodes(&op.op, c, &ZERO_KEY, bch_btree_bset_stats);
+ if (ret < 0)
+ return ret;
+
+ return snprintf(buf, PAGE_SIZE,
+ "btree nodes: %zu\n"
+ "written sets: %zu\n"
+ "unwritten sets: %zu\n"
+ "written key bytes: %zu\n"
+ "unwritten key bytes: %zu\n"
+ "floats: %zu\n"
+ "failed: %zu\n",
+ op.nodes,
+ op.stats.sets_written, op.stats.sets_unwritten,
+ op.stats.bytes_written, op.stats.bytes_unwritten,
+ op.stats.floats, op.stats.failed);
+}
+
+static unsigned bch_root_usage(struct cache_set *c)
+{
+ unsigned bytes = 0;
+ struct bkey *k;
+ struct btree *b;
+ struct btree_iter iter;
+
+ goto lock_root;
+
+ do {
+ rw_unlock(false, b);
+lock_root:
+ b = c->root;
+ rw_lock(false, b, b->level);
+ } while (b != c->root);
+
+ for_each_key_filter(&b->keys, k, &iter, bch_ptr_bad)
+ bytes += bkey_bytes(k);
+
+ rw_unlock(false, b);
+
+ return (bytes * 100) / btree_bytes(c);
+}
+
+static size_t bch_cache_size(struct cache_set *c)
+{
+ size_t ret = 0;
+ struct btree *b;
+
+ mutex_lock(&c->bucket_lock);
+ list_for_each_entry(b, &c->btree_cache, list)
+ ret += 1 << (b->keys.page_order + PAGE_SHIFT);
+
+ mutex_unlock(&c->bucket_lock);
+ return ret;
+}
+
+static unsigned bch_cache_max_chain(struct cache_set *c)
+{
+ unsigned ret = 0;
+ struct hlist_head *h;
+
+ mutex_lock(&c->bucket_lock);
+
+ for (h = c->bucket_hash;
+ h < c->bucket_hash + (1 << BUCKET_HASH_BITS);
+ h++) {
+ unsigned i = 0;
+ struct hlist_node *p;
+
+ hlist_for_each(p, h)
+ i++;
+
+ ret = max(ret, i);
+ }
+
+ mutex_unlock(&c->bucket_lock);
+ return ret;
+}
+
+static unsigned bch_btree_used(struct cache_set *c)
+{
+ return div64_u64(c->gc_stats.key_bytes * 100,
+ (c->gc_stats.nodes ?: 1) * btree_bytes(c));
+}
+
+static unsigned bch_average_key_size(struct cache_set *c)
+{
+ return c->gc_stats.nkeys
+ ? div64_u64(c->gc_stats.data, c->gc_stats.nkeys)
+ : 0;
+}
+
+SHOW(__bch_cache_set)
+{
+ struct cache_set *c = container_of(kobj, struct cache_set, kobj);
+
+ sysfs_print(synchronous, CACHE_SYNC(&c->sb));
+ sysfs_print(journal_delay_ms, c->journal_delay_ms);
+ sysfs_hprint(bucket_size, bucket_bytes(c));
+ sysfs_hprint(block_size, block_bytes(c));
+ sysfs_print(tree_depth, c->root->level);
+ sysfs_print(root_usage_percent, bch_root_usage(c));
+
+ sysfs_hprint(btree_cache_size, bch_cache_size(c));
+ sysfs_print(btree_cache_max_chain, bch_cache_max_chain(c));
+ sysfs_print(cache_available_percent, 100 - c->gc_stats.in_use);
+
+ sysfs_print_time_stats(&c->btree_gc_time, btree_gc, sec, ms);
+ sysfs_print_time_stats(&c->btree_split_time, btree_split, sec, us);
+ sysfs_print_time_stats(&c->sort.time, btree_sort, ms, us);
+ sysfs_print_time_stats(&c->btree_read_time, btree_read, ms, us);
+
+ sysfs_print(btree_used_percent, bch_btree_used(c));
+ sysfs_print(btree_nodes, c->gc_stats.nodes);
+ sysfs_hprint(average_key_size, bch_average_key_size(c));
+
+ sysfs_print(cache_read_races,
+ atomic_long_read(&c->cache_read_races));
+
+ sysfs_print(writeback_keys_done,
+ atomic_long_read(&c->writeback_keys_done));
+ sysfs_print(writeback_keys_failed,
+ atomic_long_read(&c->writeback_keys_failed));
+
+ if (attr == &sysfs_errors)
+ return bch_snprint_string_list(buf, PAGE_SIZE, error_actions,
+ c->on_error);
+
+ /* See count_io_errors for why 88 */
+ sysfs_print(io_error_halflife, c->error_decay * 88);
+ sysfs_print(io_error_limit, c->error_limit >> IO_ERROR_SHIFT);
+
+ sysfs_hprint(congested,
+ ((uint64_t) bch_get_congested(c)) << 9);
+ sysfs_print(congested_read_threshold_us,
+ c->congested_read_threshold_us);
+ sysfs_print(congested_write_threshold_us,
+ c->congested_write_threshold_us);
+
+ sysfs_print(active_journal_entries, fifo_used(&c->journal.pin));
+ sysfs_printf(verify, "%i", c->verify);
+ sysfs_printf(key_merging_disabled, "%i", c->key_merging_disabled);
+ sysfs_printf(expensive_debug_checks,
+ "%i", c->expensive_debug_checks);
+ sysfs_printf(gc_always_rewrite, "%i", c->gc_always_rewrite);
+ sysfs_printf(btree_shrinker_disabled, "%i", c->shrinker_disabled);
+ sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled);
+
+ if (attr == &sysfs_bset_tree_stats)
+ return bch_bset_print_stats(c, buf);
+
+ return 0;
+}
+SHOW_LOCKED(bch_cache_set)
+
+STORE(__bch_cache_set)
+{
+ struct cache_set *c = container_of(kobj, struct cache_set, kobj);
+
+ if (attr == &sysfs_unregister)
+ bch_cache_set_unregister(c);
+
+ if (attr == &sysfs_stop)
+ bch_cache_set_stop(c);
+
+ if (attr == &sysfs_synchronous) {
+ bool sync = strtoul_or_return(buf);
+
+ if (sync != CACHE_SYNC(&c->sb)) {
+ SET_CACHE_SYNC(&c->sb, sync);
+ bcache_write_super(c);
+ }
+ }
+
+ if (attr == &sysfs_flash_vol_create) {
+ int r;
+ uint64_t v;
+ strtoi_h_or_return(buf, v);
+
+ r = bch_flash_dev_create(c, v);
+ if (r)
+ return r;
+ }
+
+ if (attr == &sysfs_clear_stats) {
+ atomic_long_set(&c->writeback_keys_done, 0);
+ atomic_long_set(&c->writeback_keys_failed, 0);
+
+ memset(&c->gc_stats, 0, sizeof(struct gc_stat));
+ bch_cache_accounting_clear(&c->accounting);
+ }
+
+ if (attr == &sysfs_trigger_gc)
+ wake_up_gc(c);
+
+ if (attr == &sysfs_prune_cache) {
+ struct shrink_control sc;
+ sc.gfp_mask = GFP_KERNEL;
+ sc.nr_to_scan = strtoul_or_return(buf);
+ c->shrink.scan_objects(&c->shrink, &sc);
+ }
+
+ sysfs_strtoul(congested_read_threshold_us,
+ c->congested_read_threshold_us);
+ sysfs_strtoul(congested_write_threshold_us,
+ c->congested_write_threshold_us);
+
+ if (attr == &sysfs_errors) {
+ ssize_t v = bch_read_string_list(buf, error_actions);
+
+ if (v < 0)
+ return v;
+
+ c->on_error = v;
+ }
+
+ if (attr == &sysfs_io_error_limit)
+ c->error_limit = strtoul_or_return(buf) << IO_ERROR_SHIFT;
+
+ /* See count_io_errors() for why 88 */
+ if (attr == &sysfs_io_error_halflife)
+ c->error_decay = strtoul_or_return(buf) / 88;
+
+ sysfs_strtoul(journal_delay_ms, c->journal_delay_ms);
+ sysfs_strtoul(verify, c->verify);
+ sysfs_strtoul(key_merging_disabled, c->key_merging_disabled);
+ sysfs_strtoul(expensive_debug_checks, c->expensive_debug_checks);
+ sysfs_strtoul(gc_always_rewrite, c->gc_always_rewrite);
+ sysfs_strtoul(btree_shrinker_disabled, c->shrinker_disabled);
+ sysfs_strtoul(copy_gc_enabled, c->copy_gc_enabled);
+
+ return size;
+}
+STORE_LOCKED(bch_cache_set)
+
+SHOW(bch_cache_set_internal)
+{
+ struct cache_set *c = container_of(kobj, struct cache_set, internal);
+ return bch_cache_set_show(&c->kobj, attr, buf);
+}
+
+STORE(bch_cache_set_internal)
+{
+ struct cache_set *c = container_of(kobj, struct cache_set, internal);
+ return bch_cache_set_store(&c->kobj, attr, buf, size);
+}
+
+static void bch_cache_set_internal_release(struct kobject *k)
+{
+}
+
+static struct attribute *bch_cache_set_files[] = {
+ &sysfs_unregister,
+ &sysfs_stop,
+ &sysfs_synchronous,
+ &sysfs_journal_delay_ms,
+ &sysfs_flash_vol_create,
+
+ &sysfs_bucket_size,
+ &sysfs_block_size,
+ &sysfs_tree_depth,
+ &sysfs_root_usage_percent,
+ &sysfs_btree_cache_size,
+ &sysfs_cache_available_percent,
+
+ &sysfs_average_key_size,
+
+ &sysfs_errors,
+ &sysfs_io_error_limit,
+ &sysfs_io_error_halflife,
+ &sysfs_congested,
+ &sysfs_congested_read_threshold_us,
+ &sysfs_congested_write_threshold_us,
+ &sysfs_clear_stats,
+ NULL
+};
+KTYPE(bch_cache_set);
+
+static struct attribute *bch_cache_set_internal_files[] = {
+ &sysfs_active_journal_entries,
+
+ sysfs_time_stats_attribute_list(btree_gc, sec, ms)
+ sysfs_time_stats_attribute_list(btree_split, sec, us)
+ sysfs_time_stats_attribute_list(btree_sort, ms, us)
+ sysfs_time_stats_attribute_list(btree_read, ms, us)
+
+ &sysfs_btree_nodes,
+ &sysfs_btree_used_percent,
+ &sysfs_btree_cache_max_chain,
+
+ &sysfs_bset_tree_stats,
+ &sysfs_cache_read_races,
+ &sysfs_writeback_keys_done,
+ &sysfs_writeback_keys_failed,
+
+ &sysfs_trigger_gc,
+ &sysfs_prune_cache,
+#ifdef CONFIG_BCACHE_DEBUG
+ &sysfs_verify,
+ &sysfs_key_merging_disabled,
+ &sysfs_expensive_debug_checks,
+#endif
+ &sysfs_gc_always_rewrite,
+ &sysfs_btree_shrinker_disabled,
+ &sysfs_copy_gc_enabled,
+ NULL
+};
+KTYPE(bch_cache_set_internal);
+
+SHOW(__bch_cache)
+{
+ struct cache *ca = container_of(kobj, struct cache, kobj);
+
+ sysfs_hprint(bucket_size, bucket_bytes(ca));
+ sysfs_hprint(block_size, block_bytes(ca));
+ sysfs_print(nbuckets, ca->sb.nbuckets);
+ sysfs_print(discard, ca->discard);
+ sysfs_hprint(written, atomic_long_read(&ca->sectors_written) << 9);
+ sysfs_hprint(btree_written,
+ atomic_long_read(&ca->btree_sectors_written) << 9);
+ sysfs_hprint(metadata_written,
+ (atomic_long_read(&ca->meta_sectors_written) +
+ atomic_long_read(&ca->btree_sectors_written)) << 9);
+
+ sysfs_print(io_errors,
+ atomic_read(&ca->io_errors) >> IO_ERROR_SHIFT);
+
+ if (attr == &sysfs_cache_replacement_policy)
+ return bch_snprint_string_list(buf, PAGE_SIZE,
+ cache_replacement_policies,
+ CACHE_REPLACEMENT(&ca->sb));
+
+ if (attr == &sysfs_priority_stats) {
+ int cmp(const void *l, const void *r)
+ { return *((uint16_t *) r) - *((uint16_t *) l); }
+
+ struct bucket *b;
+ size_t n = ca->sb.nbuckets, i;
+ size_t unused = 0, available = 0, dirty = 0, meta = 0;
+ uint64_t sum = 0;
+ /* Compute 31 quantiles */
+ uint16_t q[31], *p, *cached;
+ ssize_t ret;
+
+ cached = p = vmalloc(ca->sb.nbuckets * sizeof(uint16_t));
+ if (!p)
+ return -ENOMEM;
+
+ mutex_lock(&ca->set->bucket_lock);
+ for_each_bucket(b, ca) {
+ if (!GC_SECTORS_USED(b))
+ unused++;
+ if (GC_MARK(b) == GC_MARK_RECLAIMABLE)
+ available++;
+ if (GC_MARK(b) == GC_MARK_DIRTY)
+ dirty++;
+ if (GC_MARK(b) == GC_MARK_METADATA)
+ meta++;
+ }
+
+ for (i = ca->sb.first_bucket; i < n; i++)
+ p[i] = ca->buckets[i].prio;
+ mutex_unlock(&ca->set->bucket_lock);
+
+ sort(p, n, sizeof(uint16_t), cmp, NULL);
+
+ while (n &&
+ !cached[n - 1])
+ --n;
+
+ unused = ca->sb.nbuckets - n;
+
+ while (cached < p + n &&
+ *cached == BTREE_PRIO)
+ cached++, n--;
+
+ for (i = 0; i < n; i++)
+ sum += INITIAL_PRIO - cached[i];
+
+ if (n)
+ do_div(sum, n);
+
+ for (i = 0; i < ARRAY_SIZE(q); i++)
+ q[i] = INITIAL_PRIO - cached[n * (i + 1) /
+ (ARRAY_SIZE(q) + 1)];
+
+ vfree(p);
+
+ ret = scnprintf(buf, PAGE_SIZE,
+ "Unused: %zu%%\n"
+ "Clean: %zu%%\n"
+ "Dirty: %zu%%\n"
+ "Metadata: %zu%%\n"
+ "Average: %llu\n"
+ "Sectors per Q: %zu\n"
+ "Quantiles: [",
+ unused * 100 / (size_t) ca->sb.nbuckets,
+ available * 100 / (size_t) ca->sb.nbuckets,
+ dirty * 100 / (size_t) ca->sb.nbuckets,
+ meta * 100 / (size_t) ca->sb.nbuckets, sum,
+ n * ca->sb.bucket_size / (ARRAY_SIZE(q) + 1));
+
+ for (i = 0; i < ARRAY_SIZE(q); i++)
+ ret += scnprintf(buf + ret, PAGE_SIZE - ret,
+ "%u ", q[i]);
+ ret--;
+
+ ret += scnprintf(buf + ret, PAGE_SIZE - ret, "]\n");
+
+ return ret;
+ }
+
+ return 0;
+}
+SHOW_LOCKED(bch_cache)
+
+STORE(__bch_cache)
+{
+ struct cache *ca = container_of(kobj, struct cache, kobj);
+
+ if (attr == &sysfs_discard) {
+ bool v = strtoul_or_return(buf);
+
+ if (blk_queue_discard(bdev_get_queue(ca->bdev)))
+ ca->discard = v;
+
+ if (v != CACHE_DISCARD(&ca->sb)) {
+ SET_CACHE_DISCARD(&ca->sb, v);
+ bcache_write_super(ca->set);
+ }
+ }
+
+ if (attr == &sysfs_cache_replacement_policy) {
+ ssize_t v = bch_read_string_list(buf, cache_replacement_policies);
+
+ if (v < 0)
+ return v;
+
+ if ((unsigned) v != CACHE_REPLACEMENT(&ca->sb)) {
+ mutex_lock(&ca->set->bucket_lock);
+ SET_CACHE_REPLACEMENT(&ca->sb, v);
+ mutex_unlock(&ca->set->bucket_lock);
+
+ bcache_write_super(ca->set);
+ }
+ }
+
+ if (attr == &sysfs_clear_stats) {
+ atomic_long_set(&ca->sectors_written, 0);
+ atomic_long_set(&ca->btree_sectors_written, 0);
+ atomic_long_set(&ca->meta_sectors_written, 0);
+ atomic_set(&ca->io_count, 0);
+ atomic_set(&ca->io_errors, 0);
+ }
+
+ return size;
+}
+STORE_LOCKED(bch_cache)
+
+static struct attribute *bch_cache_files[] = {
+ &sysfs_bucket_size,
+ &sysfs_block_size,
+ &sysfs_nbuckets,
+ &sysfs_priority_stats,
+ &sysfs_discard,
+ &sysfs_written,
+ &sysfs_btree_written,
+ &sysfs_metadata_written,
+ &sysfs_io_errors,
+ &sysfs_clear_stats,
+ &sysfs_cache_replacement_policy,
+ NULL
+};
+KTYPE(bch_cache);
diff --git a/drivers/md/bcache/sysfs.h b/drivers/md/bcache/sysfs.h
new file mode 100644
index 000000000..0526fe92a
--- /dev/null
+++ b/drivers/md/bcache/sysfs.h
@@ -0,0 +1,110 @@
+#ifndef _BCACHE_SYSFS_H_
+#define _BCACHE_SYSFS_H_
+
+#define KTYPE(type) \
+struct kobj_type type ## _ktype = { \
+ .release = type ## _release, \
+ .sysfs_ops = &((const struct sysfs_ops) { \
+ .show = type ## _show, \
+ .store = type ## _store \
+ }), \
+ .default_attrs = type ## _files \
+}
+
+#define SHOW(fn) \
+static ssize_t fn ## _show(struct kobject *kobj, struct attribute *attr,\
+ char *buf) \
+
+#define STORE(fn) \
+static ssize_t fn ## _store(struct kobject *kobj, struct attribute *attr,\
+ const char *buf, size_t size) \
+
+#define SHOW_LOCKED(fn) \
+SHOW(fn) \
+{ \
+ ssize_t ret; \
+ mutex_lock(&bch_register_lock); \
+ ret = __ ## fn ## _show(kobj, attr, buf); \
+ mutex_unlock(&bch_register_lock); \
+ return ret; \
+}
+
+#define STORE_LOCKED(fn) \
+STORE(fn) \
+{ \
+ ssize_t ret; \
+ mutex_lock(&bch_register_lock); \
+ ret = __ ## fn ## _store(kobj, attr, buf, size); \
+ mutex_unlock(&bch_register_lock); \
+ return ret; \
+}
+
+#define __sysfs_attribute(_name, _mode) \
+ static struct attribute sysfs_##_name = \
+ { .name = #_name, .mode = _mode }
+
+#define write_attribute(n) __sysfs_attribute(n, S_IWUSR)
+#define read_attribute(n) __sysfs_attribute(n, S_IRUGO)
+#define rw_attribute(n) __sysfs_attribute(n, S_IRUGO|S_IWUSR)
+
+#define sysfs_printf(file, fmt, ...) \
+do { \
+ if (attr == &sysfs_ ## file) \
+ return snprintf(buf, PAGE_SIZE, fmt "\n", __VA_ARGS__); \
+} while (0)
+
+#define sysfs_print(file, var) \
+do { \
+ if (attr == &sysfs_ ## file) \
+ return snprint(buf, PAGE_SIZE, var); \
+} while (0)
+
+#define sysfs_hprint(file, val) \
+do { \
+ if (attr == &sysfs_ ## file) { \
+ ssize_t ret = bch_hprint(buf, val); \
+ strcat(buf, "\n"); \
+ return ret + 1; \
+ } \
+} while (0)
+
+#define var_printf(_var, fmt) sysfs_printf(_var, fmt, var(_var))
+#define var_print(_var) sysfs_print(_var, var(_var))
+#define var_hprint(_var) sysfs_hprint(_var, var(_var))
+
+#define sysfs_strtoul(file, var) \
+do { \
+ if (attr == &sysfs_ ## file) \
+ return strtoul_safe(buf, var) ?: (ssize_t) size; \
+} while (0)
+
+#define sysfs_strtoul_clamp(file, var, min, max) \
+do { \
+ if (attr == &sysfs_ ## file) \
+ return strtoul_safe_clamp(buf, var, min, max) \
+ ?: (ssize_t) size; \
+} while (0)
+
+#define strtoul_or_return(cp) \
+({ \
+ unsigned long _v; \
+ int _r = kstrtoul(cp, 10, &_v); \
+ if (_r) \
+ return _r; \
+ _v; \
+})
+
+#define strtoi_h_or_return(cp, v) \
+do { \
+ int _r = strtoi_h(cp, &v); \
+ if (_r) \
+ return _r; \
+} while (0)
+
+#define sysfs_hatoi(file, var) \
+do { \
+ if (attr == &sysfs_ ## file) \
+ return strtoi_h(buf, &var) ?: (ssize_t) size; \
+} while (0)
+
+#endif /* _BCACHE_SYSFS_H_ */
diff --git a/drivers/md/bcache/trace.c b/drivers/md/bcache/trace.c
new file mode 100644
index 000000000..b7820b0d2
--- /dev/null
+++ b/drivers/md/bcache/trace.c
@@ -0,0 +1,52 @@
+#include "bcache.h"
+#include "btree.h"
+
+#include <linux/blktrace_api.h>
+#include <linux/module.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/bcache.h>
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_request_start);
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_request_end);
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_bypass_sequential);
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_bypass_congested);
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_read);
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_write);
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_read_retry);
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_cache_insert);
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_journal_replay_key);
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_journal_write);
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_journal_full);
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_journal_entry_full);
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_cache_cannibalize);
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_read);
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_write);
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_node_alloc);
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_node_alloc_fail);
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_node_free);
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_gc_coalesce);
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_gc_start);
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_gc_end);
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_gc_copy);
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_gc_copy_collision);
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_insert_key);
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_node_split);
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_node_compact);
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_set_root);
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_invalidate);
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_alloc_fail);
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_writeback);
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_writeback_collision);
diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c
new file mode 100644
index 000000000..db3ae4c2b
--- /dev/null
+++ b/drivers/md/bcache/util.c
@@ -0,0 +1,381 @@
+/*
+ * random utiility code, for bcache but in theory not specific to bcache
+ *
+ * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
+ * Copyright 2012 Google, Inc.
+ */
+
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/ctype.h>
+#include <linux/debugfs.h>
+#include <linux/module.h>
+#include <linux/seq_file.h>
+#include <linux/types.h>
+
+#include "util.h"
+
+#define simple_strtoint(c, end, base) simple_strtol(c, end, base)
+#define simple_strtouint(c, end, base) simple_strtoul(c, end, base)
+
+#define STRTO_H(name, type) \
+int bch_ ## name ## _h(const char *cp, type *res) \
+{ \
+ int u = 0; \
+ char *e; \
+ type i = simple_ ## name(cp, &e, 10); \
+ \
+ switch (tolower(*e)) { \
+ default: \
+ return -EINVAL; \
+ case 'y': \
+ case 'z': \
+ u++; \
+ case 'e': \
+ u++; \
+ case 'p': \
+ u++; \
+ case 't': \
+ u++; \
+ case 'g': \
+ u++; \
+ case 'm': \
+ u++; \
+ case 'k': \
+ u++; \
+ if (e++ == cp) \
+ return -EINVAL; \
+ case '\n': \
+ case '\0': \
+ if (*e == '\n') \
+ e++; \
+ } \
+ \
+ if (*e) \
+ return -EINVAL; \
+ \
+ while (u--) { \
+ if ((type) ~0 > 0 && \
+ (type) ~0 / 1024 <= i) \
+ return -EINVAL; \
+ if ((i > 0 && ANYSINT_MAX(type) / 1024 < i) || \
+ (i < 0 && -ANYSINT_MAX(type) / 1024 > i)) \
+ return -EINVAL; \
+ i *= 1024; \
+ } \
+ \
+ *res = i; \
+ return 0; \
+} \
+
+STRTO_H(strtoint, int)
+STRTO_H(strtouint, unsigned int)
+STRTO_H(strtoll, long long)
+STRTO_H(strtoull, unsigned long long)
+
+ssize_t bch_hprint(char *buf, int64_t v)
+{
+ static const char units[] = "?kMGTPEZY";
+ char dec[4] = "";
+ int u, t = 0;
+
+ for (u = 0; v >= 1024 || v <= -1024; u++) {
+ t = v & ~(~0 << 10);
+ v >>= 10;
+ }
+
+ if (!u)
+ return sprintf(buf, "%llu", v);
+
+ if (v < 100 && v > -100)
+ snprintf(dec, sizeof(dec), ".%i", t / 100);
+
+ return sprintf(buf, "%lli%s%c", v, dec, units[u]);
+}
+
+ssize_t bch_snprint_string_list(char *buf, size_t size, const char * const list[],
+ size_t selected)
+{
+ char *out = buf;
+ size_t i;
+
+ for (i = 0; list[i]; i++)
+ out += snprintf(out, buf + size - out,
+ i == selected ? "[%s] " : "%s ", list[i]);
+
+ out[-1] = '\n';
+ return out - buf;
+}
+
+ssize_t bch_read_string_list(const char *buf, const char * const list[])
+{
+ size_t i;
+ char *s, *d = kstrndup(buf, PAGE_SIZE - 1, GFP_KERNEL);
+ if (!d)
+ return -ENOMEM;
+
+ s = strim(d);
+
+ for (i = 0; list[i]; i++)
+ if (!strcmp(list[i], s))
+ break;
+
+ kfree(d);
+
+ if (!list[i])
+ return -EINVAL;
+
+ return i;
+}
+
+bool bch_is_zero(const char *p, size_t n)
+{
+ size_t i;
+
+ for (i = 0; i < n; i++)
+ if (p[i])
+ return false;
+ return true;
+}
+
+int bch_parse_uuid(const char *s, char *uuid)
+{
+ size_t i, j, x;
+ memset(uuid, 0, 16);
+
+ for (i = 0, j = 0;
+ i < strspn(s, "-0123456789:ABCDEFabcdef") && j < 32;
+ i++) {
+ x = s[i] | 32;
+
+ switch (x) {
+ case '0'...'9':
+ x -= '0';
+ break;
+ case 'a'...'f':
+ x -= 'a' - 10;
+ break;
+ default:
+ continue;
+ }
+
+ if (!(j & 1))
+ x <<= 4;
+ uuid[j++ >> 1] |= x;
+ }
+ return i;
+}
+
+void bch_time_stats_update(struct time_stats *stats, uint64_t start_time)
+{
+ uint64_t now, duration, last;
+
+ spin_lock(&stats->lock);
+
+ now = local_clock();
+ duration = time_after64(now, start_time)
+ ? now - start_time : 0;
+ last = time_after64(now, stats->last)
+ ? now - stats->last : 0;
+
+ stats->max_duration = max(stats->max_duration, duration);
+
+ if (stats->last) {
+ ewma_add(stats->average_duration, duration, 8, 8);
+
+ if (stats->average_frequency)
+ ewma_add(stats->average_frequency, last, 8, 8);
+ else
+ stats->average_frequency = last << 8;
+ } else {
+ stats->average_duration = duration << 8;
+ }
+
+ stats->last = now ?: 1;
+
+ spin_unlock(&stats->lock);
+}
+
+/**
+ * bch_next_delay() - increment @d by the amount of work done, and return how
+ * long to delay until the next time to do some work.
+ *
+ * @d - the struct bch_ratelimit to update
+ * @done - the amount of work done, in arbitrary units
+ *
+ * Returns the amount of time to delay by, in jiffies
+ */
+uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t done)
+{
+ uint64_t now = local_clock();
+
+ d->next += div_u64(done * NSEC_PER_SEC, d->rate);
+
+ if (time_before64(now + NSEC_PER_SEC, d->next))
+ d->next = now + NSEC_PER_SEC;
+
+ if (time_after64(now - NSEC_PER_SEC * 2, d->next))
+ d->next = now - NSEC_PER_SEC * 2;
+
+ return time_after64(d->next, now)
+ ? div_u64(d->next - now, NSEC_PER_SEC / HZ)
+ : 0;
+}
+
+void bch_bio_map(struct bio *bio, void *base)
+{
+ size_t size = bio->bi_iter.bi_size;
+ struct bio_vec *bv = bio->bi_io_vec;
+
+ BUG_ON(!bio->bi_iter.bi_size);
+ BUG_ON(bio->bi_vcnt);
+
+ bv->bv_offset = base ? ((unsigned long) base) % PAGE_SIZE : 0;
+ goto start;
+
+ for (; size; bio->bi_vcnt++, bv++) {
+ bv->bv_offset = 0;
+start: bv->bv_len = min_t(size_t, PAGE_SIZE - bv->bv_offset,
+ size);
+ if (base) {
+ bv->bv_page = is_vmalloc_addr(base)
+ ? vmalloc_to_page(base)
+ : virt_to_page(base);
+
+ base += bv->bv_len;
+ }
+
+ size -= bv->bv_len;
+ }
+}
+
+/*
+ * Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group (Any
+ * use permitted, subject to terms of PostgreSQL license; see.)
+
+ * If we have a 64-bit integer type, then a 64-bit CRC looks just like the
+ * usual sort of implementation. (See Ross Williams' excellent introduction
+ * A PAINLESS GUIDE TO CRC ERROR DETECTION ALGORITHMS, available from
+ * ftp://ftp.rocksoft.com/papers/crc_v3.txt or several other net sites.)
+ * If we have no working 64-bit type, then fake it with two 32-bit registers.
+ *
+ * The present implementation is a normal (not "reflected", in Williams'
+ * terms) 64-bit CRC, using initial all-ones register contents and a final
+ * bit inversion. The chosen polynomial is borrowed from the DLT1 spec
+ * (ECMA-182, available from http://www.ecma.ch/ecma1/STAND/ECMA-182.HTM):
+ *
+ * x^64 + x^62 + x^57 + x^55 + x^54 + x^53 + x^52 + x^47 + x^46 + x^45 +
+ * x^40 + x^39 + x^38 + x^37 + x^35 + x^33 + x^32 + x^31 + x^29 + x^27 +
+ * x^24 + x^23 + x^22 + x^21 + x^19 + x^17 + x^13 + x^12 + x^10 + x^9 +
+ * x^7 + x^4 + x + 1
+*/
+
+static const uint64_t crc_table[256] = {
+ 0x0000000000000000ULL, 0x42F0E1EBA9EA3693ULL, 0x85E1C3D753D46D26ULL,
+ 0xC711223CFA3E5BB5ULL, 0x493366450E42ECDFULL, 0x0BC387AEA7A8DA4CULL,
+ 0xCCD2A5925D9681F9ULL, 0x8E224479F47CB76AULL, 0x9266CC8A1C85D9BEULL,
+ 0xD0962D61B56FEF2DULL, 0x17870F5D4F51B498ULL, 0x5577EEB6E6BB820BULL,
+ 0xDB55AACF12C73561ULL, 0x99A54B24BB2D03F2ULL, 0x5EB4691841135847ULL,
+ 0x1C4488F3E8F96ED4ULL, 0x663D78FF90E185EFULL, 0x24CD9914390BB37CULL,
+ 0xE3DCBB28C335E8C9ULL, 0xA12C5AC36ADFDE5AULL, 0x2F0E1EBA9EA36930ULL,
+ 0x6DFEFF5137495FA3ULL, 0xAAEFDD6DCD770416ULL, 0xE81F3C86649D3285ULL,
+ 0xF45BB4758C645C51ULL, 0xB6AB559E258E6AC2ULL, 0x71BA77A2DFB03177ULL,
+ 0x334A9649765A07E4ULL, 0xBD68D2308226B08EULL, 0xFF9833DB2BCC861DULL,
+ 0x388911E7D1F2DDA8ULL, 0x7A79F00C7818EB3BULL, 0xCC7AF1FF21C30BDEULL,
+ 0x8E8A101488293D4DULL, 0x499B3228721766F8ULL, 0x0B6BD3C3DBFD506BULL,
+ 0x854997BA2F81E701ULL, 0xC7B97651866BD192ULL, 0x00A8546D7C558A27ULL,
+ 0x4258B586D5BFBCB4ULL, 0x5E1C3D753D46D260ULL, 0x1CECDC9E94ACE4F3ULL,
+ 0xDBFDFEA26E92BF46ULL, 0x990D1F49C77889D5ULL, 0x172F5B3033043EBFULL,
+ 0x55DFBADB9AEE082CULL, 0x92CE98E760D05399ULL, 0xD03E790CC93A650AULL,
+ 0xAA478900B1228E31ULL, 0xE8B768EB18C8B8A2ULL, 0x2FA64AD7E2F6E317ULL,
+ 0x6D56AB3C4B1CD584ULL, 0xE374EF45BF6062EEULL, 0xA1840EAE168A547DULL,
+ 0x66952C92ECB40FC8ULL, 0x2465CD79455E395BULL, 0x3821458AADA7578FULL,
+ 0x7AD1A461044D611CULL, 0xBDC0865DFE733AA9ULL, 0xFF3067B657990C3AULL,
+ 0x711223CFA3E5BB50ULL, 0x33E2C2240A0F8DC3ULL, 0xF4F3E018F031D676ULL,
+ 0xB60301F359DBE0E5ULL, 0xDA050215EA6C212FULL, 0x98F5E3FE438617BCULL,
+ 0x5FE4C1C2B9B84C09ULL, 0x1D14202910527A9AULL, 0x93366450E42ECDF0ULL,
+ 0xD1C685BB4DC4FB63ULL, 0x16D7A787B7FAA0D6ULL, 0x5427466C1E109645ULL,
+ 0x4863CE9FF6E9F891ULL, 0x0A932F745F03CE02ULL, 0xCD820D48A53D95B7ULL,
+ 0x8F72ECA30CD7A324ULL, 0x0150A8DAF8AB144EULL, 0x43A04931514122DDULL,
+ 0x84B16B0DAB7F7968ULL, 0xC6418AE602954FFBULL, 0xBC387AEA7A8DA4C0ULL,
+ 0xFEC89B01D3679253ULL, 0x39D9B93D2959C9E6ULL, 0x7B2958D680B3FF75ULL,
+ 0xF50B1CAF74CF481FULL, 0xB7FBFD44DD257E8CULL, 0x70EADF78271B2539ULL,
+ 0x321A3E938EF113AAULL, 0x2E5EB66066087D7EULL, 0x6CAE578BCFE24BEDULL,
+ 0xABBF75B735DC1058ULL, 0xE94F945C9C3626CBULL, 0x676DD025684A91A1ULL,
+ 0x259D31CEC1A0A732ULL, 0xE28C13F23B9EFC87ULL, 0xA07CF2199274CA14ULL,
+ 0x167FF3EACBAF2AF1ULL, 0x548F120162451C62ULL, 0x939E303D987B47D7ULL,
+ 0xD16ED1D631917144ULL, 0x5F4C95AFC5EDC62EULL, 0x1DBC74446C07F0BDULL,
+ 0xDAAD56789639AB08ULL, 0x985DB7933FD39D9BULL, 0x84193F60D72AF34FULL,
+ 0xC6E9DE8B7EC0C5DCULL, 0x01F8FCB784FE9E69ULL, 0x43081D5C2D14A8FAULL,
+ 0xCD2A5925D9681F90ULL, 0x8FDAB8CE70822903ULL, 0x48CB9AF28ABC72B6ULL,
+ 0x0A3B7B1923564425ULL, 0x70428B155B4EAF1EULL, 0x32B26AFEF2A4998DULL,
+ 0xF5A348C2089AC238ULL, 0xB753A929A170F4ABULL, 0x3971ED50550C43C1ULL,
+ 0x7B810CBBFCE67552ULL, 0xBC902E8706D82EE7ULL, 0xFE60CF6CAF321874ULL,
+ 0xE224479F47CB76A0ULL, 0xA0D4A674EE214033ULL, 0x67C58448141F1B86ULL,
+ 0x253565A3BDF52D15ULL, 0xAB1721DA49899A7FULL, 0xE9E7C031E063ACECULL,
+ 0x2EF6E20D1A5DF759ULL, 0x6C0603E6B3B7C1CAULL, 0xF6FAE5C07D3274CDULL,
+ 0xB40A042BD4D8425EULL, 0x731B26172EE619EBULL, 0x31EBC7FC870C2F78ULL,
+ 0xBFC9838573709812ULL, 0xFD39626EDA9AAE81ULL, 0x3A28405220A4F534ULL,
+ 0x78D8A1B9894EC3A7ULL, 0x649C294A61B7AD73ULL, 0x266CC8A1C85D9BE0ULL,
+ 0xE17DEA9D3263C055ULL, 0xA38D0B769B89F6C6ULL, 0x2DAF4F0F6FF541ACULL,
+ 0x6F5FAEE4C61F773FULL, 0xA84E8CD83C212C8AULL, 0xEABE6D3395CB1A19ULL,
+ 0x90C79D3FEDD3F122ULL, 0xD2377CD44439C7B1ULL, 0x15265EE8BE079C04ULL,
+ 0x57D6BF0317EDAA97ULL, 0xD9F4FB7AE3911DFDULL, 0x9B041A914A7B2B6EULL,
+ 0x5C1538ADB04570DBULL, 0x1EE5D94619AF4648ULL, 0x02A151B5F156289CULL,
+ 0x4051B05E58BC1E0FULL, 0x87409262A28245BAULL, 0xC5B073890B687329ULL,
+ 0x4B9237F0FF14C443ULL, 0x0962D61B56FEF2D0ULL, 0xCE73F427ACC0A965ULL,
+ 0x8C8315CC052A9FF6ULL, 0x3A80143F5CF17F13ULL, 0x7870F5D4F51B4980ULL,
+ 0xBF61D7E80F251235ULL, 0xFD913603A6CF24A6ULL, 0x73B3727A52B393CCULL,
+ 0x31439391FB59A55FULL, 0xF652B1AD0167FEEAULL, 0xB4A25046A88DC879ULL,
+ 0xA8E6D8B54074A6ADULL, 0xEA16395EE99E903EULL, 0x2D071B6213A0CB8BULL,
+ 0x6FF7FA89BA4AFD18ULL, 0xE1D5BEF04E364A72ULL, 0xA3255F1BE7DC7CE1ULL,
+ 0x64347D271DE22754ULL, 0x26C49CCCB40811C7ULL, 0x5CBD6CC0CC10FAFCULL,
+ 0x1E4D8D2B65FACC6FULL, 0xD95CAF179FC497DAULL, 0x9BAC4EFC362EA149ULL,
+ 0x158E0A85C2521623ULL, 0x577EEB6E6BB820B0ULL, 0x906FC95291867B05ULL,
+ 0xD29F28B9386C4D96ULL, 0xCEDBA04AD0952342ULL, 0x8C2B41A1797F15D1ULL,
+ 0x4B3A639D83414E64ULL, 0x09CA82762AAB78F7ULL, 0x87E8C60FDED7CF9DULL,
+ 0xC51827E4773DF90EULL, 0x020905D88D03A2BBULL, 0x40F9E43324E99428ULL,
+ 0x2CFFE7D5975E55E2ULL, 0x6E0F063E3EB46371ULL, 0xA91E2402C48A38C4ULL,
+ 0xEBEEC5E96D600E57ULL, 0x65CC8190991CB93DULL, 0x273C607B30F68FAEULL,
+ 0xE02D4247CAC8D41BULL, 0xA2DDA3AC6322E288ULL, 0xBE992B5F8BDB8C5CULL,
+ 0xFC69CAB42231BACFULL, 0x3B78E888D80FE17AULL, 0x7988096371E5D7E9ULL,
+ 0xF7AA4D1A85996083ULL, 0xB55AACF12C735610ULL, 0x724B8ECDD64D0DA5ULL,
+ 0x30BB6F267FA73B36ULL, 0x4AC29F2A07BFD00DULL, 0x08327EC1AE55E69EULL,
+ 0xCF235CFD546BBD2BULL, 0x8DD3BD16FD818BB8ULL, 0x03F1F96F09FD3CD2ULL,
+ 0x41011884A0170A41ULL, 0x86103AB85A2951F4ULL, 0xC4E0DB53F3C36767ULL,
+ 0xD8A453A01B3A09B3ULL, 0x9A54B24BB2D03F20ULL, 0x5D45907748EE6495ULL,
+ 0x1FB5719CE1045206ULL, 0x919735E51578E56CULL, 0xD367D40EBC92D3FFULL,
+ 0x1476F63246AC884AULL, 0x568617D9EF46BED9ULL, 0xE085162AB69D5E3CULL,
+ 0xA275F7C11F7768AFULL, 0x6564D5FDE549331AULL, 0x279434164CA30589ULL,
+ 0xA9B6706FB8DFB2E3ULL, 0xEB46918411358470ULL, 0x2C57B3B8EB0BDFC5ULL,
+ 0x6EA7525342E1E956ULL, 0x72E3DAA0AA188782ULL, 0x30133B4B03F2B111ULL,
+ 0xF7021977F9CCEAA4ULL, 0xB5F2F89C5026DC37ULL, 0x3BD0BCE5A45A6B5DULL,
+ 0x79205D0E0DB05DCEULL, 0xBE317F32F78E067BULL, 0xFCC19ED95E6430E8ULL,
+ 0x86B86ED5267CDBD3ULL, 0xC4488F3E8F96ED40ULL, 0x0359AD0275A8B6F5ULL,
+ 0x41A94CE9DC428066ULL, 0xCF8B0890283E370CULL, 0x8D7BE97B81D4019FULL,
+ 0x4A6ACB477BEA5A2AULL, 0x089A2AACD2006CB9ULL, 0x14DEA25F3AF9026DULL,
+ 0x562E43B4931334FEULL, 0x913F6188692D6F4BULL, 0xD3CF8063C0C759D8ULL,
+ 0x5DEDC41A34BBEEB2ULL, 0x1F1D25F19D51D821ULL, 0xD80C07CD676F8394ULL,
+ 0x9AFCE626CE85B507ULL,
+};
+
+uint64_t bch_crc64_update(uint64_t crc, const void *_data, size_t len)
+{
+ const unsigned char *data = _data;
+
+ while (len--) {
+ int i = ((int) (crc >> 56) ^ *data++) & 0xFF;
+ crc = crc_table[i] ^ (crc << 8);
+ }
+
+ return crc;
+}
+
+uint64_t bch_crc64(const void *data, size_t len)
+{
+ uint64_t crc = 0xffffffffffffffffULL;
+
+ crc = bch_crc64_update(crc, data, len);
+
+ return crc ^ 0xffffffffffffffffULL;
+}
diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h
new file mode 100644
index 000000000..98df7572b
--- /dev/null
+++ b/drivers/md/bcache/util.h
@@ -0,0 +1,588 @@
+
+#ifndef _BCACHE_UTIL_H
+#define _BCACHE_UTIL_H
+
+#include <linux/blkdev.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/llist.h>
+#include <linux/ratelimit.h>
+#include <linux/vmalloc.h>
+#include <linux/workqueue.h>
+
+#include "closure.h"
+
+#define PAGE_SECTORS (PAGE_SIZE / 512)
+
+struct closure;
+
+#ifdef CONFIG_BCACHE_DEBUG
+
+#define EBUG_ON(cond) BUG_ON(cond)
+#define atomic_dec_bug(v) BUG_ON(atomic_dec_return(v) < 0)
+#define atomic_inc_bug(v, i) BUG_ON(atomic_inc_return(v) <= i)
+
+#else /* DEBUG */
+
+#define EBUG_ON(cond) do { if (cond); } while (0)
+#define atomic_dec_bug(v) atomic_dec(v)
+#define atomic_inc_bug(v, i) atomic_inc(v)
+
+#endif
+
+#define DECLARE_HEAP(type, name) \
+ struct { \
+ size_t size, used; \
+ type *data; \
+ } name
+
+#define init_heap(heap, _size, gfp) \
+({ \
+ size_t _bytes; \
+ (heap)->used = 0; \
+ (heap)->size = (_size); \
+ _bytes = (heap)->size * sizeof(*(heap)->data); \
+ (heap)->data = NULL; \
+ if (_bytes < KMALLOC_MAX_SIZE) \
+ (heap)->data = kmalloc(_bytes, (gfp)); \
+ if ((!(heap)->data) && ((gfp) & GFP_KERNEL)) \
+ (heap)->data = vmalloc(_bytes); \
+ (heap)->data; \
+})
+
+#define free_heap(heap) \
+do { \
+ if (is_vmalloc_addr((heap)->data)) \
+ vfree((heap)->data); \
+ else \
+ kfree((heap)->data); \
+ (heap)->data = NULL; \
+} while (0)
+
+#define heap_swap(h, i, j) swap((h)->data[i], (h)->data[j])
+
+#define heap_sift(h, i, cmp) \
+do { \
+ size_t _r, _j = i; \
+ \
+ for (; _j * 2 + 1 < (h)->used; _j = _r) { \
+ _r = _j * 2 + 1; \
+ if (_r + 1 < (h)->used && \
+ cmp((h)->data[_r], (h)->data[_r + 1])) \
+ _r++; \
+ \
+ if (cmp((h)->data[_r], (h)->data[_j])) \
+ break; \
+ heap_swap(h, _r, _j); \
+ } \
+} while (0)
+
+#define heap_sift_down(h, i, cmp) \
+do { \
+ while (i) { \
+ size_t p = (i - 1) / 2; \
+ if (cmp((h)->data[i], (h)->data[p])) \
+ break; \
+ heap_swap(h, i, p); \
+ i = p; \
+ } \
+} while (0)
+
+#define heap_add(h, d, cmp) \
+({ \
+ bool _r = !heap_full(h); \
+ if (_r) { \
+ size_t _i = (h)->used++; \
+ (h)->data[_i] = d; \
+ \
+ heap_sift_down(h, _i, cmp); \
+ heap_sift(h, _i, cmp); \
+ } \
+ _r; \
+})
+
+#define heap_pop(h, d, cmp) \
+({ \
+ bool _r = (h)->used; \
+ if (_r) { \
+ (d) = (h)->data[0]; \
+ (h)->used--; \
+ heap_swap(h, 0, (h)->used); \
+ heap_sift(h, 0, cmp); \
+ } \
+ _r; \
+})
+
+#define heap_peek(h) ((h)->used ? (h)->data[0] : NULL)
+
+#define heap_full(h) ((h)->used == (h)->size)
+
+#define DECLARE_FIFO(type, name) \
+ struct { \
+ size_t front, back, size, mask; \
+ type *data; \
+ } name
+
+#define fifo_for_each(c, fifo, iter) \
+ for (iter = (fifo)->front; \
+ c = (fifo)->data[iter], iter != (fifo)->back; \
+ iter = (iter + 1) & (fifo)->mask)
+
+#define __init_fifo(fifo, gfp) \
+({ \
+ size_t _allocated_size, _bytes; \
+ BUG_ON(!(fifo)->size); \
+ \
+ _allocated_size = roundup_pow_of_two((fifo)->size + 1); \
+ _bytes = _allocated_size * sizeof(*(fifo)->data); \
+ \
+ (fifo)->mask = _allocated_size - 1; \
+ (fifo)->front = (fifo)->back = 0; \
+ (fifo)->data = NULL; \
+ \
+ if (_bytes < KMALLOC_MAX_SIZE) \
+ (fifo)->data = kmalloc(_bytes, (gfp)); \
+ if ((!(fifo)->data) && ((gfp) & GFP_KERNEL)) \
+ (fifo)->data = vmalloc(_bytes); \
+ (fifo)->data; \
+})
+
+#define init_fifo_exact(fifo, _size, gfp) \
+({ \
+ (fifo)->size = (_size); \
+ __init_fifo(fifo, gfp); \
+})
+
+#define init_fifo(fifo, _size, gfp) \
+({ \
+ (fifo)->size = (_size); \
+ if ((fifo)->size > 4) \
+ (fifo)->size = roundup_pow_of_two((fifo)->size) - 1; \
+ __init_fifo(fifo, gfp); \
+})
+
+#define free_fifo(fifo) \
+do { \
+ if (is_vmalloc_addr((fifo)->data)) \
+ vfree((fifo)->data); \
+ else \
+ kfree((fifo)->data); \
+ (fifo)->data = NULL; \
+} while (0)
+
+#define fifo_used(fifo) (((fifo)->back - (fifo)->front) & (fifo)->mask)
+#define fifo_free(fifo) ((fifo)->size - fifo_used(fifo))
+
+#define fifo_empty(fifo) (!fifo_used(fifo))
+#define fifo_full(fifo) (!fifo_free(fifo))
+
+#define fifo_front(fifo) ((fifo)->data[(fifo)->front])
+#define fifo_back(fifo) \
+ ((fifo)->data[((fifo)->back - 1) & (fifo)->mask])
+
+#define fifo_idx(fifo, p) (((p) - &fifo_front(fifo)) & (fifo)->mask)
+
+#define fifo_push_back(fifo, i) \
+({ \
+ bool _r = !fifo_full((fifo)); \
+ if (_r) { \
+ (fifo)->data[(fifo)->back++] = (i); \
+ (fifo)->back &= (fifo)->mask; \
+ } \
+ _r; \
+})
+
+#define fifo_pop_front(fifo, i) \
+({ \
+ bool _r = !fifo_empty((fifo)); \
+ if (_r) { \
+ (i) = (fifo)->data[(fifo)->front++]; \
+ (fifo)->front &= (fifo)->mask; \
+ } \
+ _r; \
+})
+
+#define fifo_push_front(fifo, i) \
+({ \
+ bool _r = !fifo_full((fifo)); \
+ if (_r) { \
+ --(fifo)->front; \
+ (fifo)->front &= (fifo)->mask; \
+ (fifo)->data[(fifo)->front] = (i); \
+ } \
+ _r; \
+})
+
+#define fifo_pop_back(fifo, i) \
+({ \
+ bool _r = !fifo_empty((fifo)); \
+ if (_r) { \
+ --(fifo)->back; \
+ (fifo)->back &= (fifo)->mask; \
+ (i) = (fifo)->data[(fifo)->back] \
+ } \
+ _r; \
+})
+
+#define fifo_push(fifo, i) fifo_push_back(fifo, (i))
+#define fifo_pop(fifo, i) fifo_pop_front(fifo, (i))
+
+#define fifo_swap(l, r) \
+do { \
+ swap((l)->front, (r)->front); \
+ swap((l)->back, (r)->back); \
+ swap((l)->size, (r)->size); \
+ swap((l)->mask, (r)->mask); \
+ swap((l)->data, (r)->data); \
+} while (0)
+
+#define fifo_move(dest, src) \
+do { \
+ typeof(*((dest)->data)) _t; \
+ while (!fifo_full(dest) && \
+ fifo_pop(src, _t)) \
+ fifo_push(dest, _t); \
+} while (0)
+
+/*
+ * Simple array based allocator - preallocates a number of elements and you can
+ * never allocate more than that, also has no locking.
+ *
+ * Handy because if you know you only need a fixed number of elements you don't
+ * have to worry about memory allocation failure, and sometimes a mempool isn't
+ * what you want.
+ *
+ * We treat the free elements as entries in a singly linked list, and the
+ * freelist as a stack - allocating and freeing push and pop off the freelist.
+ */
+
+#define DECLARE_ARRAY_ALLOCATOR(type, name, size) \
+ struct { \
+ type *freelist; \
+ type data[size]; \
+ } name
+
+#define array_alloc(array) \
+({ \
+ typeof((array)->freelist) _ret = (array)->freelist; \
+ \
+ if (_ret) \
+ (array)->freelist = *((typeof((array)->freelist) *) _ret);\
+ \
+ _ret; \
+})
+
+#define array_free(array, ptr) \
+do { \
+ typeof((array)->freelist) _ptr = ptr; \
+ \
+ *((typeof((array)->freelist) *) _ptr) = (array)->freelist; \
+ (array)->freelist = _ptr; \
+} while (0)
+
+#define array_allocator_init(array) \
+do { \
+ typeof((array)->freelist) _i; \
+ \
+ BUILD_BUG_ON(sizeof((array)->data[0]) < sizeof(void *)); \
+ (array)->freelist = NULL; \
+ \
+ for (_i = (array)->data; \
+ _i < (array)->data + ARRAY_SIZE((array)->data); \
+ _i++) \
+ array_free(array, _i); \
+} while (0)
+
+#define array_freelist_empty(array) ((array)->freelist == NULL)
+
+#define ANYSINT_MAX(t) \
+ ((((t) 1 << (sizeof(t) * 8 - 2)) - (t) 1) * (t) 2 + (t) 1)
+
+int bch_strtoint_h(const char *, int *);
+int bch_strtouint_h(const char *, unsigned int *);
+int bch_strtoll_h(const char *, long long *);
+int bch_strtoull_h(const char *, unsigned long long *);
+
+static inline int bch_strtol_h(const char *cp, long *res)
+{
+#if BITS_PER_LONG == 32
+ return bch_strtoint_h(cp, (int *) res);
+#else
+ return bch_strtoll_h(cp, (long long *) res);
+#endif
+}
+
+static inline int bch_strtoul_h(const char *cp, long *res)
+{
+#if BITS_PER_LONG == 32
+ return bch_strtouint_h(cp, (unsigned int *) res);
+#else
+ return bch_strtoull_h(cp, (unsigned long long *) res);
+#endif
+}
+
+#define strtoi_h(cp, res) \
+ (__builtin_types_compatible_p(typeof(*res), int) \
+ ? bch_strtoint_h(cp, (void *) res) \
+ : __builtin_types_compatible_p(typeof(*res), long) \
+ ? bch_strtol_h(cp, (void *) res) \
+ : __builtin_types_compatible_p(typeof(*res), long long) \
+ ? bch_strtoll_h(cp, (void *) res) \
+ : __builtin_types_compatible_p(typeof(*res), unsigned int) \
+ ? bch_strtouint_h(cp, (void *) res) \
+ : __builtin_types_compatible_p(typeof(*res), unsigned long) \
+ ? bch_strtoul_h(cp, (void *) res) \
+ : __builtin_types_compatible_p(typeof(*res), unsigned long long)\
+ ? bch_strtoull_h(cp, (void *) res) : -EINVAL)
+
+#define strtoul_safe(cp, var) \
+({ \
+ unsigned long _v; \
+ int _r = kstrtoul(cp, 10, &_v); \
+ if (!_r) \
+ var = _v; \
+ _r; \
+})
+
+#define strtoul_safe_clamp(cp, var, min, max) \
+({ \
+ unsigned long _v; \
+ int _r = kstrtoul(cp, 10, &_v); \
+ if (!_r) \
+ var = clamp_t(typeof(var), _v, min, max); \
+ _r; \
+})
+
+#define snprint(buf, size, var) \
+ snprintf(buf, size, \
+ __builtin_types_compatible_p(typeof(var), int) \
+ ? "%i\n" : \
+ __builtin_types_compatible_p(typeof(var), unsigned) \
+ ? "%u\n" : \
+ __builtin_types_compatible_p(typeof(var), long) \
+ ? "%li\n" : \
+ __builtin_types_compatible_p(typeof(var), unsigned long)\
+ ? "%lu\n" : \
+ __builtin_types_compatible_p(typeof(var), int64_t) \
+ ? "%lli\n" : \
+ __builtin_types_compatible_p(typeof(var), uint64_t) \
+ ? "%llu\n" : \
+ __builtin_types_compatible_p(typeof(var), const char *) \
+ ? "%s\n" : "%i\n", var)
+
+ssize_t bch_hprint(char *buf, int64_t v);
+
+bool bch_is_zero(const char *p, size_t n);
+int bch_parse_uuid(const char *s, char *uuid);
+
+ssize_t bch_snprint_string_list(char *buf, size_t size, const char * const list[],
+ size_t selected);
+
+ssize_t bch_read_string_list(const char *buf, const char * const list[]);
+
+struct time_stats {
+ spinlock_t lock;
+ /*
+ * all fields are in nanoseconds, averages are ewmas stored left shifted
+ * by 8
+ */
+ uint64_t max_duration;
+ uint64_t average_duration;
+ uint64_t average_frequency;
+ uint64_t last;
+};
+
+void bch_time_stats_update(struct time_stats *stats, uint64_t time);
+
+static inline unsigned local_clock_us(void)
+{
+ return local_clock() >> 10;
+}
+
+#define NSEC_PER_ns 1L
+#define NSEC_PER_us NSEC_PER_USEC
+#define NSEC_PER_ms NSEC_PER_MSEC
+#define NSEC_PER_sec NSEC_PER_SEC
+
+#define __print_time_stat(stats, name, stat, units) \
+ sysfs_print(name ## _ ## stat ## _ ## units, \
+ div_u64((stats)->stat >> 8, NSEC_PER_ ## units))
+
+#define sysfs_print_time_stats(stats, name, \
+ frequency_units, \
+ duration_units) \
+do { \
+ __print_time_stat(stats, name, \
+ average_frequency, frequency_units); \
+ __print_time_stat(stats, name, \
+ average_duration, duration_units); \
+ sysfs_print(name ## _ ##max_duration ## _ ## duration_units, \
+ div_u64((stats)->max_duration, NSEC_PER_ ## duration_units));\
+ \
+ sysfs_print(name ## _last_ ## frequency_units, (stats)->last \
+ ? div_s64(local_clock() - (stats)->last, \
+ NSEC_PER_ ## frequency_units) \
+ : -1LL); \
+} while (0)
+
+#define sysfs_time_stats_attribute(name, \
+ frequency_units, \
+ duration_units) \
+read_attribute(name ## _average_frequency_ ## frequency_units); \
+read_attribute(name ## _average_duration_ ## duration_units); \
+read_attribute(name ## _max_duration_ ## duration_units); \
+read_attribute(name ## _last_ ## frequency_units)
+
+#define sysfs_time_stats_attribute_list(name, \
+ frequency_units, \
+ duration_units) \
+&sysfs_ ## name ## _average_frequency_ ## frequency_units, \
+&sysfs_ ## name ## _average_duration_ ## duration_units, \
+&sysfs_ ## name ## _max_duration_ ## duration_units, \
+&sysfs_ ## name ## _last_ ## frequency_units,
+
+#define ewma_add(ewma, val, weight, factor) \
+({ \
+ (ewma) *= (weight) - 1; \
+ (ewma) += (val) << factor; \
+ (ewma) /= (weight); \
+ (ewma) >> factor; \
+})
+
+struct bch_ratelimit {
+ /* Next time we want to do some work, in nanoseconds */
+ uint64_t next;
+
+ /*
+ * Rate at which we want to do work, in units per nanosecond
+ * The units here correspond to the units passed to bch_next_delay()
+ */
+ unsigned rate;
+};
+
+static inline void bch_ratelimit_reset(struct bch_ratelimit *d)
+{
+ d->next = local_clock();
+}
+
+uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t done);
+
+#define __DIV_SAFE(n, d, zero) \
+({ \
+ typeof(n) _n = (n); \
+ typeof(d) _d = (d); \
+ _d ? _n / _d : zero; \
+})
+
+#define DIV_SAFE(n, d) __DIV_SAFE(n, d, 0)
+
+#define container_of_or_null(ptr, type, member) \
+({ \
+ typeof(ptr) _ptr = ptr; \
+ _ptr ? container_of(_ptr, type, member) : NULL; \
+})
+
+#define RB_INSERT(root, new, member, cmp) \
+({ \
+ __label__ dup; \
+ struct rb_node **n = &(root)->rb_node, *parent = NULL; \
+ typeof(new) this; \
+ int res, ret = -1; \
+ \
+ while (*n) { \
+ parent = *n; \
+ this = container_of(*n, typeof(*(new)), member); \
+ res = cmp(new, this); \
+ if (!res) \
+ goto dup; \
+ n = res < 0 \
+ ? &(*n)->rb_left \
+ : &(*n)->rb_right; \
+ } \
+ \
+ rb_link_node(&(new)->member, parent, n); \
+ rb_insert_color(&(new)->member, root); \
+ ret = 0; \
+dup: \
+ ret; \
+})
+
+#define RB_SEARCH(root, search, member, cmp) \
+({ \
+ struct rb_node *n = (root)->rb_node; \
+ typeof(&(search)) this, ret = NULL; \
+ int res; \
+ \
+ while (n) { \
+ this = container_of(n, typeof(search), member); \
+ res = cmp(&(search), this); \
+ if (!res) { \
+ ret = this; \
+ break; \
+ } \
+ n = res < 0 \
+ ? n->rb_left \
+ : n->rb_right; \
+ } \
+ ret; \
+})
+
+#define RB_GREATER(root, search, member, cmp) \
+({ \
+ struct rb_node *n = (root)->rb_node; \
+ typeof(&(search)) this, ret = NULL; \
+ int res; \
+ \
+ while (n) { \
+ this = container_of(n, typeof(search), member); \
+ res = cmp(&(search), this); \
+ if (res < 0) { \
+ ret = this; \
+ n = n->rb_left; \
+ } else \
+ n = n->rb_right; \
+ } \
+ ret; \
+})
+
+#define RB_FIRST(root, type, member) \
+ container_of_or_null(rb_first(root), type, member)
+
+#define RB_LAST(root, type, member) \
+ container_of_or_null(rb_last(root), type, member)
+
+#define RB_NEXT(ptr, member) \
+ container_of_or_null(rb_next(&(ptr)->member), typeof(*ptr), member)
+
+#define RB_PREV(ptr, member) \
+ container_of_or_null(rb_prev(&(ptr)->member), typeof(*ptr), member)
+
+/* Does linear interpolation between powers of two */
+static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits)
+{
+ unsigned fract = x & ~(~0 << fract_bits);
+
+ x >>= fract_bits;
+ x = 1 << x;
+ x += (x * fract) >> fract_bits;
+
+ return x;
+}
+
+void bch_bio_map(struct bio *bio, void *base);
+
+static inline sector_t bdev_sectors(struct block_device *bdev)
+{
+ return bdev->bd_inode->i_size >> 9;
+}
+
+#define closure_bio_submit(bio, cl, dev) \
+do { \
+ closure_get(cl); \
+ bch_generic_make_request(bio, &(dev)->bio_split_hook); \
+} while (0)
+
+uint64_t bch_crc64_update(uint64_t, const void *, size_t);
+uint64_t bch_crc64(const void *, size_t);
+
+#endif /* _BCACHE_UTIL_H */
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
new file mode 100644
index 000000000..f1986bcd1
--- /dev/null
+++ b/drivers/md/bcache/writeback.c
@@ -0,0 +1,513 @@
+/*
+ * background writeback - scan btree for dirty data and write it to the backing
+ * device
+ *
+ * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
+ * Copyright 2012 Google, Inc.
+ */
+
+#include "bcache.h"
+#include "btree.h"
+#include "debug.h"
+#include "writeback.h"
+
+#include <linux/delay.h>
+#include <linux/freezer.h>
+#include <linux/kthread.h>
+#include <trace/events/bcache.h>
+
+/* Rate limiting */
+
+static void __update_writeback_rate(struct cached_dev *dc)
+{
+ struct cache_set *c = dc->disk.c;
+ uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size;
+ uint64_t cache_dirty_target =
+ div_u64(cache_sectors * dc->writeback_percent, 100);
+
+ int64_t target = div64_u64(cache_dirty_target * bdev_sectors(dc->bdev),
+ c->cached_dev_sectors);
+
+ /* PD controller */
+
+ int64_t dirty = bcache_dev_sectors_dirty(&dc->disk);
+ int64_t derivative = dirty - dc->disk.sectors_dirty_last;
+ int64_t proportional = dirty - target;
+ int64_t change;
+
+ dc->disk.sectors_dirty_last = dirty;
+
+ /* Scale to sectors per second */
+
+ proportional *= dc->writeback_rate_update_seconds;
+ proportional = div_s64(proportional, dc->writeback_rate_p_term_inverse);
+
+ derivative = div_s64(derivative, dc->writeback_rate_update_seconds);
+
+ derivative = ewma_add(dc->disk.sectors_dirty_derivative, derivative,
+ (dc->writeback_rate_d_term /
+ dc->writeback_rate_update_seconds) ?: 1, 0);
+
+ derivative *= dc->writeback_rate_d_term;
+ derivative = div_s64(derivative, dc->writeback_rate_p_term_inverse);
+
+ change = proportional + derivative;
+
+ /* Don't increase writeback rate if the device isn't keeping up */
+ if (change > 0 &&
+ time_after64(local_clock(),
+ dc->writeback_rate.next + NSEC_PER_MSEC))
+ change = 0;
+
+ dc->writeback_rate.rate =
+ clamp_t(int64_t, (int64_t) dc->writeback_rate.rate + change,
+ 1, NSEC_PER_MSEC);
+
+ dc->writeback_rate_proportional = proportional;
+ dc->writeback_rate_derivative = derivative;
+ dc->writeback_rate_change = change;
+ dc->writeback_rate_target = target;
+}
+
+static void update_writeback_rate(struct work_struct *work)
+{
+ struct cached_dev *dc = container_of(to_delayed_work(work),
+ struct cached_dev,
+ writeback_rate_update);
+
+ down_read(&dc->writeback_lock);
+
+ if (atomic_read(&dc->has_dirty) &&
+ dc->writeback_percent)
+ __update_writeback_rate(dc);
+
+ up_read(&dc->writeback_lock);
+
+ schedule_delayed_work(&dc->writeback_rate_update,
+ dc->writeback_rate_update_seconds * HZ);
+}
+
+static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors)
+{
+ if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) ||
+ !dc->writeback_percent)
+ return 0;
+
+ return bch_next_delay(&dc->writeback_rate, sectors);
+}
+
+struct dirty_io {
+ struct closure cl;
+ struct cached_dev *dc;
+ struct bio bio;
+};
+
+static void dirty_init(struct keybuf_key *w)
+{
+ struct dirty_io *io = w->private;
+ struct bio *bio = &io->bio;
+
+ bio_init(bio);
+ if (!io->dc->writeback_percent)
+ bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
+
+ bio->bi_iter.bi_size = KEY_SIZE(&w->key) << 9;
+ bio->bi_max_vecs = DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS);
+ bio->bi_private = w;
+ bio->bi_io_vec = bio->bi_inline_vecs;
+ bch_bio_map(bio, NULL);
+}
+
+static void dirty_io_destructor(struct closure *cl)
+{
+ struct dirty_io *io = container_of(cl, struct dirty_io, cl);
+ kfree(io);
+}
+
+static void write_dirty_finish(struct closure *cl)
+{
+ struct dirty_io *io = container_of(cl, struct dirty_io, cl);
+ struct keybuf_key *w = io->bio.bi_private;
+ struct cached_dev *dc = io->dc;
+ struct bio_vec *bv;
+ int i;
+
+ bio_for_each_segment_all(bv, &io->bio, i)
+ __free_page(bv->bv_page);
+
+ /* This is kind of a dumb way of signalling errors. */
+ if (KEY_DIRTY(&w->key)) {
+ int ret;
+ unsigned i;
+ struct keylist keys;
+
+ bch_keylist_init(&keys);
+
+ bkey_copy(keys.top, &w->key);
+ SET_KEY_DIRTY(keys.top, false);
+ bch_keylist_push(&keys);
+
+ for (i = 0; i < KEY_PTRS(&w->key); i++)
+ atomic_inc(&PTR_BUCKET(dc->disk.c, &w->key, i)->pin);
+
+ ret = bch_btree_insert(dc->disk.c, &keys, NULL, &w->key);
+
+ if (ret)
+ trace_bcache_writeback_collision(&w->key);
+
+ atomic_long_inc(ret
+ ? &dc->disk.c->writeback_keys_failed
+ : &dc->disk.c->writeback_keys_done);
+ }
+
+ bch_keybuf_del(&dc->writeback_keys, w);
+ up(&dc->in_flight);
+
+ closure_return_with_destructor(cl, dirty_io_destructor);
+}
+
+static void dirty_endio(struct bio *bio, int error)
+{
+ struct keybuf_key *w = bio->bi_private;
+ struct dirty_io *io = w->private;
+
+ if (error)
+ SET_KEY_DIRTY(&w->key, false);
+
+ closure_put(&io->cl);
+}
+
+static void write_dirty(struct closure *cl)
+{
+ struct dirty_io *io = container_of(cl, struct dirty_io, cl);
+ struct keybuf_key *w = io->bio.bi_private;
+
+ dirty_init(w);
+ io->bio.bi_rw = WRITE;
+ io->bio.bi_iter.bi_sector = KEY_START(&w->key);
+ io->bio.bi_bdev = io->dc->bdev;
+ io->bio.bi_end_io = dirty_endio;
+
+ closure_bio_submit(&io->bio, cl, &io->dc->disk);
+
+ continue_at(cl, write_dirty_finish, system_wq);
+}
+
+static void read_dirty_endio(struct bio *bio, int error)
+{
+ struct keybuf_key *w = bio->bi_private;
+ struct dirty_io *io = w->private;
+
+ bch_count_io_errors(PTR_CACHE(io->dc->disk.c, &w->key, 0),
+ error, "reading dirty data from cache");
+
+ dirty_endio(bio, error);
+}
+
+static void read_dirty_submit(struct closure *cl)
+{
+ struct dirty_io *io = container_of(cl, struct dirty_io, cl);
+
+ closure_bio_submit(&io->bio, cl, &io->dc->disk);
+
+ continue_at(cl, write_dirty, system_wq);
+}
+
+static void read_dirty(struct cached_dev *dc)
+{
+ unsigned delay = 0;
+ struct keybuf_key *w;
+ struct dirty_io *io;
+ struct closure cl;
+
+ closure_init_stack(&cl);
+
+ /*
+ * XXX: if we error, background writeback just spins. Should use some
+ * mempools.
+ */
+
+ while (!kthread_should_stop()) {
+ try_to_freeze();
+
+ w = bch_keybuf_next(&dc->writeback_keys);
+ if (!w)
+ break;
+
+ BUG_ON(ptr_stale(dc->disk.c, &w->key, 0));
+
+ if (KEY_START(&w->key) != dc->last_read ||
+ jiffies_to_msecs(delay) > 50)
+ while (!kthread_should_stop() && delay)
+ delay = schedule_timeout_interruptible(delay);
+
+ dc->last_read = KEY_OFFSET(&w->key);
+
+ io = kzalloc(sizeof(struct dirty_io) + sizeof(struct bio_vec)
+ * DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS),
+ GFP_KERNEL);
+ if (!io)
+ goto err;
+
+ w->private = io;
+ io->dc = dc;
+
+ dirty_init(w);
+ io->bio.bi_iter.bi_sector = PTR_OFFSET(&w->key, 0);
+ io->bio.bi_bdev = PTR_CACHE(dc->disk.c,
+ &w->key, 0)->bdev;
+ io->bio.bi_rw = READ;
+ io->bio.bi_end_io = read_dirty_endio;
+
+ if (bio_alloc_pages(&io->bio, GFP_KERNEL))
+ goto err_free;
+
+ trace_bcache_writeback(&w->key);
+
+ down(&dc->in_flight);
+ closure_call(&io->cl, read_dirty_submit, NULL, &cl);
+
+ delay = writeback_delay(dc, KEY_SIZE(&w->key));
+ }
+
+ if (0) {
+err_free:
+ kfree(w->private);
+err:
+ bch_keybuf_del(&dc->writeback_keys, w);
+ }
+
+ /*
+ * Wait for outstanding writeback IOs to finish (and keybuf slots to be
+ * freed) before refilling again
+ */
+ closure_sync(&cl);
+}
+
+/* Scan for dirty data */
+
+void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned inode,
+ uint64_t offset, int nr_sectors)
+{
+ struct bcache_device *d = c->devices[inode];
+ unsigned stripe_offset, stripe, sectors_dirty;
+
+ if (!d)
+ return;
+
+ stripe = offset_to_stripe(d, offset);
+ stripe_offset = offset & (d->stripe_size - 1);
+
+ while (nr_sectors) {
+ int s = min_t(unsigned, abs(nr_sectors),
+ d->stripe_size - stripe_offset);
+
+ if (nr_sectors < 0)
+ s = -s;
+
+ if (stripe >= d->nr_stripes)
+ return;
+
+ sectors_dirty = atomic_add_return(s,
+ d->stripe_sectors_dirty + stripe);
+ if (sectors_dirty == d->stripe_size)
+ set_bit(stripe, d->full_dirty_stripes);
+ else
+ clear_bit(stripe, d->full_dirty_stripes);
+
+ nr_sectors -= s;
+ stripe_offset = 0;
+ stripe++;
+ }
+}
+
+static bool dirty_pred(struct keybuf *buf, struct bkey *k)
+{
+ return KEY_DIRTY(k);
+}
+
+static void refill_full_stripes(struct cached_dev *dc)
+{
+ struct keybuf *buf = &dc->writeback_keys;
+ unsigned start_stripe, stripe, next_stripe;
+ bool wrapped = false;
+
+ stripe = offset_to_stripe(&dc->disk, KEY_OFFSET(&buf->last_scanned));
+
+ if (stripe >= dc->disk.nr_stripes)
+ stripe = 0;
+
+ start_stripe = stripe;
+
+ while (1) {
+ stripe = find_next_bit(dc->disk.full_dirty_stripes,
+ dc->disk.nr_stripes, stripe);
+
+ if (stripe == dc->disk.nr_stripes)
+ goto next;
+
+ next_stripe = find_next_zero_bit(dc->disk.full_dirty_stripes,
+ dc->disk.nr_stripes, stripe);
+
+ buf->last_scanned = KEY(dc->disk.id,
+ stripe * dc->disk.stripe_size, 0);
+
+ bch_refill_keybuf(dc->disk.c, buf,
+ &KEY(dc->disk.id,
+ next_stripe * dc->disk.stripe_size, 0),
+ dirty_pred);
+
+ if (array_freelist_empty(&buf->freelist))
+ return;
+
+ stripe = next_stripe;
+next:
+ if (wrapped && stripe > start_stripe)
+ return;
+
+ if (stripe == dc->disk.nr_stripes) {
+ stripe = 0;
+ wrapped = true;
+ }
+ }
+}
+
+static bool refill_dirty(struct cached_dev *dc)
+{
+ struct keybuf *buf = &dc->writeback_keys;
+ struct bkey end = KEY(dc->disk.id, MAX_KEY_OFFSET, 0);
+ bool searched_from_start = false;
+
+ if (dc->partial_stripes_expensive) {
+ refill_full_stripes(dc);
+ if (array_freelist_empty(&buf->freelist))
+ return false;
+ }
+
+ if (bkey_cmp(&buf->last_scanned, &end) >= 0) {
+ buf->last_scanned = KEY(dc->disk.id, 0, 0);
+ searched_from_start = true;
+ }
+
+ bch_refill_keybuf(dc->disk.c, buf, &end, dirty_pred);
+
+ return bkey_cmp(&buf->last_scanned, &end) >= 0 && searched_from_start;
+}
+
+static int bch_writeback_thread(void *arg)
+{
+ struct cached_dev *dc = arg;
+ bool searched_full_index;
+
+ while (!kthread_should_stop()) {
+ down_write(&dc->writeback_lock);
+ if (!atomic_read(&dc->has_dirty) ||
+ (!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) &&
+ !dc->writeback_running)) {
+ up_write(&dc->writeback_lock);
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ if (kthread_should_stop())
+ return 0;
+
+ try_to_freeze();
+ schedule();
+ continue;
+ }
+
+ searched_full_index = refill_dirty(dc);
+
+ if (searched_full_index &&
+ RB_EMPTY_ROOT(&dc->writeback_keys.keys)) {
+ atomic_set(&dc->has_dirty, 0);
+ cached_dev_put(dc);
+ SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN);
+ bch_write_bdev_super(dc, NULL);
+ }
+
+ up_write(&dc->writeback_lock);
+
+ bch_ratelimit_reset(&dc->writeback_rate);
+ read_dirty(dc);
+
+ if (searched_full_index) {
+ unsigned delay = dc->writeback_delay * HZ;
+
+ while (delay &&
+ !kthread_should_stop() &&
+ !test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags))
+ delay = schedule_timeout_interruptible(delay);
+ }
+ }
+
+ return 0;
+}
+
+/* Init */
+
+struct sectors_dirty_init {
+ struct btree_op op;
+ unsigned inode;
+};
+
+static int sectors_dirty_init_fn(struct btree_op *_op, struct btree *b,
+ struct bkey *k)
+{
+ struct sectors_dirty_init *op = container_of(_op,
+ struct sectors_dirty_init, op);
+ if (KEY_INODE(k) > op->inode)
+ return MAP_DONE;
+
+ if (KEY_DIRTY(k))
+ bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k),
+ KEY_START(k), KEY_SIZE(k));
+
+ return MAP_CONTINUE;
+}
+
+void bch_sectors_dirty_init(struct cached_dev *dc)
+{
+ struct sectors_dirty_init op;
+
+ bch_btree_op_init(&op.op, -1);
+ op.inode = dc->disk.id;
+
+ bch_btree_map_keys(&op.op, dc->disk.c, &KEY(op.inode, 0, 0),
+ sectors_dirty_init_fn, 0);
+
+ dc->disk.sectors_dirty_last = bcache_dev_sectors_dirty(&dc->disk);
+}
+
+void bch_cached_dev_writeback_init(struct cached_dev *dc)
+{
+ sema_init(&dc->in_flight, 64);
+ init_rwsem(&dc->writeback_lock);
+ bch_keybuf_init(&dc->writeback_keys);
+
+ dc->writeback_metadata = true;
+ dc->writeback_running = true;
+ dc->writeback_percent = 10;
+ dc->writeback_delay = 30;
+ dc->writeback_rate.rate = 1024;
+
+ dc->writeback_rate_update_seconds = 5;
+ dc->writeback_rate_d_term = 30;
+ dc->writeback_rate_p_term_inverse = 6000;
+
+ INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate);
+}
+
+int bch_cached_dev_writeback_start(struct cached_dev *dc)
+{
+ dc->writeback_thread = kthread_create(bch_writeback_thread, dc,
+ "bcache_writeback");
+ if (IS_ERR(dc->writeback_thread))
+ return PTR_ERR(dc->writeback_thread);
+
+ schedule_delayed_work(&dc->writeback_rate_update,
+ dc->writeback_rate_update_seconds * HZ);
+
+ bch_writeback_queue(dc);
+
+ return 0;
+}
diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h
new file mode 100644
index 000000000..0a9dab187
--- /dev/null
+++ b/drivers/md/bcache/writeback.h
@@ -0,0 +1,91 @@
+#ifndef _BCACHE_WRITEBACK_H
+#define _BCACHE_WRITEBACK_H
+
+#define CUTOFF_WRITEBACK 40
+#define CUTOFF_WRITEBACK_SYNC 70
+
+static inline uint64_t bcache_dev_sectors_dirty(struct bcache_device *d)
+{
+ uint64_t i, ret = 0;
+
+ for (i = 0; i < d->nr_stripes; i++)
+ ret += atomic_read(d->stripe_sectors_dirty + i);
+
+ return ret;
+}
+
+static inline unsigned offset_to_stripe(struct bcache_device *d,
+ uint64_t offset)
+{
+ do_div(offset, d->stripe_size);
+ return offset;
+}
+
+static inline bool bcache_dev_stripe_dirty(struct cached_dev *dc,
+ uint64_t offset,
+ unsigned nr_sectors)
+{
+ unsigned stripe = offset_to_stripe(&dc->disk, offset);
+
+ while (1) {
+ if (atomic_read(dc->disk.stripe_sectors_dirty + stripe))
+ return true;
+
+ if (nr_sectors <= dc->disk.stripe_size)
+ return false;
+
+ nr_sectors -= dc->disk.stripe_size;
+ stripe++;
+ }
+}
+
+static inline bool should_writeback(struct cached_dev *dc, struct bio *bio,
+ unsigned cache_mode, bool would_skip)
+{
+ unsigned in_use = dc->disk.c->gc_stats.in_use;
+
+ if (cache_mode != CACHE_MODE_WRITEBACK ||
+ test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) ||
+ in_use > CUTOFF_WRITEBACK_SYNC)
+ return false;
+
+ if (dc->partial_stripes_expensive &&
+ bcache_dev_stripe_dirty(dc, bio->bi_iter.bi_sector,
+ bio_sectors(bio)))
+ return true;
+
+ if (would_skip)
+ return false;
+
+ return bio->bi_rw & REQ_SYNC ||
+ in_use <= CUTOFF_WRITEBACK;
+}
+
+static inline void bch_writeback_queue(struct cached_dev *dc)
+{
+ wake_up_process(dc->writeback_thread);
+}
+
+static inline void bch_writeback_add(struct cached_dev *dc)
+{
+ if (!atomic_read(&dc->has_dirty) &&
+ !atomic_xchg(&dc->has_dirty, 1)) {
+ atomic_inc(&dc->count);
+
+ if (BDEV_STATE(&dc->sb) != BDEV_STATE_DIRTY) {
+ SET_BDEV_STATE(&dc->sb, BDEV_STATE_DIRTY);
+ /* XXX: should do this synchronously */
+ bch_write_bdev_super(dc, NULL);
+ }
+
+ bch_writeback_queue(dc);
+ }
+}
+
+void bcache_dev_sectors_dirty_add(struct cache_set *, unsigned, uint64_t, int);
+
+void bch_sectors_dirty_init(struct cached_dev *dc);
+void bch_cached_dev_writeback_init(struct cached_dev *);
+int bch_cached_dev_writeback_start(struct cached_dev *);
+
+#endif
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
new file mode 100644
index 000000000..a2ed982b0
--- /dev/null
+++ b/drivers/md/bitmap.c
@@ -0,0 +1,2442 @@
+/*
+ * bitmap.c two-level bitmap (C) Peter T. Breuer (ptb@ot.uc3m.es) 2003
+ *
+ * bitmap_create - sets up the bitmap structure
+ * bitmap_destroy - destroys the bitmap structure
+ *
+ * additions, Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.:
+ * - added disk storage for bitmap
+ * - changes to allow various bitmap chunk sizes
+ */
+
+/*
+ * Still to do:
+ *
+ * flush after percent set rather than just time based. (maybe both).
+ */
+
+#include <linux/blkdev.h>
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/timer.h>
+#include <linux/sched.h>
+#include <linux/list.h>
+#include <linux/file.h>
+#include <linux/mount.h>
+#include <linux/buffer_head.h>
+#include <linux/seq_file.h>
+#include "md.h"
+#include "bitmap.h"
+
+static inline char *bmname(struct bitmap *bitmap)
+{
+ return bitmap->mddev ? mdname(bitmap->mddev) : "mdX";
+}
+
+/*
+ * check a page and, if necessary, allocate it (or hijack it if the alloc fails)
+ *
+ * 1) check to see if this page is allocated, if it's not then try to alloc
+ * 2) if the alloc fails, set the page's hijacked flag so we'll use the
+ * page pointer directly as a counter
+ *
+ * if we find our page, we increment the page's refcount so that it stays
+ * allocated while we're using it
+ */
+static int bitmap_checkpage(struct bitmap_counts *bitmap,
+ unsigned long page, int create)
+__releases(bitmap->lock)
+__acquires(bitmap->lock)
+{
+ unsigned char *mappage;
+
+ if (page >= bitmap->pages) {
+ /* This can happen if bitmap_start_sync goes beyond
+ * End-of-device while looking for a whole page.
+ * It is harmless.
+ */
+ return -EINVAL;
+ }
+
+ if (bitmap->bp[page].hijacked) /* it's hijacked, don't try to alloc */
+ return 0;
+
+ if (bitmap->bp[page].map) /* page is already allocated, just return */
+ return 0;
+
+ if (!create)
+ return -ENOENT;
+
+ /* this page has not been allocated yet */
+
+ spin_unlock_irq(&bitmap->lock);
+ /* It is possible that this is being called inside a
+ * prepare_to_wait/finish_wait loop from raid5c:make_request().
+ * In general it is not permitted to sleep in that context as it
+ * can cause the loop to spin freely.
+ * That doesn't apply here as we can only reach this point
+ * once with any loop.
+ * When this function completes, either bp[page].map or
+ * bp[page].hijacked. In either case, this function will
+ * abort before getting to this point again. So there is
+ * no risk of a free-spin, and so it is safe to assert
+ * that sleeping here is allowed.
+ */
+ sched_annotate_sleep();
+ mappage = kzalloc(PAGE_SIZE, GFP_NOIO);
+ spin_lock_irq(&bitmap->lock);
+
+ if (mappage == NULL) {
+ pr_debug("md/bitmap: map page allocation failed, hijacking\n");
+ /* failed - set the hijacked flag so that we can use the
+ * pointer as a counter */
+ if (!bitmap->bp[page].map)
+ bitmap->bp[page].hijacked = 1;
+ } else if (bitmap->bp[page].map ||
+ bitmap->bp[page].hijacked) {
+ /* somebody beat us to getting the page */
+ kfree(mappage);
+ return 0;
+ } else {
+
+ /* no page was in place and we have one, so install it */
+
+ bitmap->bp[page].map = mappage;
+ bitmap->missing_pages--;
+ }
+ return 0;
+}
+
+/* if page is completely empty, put it back on the free list, or dealloc it */
+/* if page was hijacked, unmark the flag so it might get alloced next time */
+/* Note: lock should be held when calling this */
+static void bitmap_checkfree(struct bitmap_counts *bitmap, unsigned long page)
+{
+ char *ptr;
+
+ if (bitmap->bp[page].count) /* page is still busy */
+ return;
+
+ /* page is no longer in use, it can be released */
+
+ if (bitmap->bp[page].hijacked) { /* page was hijacked, undo this now */
+ bitmap->bp[page].hijacked = 0;
+ bitmap->bp[page].map = NULL;
+ } else {
+ /* normal case, free the page */
+ ptr = bitmap->bp[page].map;
+ bitmap->bp[page].map = NULL;
+ bitmap->missing_pages++;
+ kfree(ptr);
+ }
+}
+
+/*
+ * bitmap file handling - read and write the bitmap file and its superblock
+ */
+
+/*
+ * basic page I/O operations
+ */
+
+/* IO operations when bitmap is stored near all superblocks */
+static int read_sb_page(struct mddev *mddev, loff_t offset,
+ struct page *page,
+ unsigned long index, int size)
+{
+ /* choose a good rdev and read the page from there */
+
+ struct md_rdev *rdev;
+ sector_t target;
+
+ rdev_for_each(rdev, mddev) {
+ if (! test_bit(In_sync, &rdev->flags)
+ || test_bit(Faulty, &rdev->flags))
+ continue;
+
+ target = offset + index * (PAGE_SIZE/512);
+
+ if (sync_page_io(rdev, target,
+ roundup(size, bdev_logical_block_size(rdev->bdev)),
+ page, READ, true)) {
+ page->index = index;
+ return 0;
+ }
+ }
+ return -EIO;
+}
+
+static struct md_rdev *next_active_rdev(struct md_rdev *rdev, struct mddev *mddev)
+{
+ /* Iterate the disks of an mddev, using rcu to protect access to the
+ * linked list, and raising the refcount of devices we return to ensure
+ * they don't disappear while in use.
+ * As devices are only added or removed when raid_disk is < 0 and
+ * nr_pending is 0 and In_sync is clear, the entries we return will
+ * still be in the same position on the list when we re-enter
+ * list_for_each_entry_continue_rcu.
+ *
+ * Note that if entered with 'rdev == NULL' to start at the
+ * beginning, we temporarily assign 'rdev' to an address which
+ * isn't really an rdev, but which can be used by
+ * list_for_each_entry_continue_rcu() to find the first entry.
+ */
+ rcu_read_lock();
+ if (rdev == NULL)
+ /* start at the beginning */
+ rdev = list_entry(&mddev->disks, struct md_rdev, same_set);
+ else {
+ /* release the previous rdev and start from there. */
+ rdev_dec_pending(rdev, mddev);
+ }
+ list_for_each_entry_continue_rcu(rdev, &mddev->disks, same_set) {
+ if (rdev->raid_disk >= 0 &&
+ !test_bit(Faulty, &rdev->flags)) {
+ /* this is a usable devices */
+ atomic_inc(&rdev->nr_pending);
+ rcu_read_unlock();
+ return rdev;
+ }
+ }
+ rcu_read_unlock();
+ return NULL;
+}
+
+static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
+{
+ struct md_rdev *rdev = NULL;
+ struct block_device *bdev;
+ struct mddev *mddev = bitmap->mddev;
+ struct bitmap_storage *store = &bitmap->storage;
+ int node_offset = 0;
+
+ if (mddev_is_clustered(bitmap->mddev))
+ node_offset = bitmap->cluster_slot * store->file_pages;
+
+ while ((rdev = next_active_rdev(rdev, mddev)) != NULL) {
+ int size = PAGE_SIZE;
+ loff_t offset = mddev->bitmap_info.offset;
+
+ bdev = (rdev->meta_bdev) ? rdev->meta_bdev : rdev->bdev;
+
+ if (page->index == store->file_pages-1) {
+ int last_page_size = store->bytes & (PAGE_SIZE-1);
+ if (last_page_size == 0)
+ last_page_size = PAGE_SIZE;
+ size = roundup(last_page_size,
+ bdev_logical_block_size(bdev));
+ }
+ /* Just make sure we aren't corrupting data or
+ * metadata
+ */
+ if (mddev->external) {
+ /* Bitmap could be anywhere. */
+ if (rdev->sb_start + offset + (page->index
+ * (PAGE_SIZE/512))
+ > rdev->data_offset
+ &&
+ rdev->sb_start + offset
+ < (rdev->data_offset + mddev->dev_sectors
+ + (PAGE_SIZE/512)))
+ goto bad_alignment;
+ } else if (offset < 0) {
+ /* DATA BITMAP METADATA */
+ if (offset
+ + (long)(page->index * (PAGE_SIZE/512))
+ + size/512 > 0)
+ /* bitmap runs in to metadata */
+ goto bad_alignment;
+ if (rdev->data_offset + mddev->dev_sectors
+ > rdev->sb_start + offset)
+ /* data runs in to bitmap */
+ goto bad_alignment;
+ } else if (rdev->sb_start < rdev->data_offset) {
+ /* METADATA BITMAP DATA */
+ if (rdev->sb_start
+ + offset
+ + page->index*(PAGE_SIZE/512) + size/512
+ > rdev->data_offset)
+ /* bitmap runs in to data */
+ goto bad_alignment;
+ } else {
+ /* DATA METADATA BITMAP - no problems */
+ }
+ md_super_write(mddev, rdev,
+ rdev->sb_start + offset
+ + page->index * (PAGE_SIZE/512),
+ size,
+ page);
+ }
+
+ if (wait)
+ md_super_wait(mddev);
+ return 0;
+
+ bad_alignment:
+ return -EINVAL;
+}
+
+static void bitmap_file_kick(struct bitmap *bitmap);
+/*
+ * write out a page to a file
+ */
+static void write_page(struct bitmap *bitmap, struct page *page, int wait)
+{
+ struct buffer_head *bh;
+
+ if (bitmap->storage.file == NULL) {
+ switch (write_sb_page(bitmap, page, wait)) {
+ case -EINVAL:
+ set_bit(BITMAP_WRITE_ERROR, &bitmap->flags);
+ }
+ } else {
+
+ bh = page_buffers(page);
+
+ while (bh && bh->b_blocknr) {
+ atomic_inc(&bitmap->pending_writes);
+ set_buffer_locked(bh);
+ set_buffer_mapped(bh);
+ submit_bh(WRITE | REQ_SYNC, bh);
+ bh = bh->b_this_page;
+ }
+
+ if (wait)
+ wait_event(bitmap->write_wait,
+ atomic_read(&bitmap->pending_writes)==0);
+ }
+ if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags))
+ bitmap_file_kick(bitmap);
+}
+
+static void end_bitmap_write(struct buffer_head *bh, int uptodate)
+{
+ struct bitmap *bitmap = bh->b_private;
+
+ if (!uptodate)
+ set_bit(BITMAP_WRITE_ERROR, &bitmap->flags);
+ if (atomic_dec_and_test(&bitmap->pending_writes))
+ wake_up(&bitmap->write_wait);
+}
+
+/* copied from buffer.c */
+static void
+__clear_page_buffers(struct page *page)
+{
+ ClearPagePrivate(page);
+ set_page_private(page, 0);
+ page_cache_release(page);
+}
+static void free_buffers(struct page *page)
+{
+ struct buffer_head *bh;
+
+ if (!PagePrivate(page))
+ return;
+
+ bh = page_buffers(page);
+ while (bh) {
+ struct buffer_head *next = bh->b_this_page;
+ free_buffer_head(bh);
+ bh = next;
+ }
+ __clear_page_buffers(page);
+ put_page(page);
+}
+
+/* read a page from a file.
+ * We both read the page, and attach buffers to the page to record the
+ * address of each block (using bmap). These addresses will be used
+ * to write the block later, completely bypassing the filesystem.
+ * This usage is similar to how swap files are handled, and allows us
+ * to write to a file with no concerns of memory allocation failing.
+ */
+static int read_page(struct file *file, unsigned long index,
+ struct bitmap *bitmap,
+ unsigned long count,
+ struct page *page)
+{
+ int ret = 0;
+ struct inode *inode = file_inode(file);
+ struct buffer_head *bh;
+ sector_t block;
+
+ pr_debug("read bitmap file (%dB @ %llu)\n", (int)PAGE_SIZE,
+ (unsigned long long)index << PAGE_SHIFT);
+
+ bh = alloc_page_buffers(page, 1<<inode->i_blkbits, 0);
+ if (!bh) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ attach_page_buffers(page, bh);
+ block = index << (PAGE_SHIFT - inode->i_blkbits);
+ while (bh) {
+ if (count == 0)
+ bh->b_blocknr = 0;
+ else {
+ bh->b_blocknr = bmap(inode, block);
+ if (bh->b_blocknr == 0) {
+ /* Cannot use this file! */
+ ret = -EINVAL;
+ goto out;
+ }
+ bh->b_bdev = inode->i_sb->s_bdev;
+ if (count < (1<<inode->i_blkbits))
+ count = 0;
+ else
+ count -= (1<<inode->i_blkbits);
+
+ bh->b_end_io = end_bitmap_write;
+ bh->b_private = bitmap;
+ atomic_inc(&bitmap->pending_writes);
+ set_buffer_locked(bh);
+ set_buffer_mapped(bh);
+ submit_bh(READ, bh);
+ }
+ block++;
+ bh = bh->b_this_page;
+ }
+ page->index = index;
+
+ wait_event(bitmap->write_wait,
+ atomic_read(&bitmap->pending_writes)==0);
+ if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags))
+ ret = -EIO;
+out:
+ if (ret)
+ printk(KERN_ALERT "md: bitmap read error: (%dB @ %llu): %d\n",
+ (int)PAGE_SIZE,
+ (unsigned long long)index << PAGE_SHIFT,
+ ret);
+ return ret;
+}
+
+/*
+ * bitmap file superblock operations
+ */
+
+/* update the event counter and sync the superblock to disk */
+void bitmap_update_sb(struct bitmap *bitmap)
+{
+ bitmap_super_t *sb;
+
+ if (!bitmap || !bitmap->mddev) /* no bitmap for this array */
+ return;
+ if (bitmap->mddev->bitmap_info.external)
+ return;
+ if (!bitmap->storage.sb_page) /* no superblock */
+ return;
+ sb = kmap_atomic(bitmap->storage.sb_page);
+ sb->events = cpu_to_le64(bitmap->mddev->events);
+ if (bitmap->mddev->events < bitmap->events_cleared)
+ /* rocking back to read-only */
+ bitmap->events_cleared = bitmap->mddev->events;
+ sb->events_cleared = cpu_to_le64(bitmap->events_cleared);
+ sb->state = cpu_to_le32(bitmap->flags);
+ /* Just in case these have been changed via sysfs: */
+ sb->daemon_sleep = cpu_to_le32(bitmap->mddev->bitmap_info.daemon_sleep/HZ);
+ sb->write_behind = cpu_to_le32(bitmap->mddev->bitmap_info.max_write_behind);
+ /* This might have been changed by a reshape */
+ sb->sync_size = cpu_to_le64(bitmap->mddev->resync_max_sectors);
+ sb->chunksize = cpu_to_le32(bitmap->mddev->bitmap_info.chunksize);
+ sb->nodes = cpu_to_le32(bitmap->mddev->bitmap_info.nodes);
+ sb->sectors_reserved = cpu_to_le32(bitmap->mddev->
+ bitmap_info.space);
+ kunmap_atomic(sb);
+ write_page(bitmap, bitmap->storage.sb_page, 1);
+}
+
+/* print out the bitmap file superblock */
+void bitmap_print_sb(struct bitmap *bitmap)
+{
+ bitmap_super_t *sb;
+
+ if (!bitmap || !bitmap->storage.sb_page)
+ return;
+ sb = kmap_atomic(bitmap->storage.sb_page);
+ printk(KERN_DEBUG "%s: bitmap file superblock:\n", bmname(bitmap));
+ printk(KERN_DEBUG " magic: %08x\n", le32_to_cpu(sb->magic));
+ printk(KERN_DEBUG " version: %d\n", le32_to_cpu(sb->version));
+ printk(KERN_DEBUG " uuid: %08x.%08x.%08x.%08x\n",
+ *(__u32 *)(sb->uuid+0),
+ *(__u32 *)(sb->uuid+4),
+ *(__u32 *)(sb->uuid+8),
+ *(__u32 *)(sb->uuid+12));
+ printk(KERN_DEBUG " events: %llu\n",
+ (unsigned long long) le64_to_cpu(sb->events));
+ printk(KERN_DEBUG "events cleared: %llu\n",
+ (unsigned long long) le64_to_cpu(sb->events_cleared));
+ printk(KERN_DEBUG " state: %08x\n", le32_to_cpu(sb->state));
+ printk(KERN_DEBUG " chunksize: %d B\n", le32_to_cpu(sb->chunksize));
+ printk(KERN_DEBUG " daemon sleep: %ds\n", le32_to_cpu(sb->daemon_sleep));
+ printk(KERN_DEBUG " sync size: %llu KB\n",
+ (unsigned long long)le64_to_cpu(sb->sync_size)/2);
+ printk(KERN_DEBUG "max write behind: %d\n", le32_to_cpu(sb->write_behind));
+ kunmap_atomic(sb);
+}
+
+/*
+ * bitmap_new_disk_sb
+ * @bitmap
+ *
+ * This function is somewhat the reverse of bitmap_read_sb. bitmap_read_sb
+ * reads and verifies the on-disk bitmap superblock and populates bitmap_info.
+ * This function verifies 'bitmap_info' and populates the on-disk bitmap
+ * structure, which is to be written to disk.
+ *
+ * Returns: 0 on success, -Exxx on error
+ */
+static int bitmap_new_disk_sb(struct bitmap *bitmap)
+{
+ bitmap_super_t *sb;
+ unsigned long chunksize, daemon_sleep, write_behind;
+
+ bitmap->storage.sb_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+ if (bitmap->storage.sb_page == NULL)
+ return -ENOMEM;
+ bitmap->storage.sb_page->index = 0;
+
+ sb = kmap_atomic(bitmap->storage.sb_page);
+
+ sb->magic = cpu_to_le32(BITMAP_MAGIC);
+ sb->version = cpu_to_le32(BITMAP_MAJOR_HI);
+
+ chunksize = bitmap->mddev->bitmap_info.chunksize;
+ BUG_ON(!chunksize);
+ if (!is_power_of_2(chunksize)) {
+ kunmap_atomic(sb);
+ printk(KERN_ERR "bitmap chunksize not a power of 2\n");
+ return -EINVAL;
+ }
+ sb->chunksize = cpu_to_le32(chunksize);
+
+ daemon_sleep = bitmap->mddev->bitmap_info.daemon_sleep;
+ if (!daemon_sleep ||
+ (daemon_sleep < 1) || (daemon_sleep > MAX_SCHEDULE_TIMEOUT)) {
+ printk(KERN_INFO "Choosing daemon_sleep default (5 sec)\n");
+ daemon_sleep = 5 * HZ;
+ }
+ sb->daemon_sleep = cpu_to_le32(daemon_sleep);
+ bitmap->mddev->bitmap_info.daemon_sleep = daemon_sleep;
+
+ /*
+ * FIXME: write_behind for RAID1. If not specified, what
+ * is a good choice? We choose COUNTER_MAX / 2 arbitrarily.
+ */
+ write_behind = bitmap->mddev->bitmap_info.max_write_behind;
+ if (write_behind > COUNTER_MAX)
+ write_behind = COUNTER_MAX / 2;
+ sb->write_behind = cpu_to_le32(write_behind);
+ bitmap->mddev->bitmap_info.max_write_behind = write_behind;
+
+ /* keep the array size field of the bitmap superblock up to date */
+ sb->sync_size = cpu_to_le64(bitmap->mddev->resync_max_sectors);
+
+ memcpy(sb->uuid, bitmap->mddev->uuid, 16);
+
+ set_bit(BITMAP_STALE, &bitmap->flags);
+ sb->state = cpu_to_le32(bitmap->flags);
+ bitmap->events_cleared = bitmap->mddev->events;
+ sb->events_cleared = cpu_to_le64(bitmap->mddev->events);
+ bitmap->mddev->bitmap_info.nodes = 0;
+
+ kunmap_atomic(sb);
+
+ return 0;
+}
+
+/* read the superblock from the bitmap file and initialize some bitmap fields */
+static int bitmap_read_sb(struct bitmap *bitmap)
+{
+ char *reason = NULL;
+ bitmap_super_t *sb;
+ unsigned long chunksize, daemon_sleep, write_behind;
+ unsigned long long events;
+ int nodes = 0;
+ unsigned long sectors_reserved = 0;
+ int err = -EINVAL;
+ struct page *sb_page;
+
+ if (!bitmap->storage.file && !bitmap->mddev->bitmap_info.offset) {
+ chunksize = 128 * 1024 * 1024;
+ daemon_sleep = 5 * HZ;
+ write_behind = 0;
+ set_bit(BITMAP_STALE, &bitmap->flags);
+ err = 0;
+ goto out_no_sb;
+ }
+ /* page 0 is the superblock, read it... */
+ sb_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+ if (!sb_page)
+ return -ENOMEM;
+ bitmap->storage.sb_page = sb_page;
+
+re_read:
+ /* If cluster_slot is set, the cluster is setup */
+ if (bitmap->cluster_slot >= 0) {
+ sector_t bm_blocks = bitmap->mddev->resync_max_sectors;
+
+ sector_div(bm_blocks,
+ bitmap->mddev->bitmap_info.chunksize >> 9);
+ /* bits to bytes */
+ bm_blocks = ((bm_blocks+7) >> 3) + sizeof(bitmap_super_t);
+ /* to 4k blocks */
+ bm_blocks = DIV_ROUND_UP_SECTOR_T(bm_blocks, 4096);
+ bitmap->mddev->bitmap_info.offset += bitmap->cluster_slot * (bm_blocks << 3);
+ pr_info("%s:%d bm slot: %d offset: %llu\n", __func__, __LINE__,
+ bitmap->cluster_slot, (unsigned long long)bitmap->mddev->bitmap_info.offset);
+ }
+
+ if (bitmap->storage.file) {
+ loff_t isize = i_size_read(bitmap->storage.file->f_mapping->host);
+ int bytes = isize > PAGE_SIZE ? PAGE_SIZE : isize;
+
+ err = read_page(bitmap->storage.file, 0,
+ bitmap, bytes, sb_page);
+ } else {
+ err = read_sb_page(bitmap->mddev,
+ bitmap->mddev->bitmap_info.offset,
+ sb_page,
+ 0, sizeof(bitmap_super_t));
+ }
+ if (err)
+ return err;
+
+ err = -EINVAL;
+ sb = kmap_atomic(sb_page);
+
+ chunksize = le32_to_cpu(sb->chunksize);
+ daemon_sleep = le32_to_cpu(sb->daemon_sleep) * HZ;
+ write_behind = le32_to_cpu(sb->write_behind);
+ sectors_reserved = le32_to_cpu(sb->sectors_reserved);
+ /* XXX: This is a hack to ensure that we don't use clustering
+ * in case:
+ * - dm-raid is in use and
+ * - the nodes written in bitmap_sb is erroneous.
+ */
+ if (!bitmap->mddev->sync_super) {
+ nodes = le32_to_cpu(sb->nodes);
+ strlcpy(bitmap->mddev->bitmap_info.cluster_name,
+ sb->cluster_name, 64);
+ }
+
+ /* verify that the bitmap-specific fields are valid */
+ if (sb->magic != cpu_to_le32(BITMAP_MAGIC))
+ reason = "bad magic";
+ else if (le32_to_cpu(sb->version) < BITMAP_MAJOR_LO ||
+ le32_to_cpu(sb->version) > BITMAP_MAJOR_HI)
+ reason = "unrecognized superblock version";
+ else if (chunksize < 512)
+ reason = "bitmap chunksize too small";
+ else if (!is_power_of_2(chunksize))
+ reason = "bitmap chunksize not a power of 2";
+ else if (daemon_sleep < 1 || daemon_sleep > MAX_SCHEDULE_TIMEOUT)
+ reason = "daemon sleep period out of range";
+ else if (write_behind > COUNTER_MAX)
+ reason = "write-behind limit out of range (0 - 16383)";
+ if (reason) {
+ printk(KERN_INFO "%s: invalid bitmap file superblock: %s\n",
+ bmname(bitmap), reason);
+ goto out;
+ }
+
+ /* keep the array size field of the bitmap superblock up to date */
+ sb->sync_size = cpu_to_le64(bitmap->mddev->resync_max_sectors);
+
+ if (bitmap->mddev->persistent) {
+ /*
+ * We have a persistent array superblock, so compare the
+ * bitmap's UUID and event counter to the mddev's
+ */
+ if (memcmp(sb->uuid, bitmap->mddev->uuid, 16)) {
+ printk(KERN_INFO
+ "%s: bitmap superblock UUID mismatch\n",
+ bmname(bitmap));
+ goto out;
+ }
+ events = le64_to_cpu(sb->events);
+ if (err == 0 && !nodes && (events < bitmap->mddev->events)) {
+ printk(KERN_INFO
+ "%s: bitmap file is out of date (%llu < %llu) "
+ "-- forcing full recovery\n",
+ bmname(bitmap), events,
+ (unsigned long long) bitmap->mddev->events);
+ set_bit(BITMAP_STALE, &bitmap->flags);
+ }
+ }
+
+ /* assign fields using values from superblock */
+ bitmap->flags |= le32_to_cpu(sb->state);
+ if (le32_to_cpu(sb->version) == BITMAP_MAJOR_HOSTENDIAN)
+ set_bit(BITMAP_HOSTENDIAN, &bitmap->flags);
+ bitmap->events_cleared = le64_to_cpu(sb->events_cleared);
+ strlcpy(bitmap->mddev->bitmap_info.cluster_name, sb->cluster_name, 64);
+ err = 0;
+
+out:
+ kunmap_atomic(sb);
+ /* Assiging chunksize is required for "re_read" */
+ bitmap->mddev->bitmap_info.chunksize = chunksize;
+ if (nodes && (bitmap->cluster_slot < 0)) {
+ err = md_setup_cluster(bitmap->mddev, nodes);
+ if (err) {
+ pr_err("%s: Could not setup cluster service (%d)\n",
+ bmname(bitmap), err);
+ goto out_no_sb;
+ }
+ bitmap->cluster_slot = md_cluster_ops->slot_number(bitmap->mddev);
+ goto re_read;
+ }
+
+
+out_no_sb:
+ if (test_bit(BITMAP_STALE, &bitmap->flags))
+ bitmap->events_cleared = bitmap->mddev->events;
+ bitmap->mddev->bitmap_info.chunksize = chunksize;
+ bitmap->mddev->bitmap_info.daemon_sleep = daemon_sleep;
+ bitmap->mddev->bitmap_info.max_write_behind = write_behind;
+ bitmap->mddev->bitmap_info.nodes = nodes;
+ if (bitmap->mddev->bitmap_info.space == 0 ||
+ bitmap->mddev->bitmap_info.space > sectors_reserved)
+ bitmap->mddev->bitmap_info.space = sectors_reserved;
+ if (err) {
+ bitmap_print_sb(bitmap);
+ if (bitmap->cluster_slot < 0)
+ md_cluster_stop(bitmap->mddev);
+ }
+ return err;
+}
+
+/*
+ * general bitmap file operations
+ */
+
+/*
+ * on-disk bitmap:
+ *
+ * Use one bit per "chunk" (block set). We do the disk I/O on the bitmap
+ * file a page at a time. There's a superblock at the start of the file.
+ */
+/* calculate the index of the page that contains this bit */
+static inline unsigned long file_page_index(struct bitmap_storage *store,
+ unsigned long chunk)
+{
+ if (store->sb_page)
+ chunk += sizeof(bitmap_super_t) << 3;
+ return chunk >> PAGE_BIT_SHIFT;
+}
+
+/* calculate the (bit) offset of this bit within a page */
+static inline unsigned long file_page_offset(struct bitmap_storage *store,
+ unsigned long chunk)
+{
+ if (store->sb_page)
+ chunk += sizeof(bitmap_super_t) << 3;
+ return chunk & (PAGE_BITS - 1);
+}
+
+/*
+ * return a pointer to the page in the filemap that contains the given bit
+ *
+ */
+static inline struct page *filemap_get_page(struct bitmap_storage *store,
+ unsigned long chunk)
+{
+ if (file_page_index(store, chunk) >= store->file_pages)
+ return NULL;
+ return store->filemap[file_page_index(store, chunk)];
+}
+
+static int bitmap_storage_alloc(struct bitmap_storage *store,
+ unsigned long chunks, int with_super,
+ int slot_number)
+{
+ int pnum, offset = 0;
+ unsigned long num_pages;
+ unsigned long bytes;
+
+ bytes = DIV_ROUND_UP(chunks, 8);
+ if (with_super)
+ bytes += sizeof(bitmap_super_t);
+
+ num_pages = DIV_ROUND_UP(bytes, PAGE_SIZE);
+ offset = slot_number * (num_pages - 1);
+
+ store->filemap = kmalloc(sizeof(struct page *)
+ * num_pages, GFP_KERNEL);
+ if (!store->filemap)
+ return -ENOMEM;
+
+ if (with_super && !store->sb_page) {
+ store->sb_page = alloc_page(GFP_KERNEL|__GFP_ZERO);
+ if (store->sb_page == NULL)
+ return -ENOMEM;
+ }
+
+ pnum = 0;
+ if (store->sb_page) {
+ store->filemap[0] = store->sb_page;
+ pnum = 1;
+ store->sb_page->index = offset;
+ }
+
+ for ( ; pnum < num_pages; pnum++) {
+ store->filemap[pnum] = alloc_page(GFP_KERNEL|__GFP_ZERO);
+ if (!store->filemap[pnum]) {
+ store->file_pages = pnum;
+ return -ENOMEM;
+ }
+ store->filemap[pnum]->index = pnum + offset;
+ }
+ store->file_pages = pnum;
+
+ /* We need 4 bits per page, rounded up to a multiple
+ * of sizeof(unsigned long) */
+ store->filemap_attr = kzalloc(
+ roundup(DIV_ROUND_UP(num_pages*4, 8), sizeof(unsigned long)),
+ GFP_KERNEL);
+ if (!store->filemap_attr)
+ return -ENOMEM;
+
+ store->bytes = bytes;
+
+ return 0;
+}
+
+static void bitmap_file_unmap(struct bitmap_storage *store)
+{
+ struct page **map, *sb_page;
+ int pages;
+ struct file *file;
+
+ file = store->file;
+ map = store->filemap;
+ pages = store->file_pages;
+ sb_page = store->sb_page;
+
+ while (pages--)
+ if (map[pages] != sb_page) /* 0 is sb_page, release it below */
+ free_buffers(map[pages]);
+ kfree(map);
+ kfree(store->filemap_attr);
+
+ if (sb_page)
+ free_buffers(sb_page);
+
+ if (file) {
+ struct inode *inode = file_inode(file);
+ invalidate_mapping_pages(inode->i_mapping, 0, -1);
+ fput(file);
+ }
+}
+
+/*
+ * bitmap_file_kick - if an error occurs while manipulating the bitmap file
+ * then it is no longer reliable, so we stop using it and we mark the file
+ * as failed in the superblock
+ */
+static void bitmap_file_kick(struct bitmap *bitmap)
+{
+ char *path, *ptr = NULL;
+
+ if (!test_and_set_bit(BITMAP_STALE, &bitmap->flags)) {
+ bitmap_update_sb(bitmap);
+
+ if (bitmap->storage.file) {
+ path = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ if (path)
+ ptr = d_path(&bitmap->storage.file->f_path,
+ path, PAGE_SIZE);
+
+ printk(KERN_ALERT
+ "%s: kicking failed bitmap file %s from array!\n",
+ bmname(bitmap), IS_ERR(ptr) ? "" : ptr);
+
+ kfree(path);
+ } else
+ printk(KERN_ALERT
+ "%s: disabling internal bitmap due to errors\n",
+ bmname(bitmap));
+ }
+}
+
+enum bitmap_page_attr {
+ BITMAP_PAGE_DIRTY = 0, /* there are set bits that need to be synced */
+ BITMAP_PAGE_PENDING = 1, /* there are bits that are being cleaned.
+ * i.e. counter is 1 or 2. */
+ BITMAP_PAGE_NEEDWRITE = 2, /* there are cleared bits that need to be synced */
+};
+
+static inline void set_page_attr(struct bitmap *bitmap, int pnum,
+ enum bitmap_page_attr attr)
+{
+ set_bit((pnum<<2) + attr, bitmap->storage.filemap_attr);
+}
+
+static inline void clear_page_attr(struct bitmap *bitmap, int pnum,
+ enum bitmap_page_attr attr)
+{
+ clear_bit((pnum<<2) + attr, bitmap->storage.filemap_attr);
+}
+
+static inline int test_page_attr(struct bitmap *bitmap, int pnum,
+ enum bitmap_page_attr attr)
+{
+ return test_bit((pnum<<2) + attr, bitmap->storage.filemap_attr);
+}
+
+static inline int test_and_clear_page_attr(struct bitmap *bitmap, int pnum,
+ enum bitmap_page_attr attr)
+{
+ return test_and_clear_bit((pnum<<2) + attr,
+ bitmap->storage.filemap_attr);
+}
+/*
+ * bitmap_file_set_bit -- called before performing a write to the md device
+ * to set (and eventually sync) a particular bit in the bitmap file
+ *
+ * we set the bit immediately, then we record the page number so that
+ * when an unplug occurs, we can flush the dirty pages out to disk
+ */
+static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block)
+{
+ unsigned long bit;
+ struct page *page;
+ void *kaddr;
+ unsigned long chunk = block >> bitmap->counts.chunkshift;
+
+ page = filemap_get_page(&bitmap->storage, chunk);
+ if (!page)
+ return;
+ bit = file_page_offset(&bitmap->storage, chunk);
+
+ /* set the bit */
+ kaddr = kmap_atomic(page);
+ if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags))
+ set_bit(bit, kaddr);
+ else
+ set_bit_le(bit, kaddr);
+ kunmap_atomic(kaddr);
+ pr_debug("set file bit %lu page %lu\n", bit, page->index);
+ /* record page number so it gets flushed to disk when unplug occurs */
+ set_page_attr(bitmap, page->index, BITMAP_PAGE_DIRTY);
+}
+
+static void bitmap_file_clear_bit(struct bitmap *bitmap, sector_t block)
+{
+ unsigned long bit;
+ struct page *page;
+ void *paddr;
+ unsigned long chunk = block >> bitmap->counts.chunkshift;
+
+ page = filemap_get_page(&bitmap->storage, chunk);
+ if (!page)
+ return;
+ bit = file_page_offset(&bitmap->storage, chunk);
+ paddr = kmap_atomic(page);
+ if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags))
+ clear_bit(bit, paddr);
+ else
+ clear_bit_le(bit, paddr);
+ kunmap_atomic(paddr);
+ if (!test_page_attr(bitmap, page->index, BITMAP_PAGE_NEEDWRITE)) {
+ set_page_attr(bitmap, page->index, BITMAP_PAGE_PENDING);
+ bitmap->allclean = 0;
+ }
+}
+
+static int bitmap_file_test_bit(struct bitmap *bitmap, sector_t block)
+{
+ unsigned long bit;
+ struct page *page;
+ void *paddr;
+ unsigned long chunk = block >> bitmap->counts.chunkshift;
+ int set = 0;
+
+ page = filemap_get_page(&bitmap->storage, chunk);
+ if (!page)
+ return -EINVAL;
+ bit = file_page_offset(&bitmap->storage, chunk);
+ paddr = kmap_atomic(page);
+ if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags))
+ set = test_bit(bit, paddr);
+ else
+ set = test_bit_le(bit, paddr);
+ kunmap_atomic(paddr);
+ return set;
+}
+
+
+/* this gets called when the md device is ready to unplug its underlying
+ * (slave) device queues -- before we let any writes go down, we need to
+ * sync the dirty pages of the bitmap file to disk */
+void bitmap_unplug(struct bitmap *bitmap)
+{
+ unsigned long i;
+ int dirty, need_write;
+
+ if (!bitmap || !bitmap->storage.filemap ||
+ test_bit(BITMAP_STALE, &bitmap->flags))
+ return;
+
+ /* look at each page to see if there are any set bits that need to be
+ * flushed out to disk */
+ for (i = 0; i < bitmap->storage.file_pages; i++) {
+ if (!bitmap->storage.filemap)
+ return;
+ dirty = test_and_clear_page_attr(bitmap, i, BITMAP_PAGE_DIRTY);
+ need_write = test_and_clear_page_attr(bitmap, i,
+ BITMAP_PAGE_NEEDWRITE);
+ if (dirty || need_write) {
+ clear_page_attr(bitmap, i, BITMAP_PAGE_PENDING);
+ write_page(bitmap, bitmap->storage.filemap[i], 0);
+ }
+ }
+ if (bitmap->storage.file)
+ wait_event(bitmap->write_wait,
+ atomic_read(&bitmap->pending_writes)==0);
+ else
+ md_super_wait(bitmap->mddev);
+
+ if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags))
+ bitmap_file_kick(bitmap);
+}
+EXPORT_SYMBOL(bitmap_unplug);
+
+static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int needed);
+/* * bitmap_init_from_disk -- called at bitmap_create time to initialize
+ * the in-memory bitmap from the on-disk bitmap -- also, sets up the
+ * memory mapping of the bitmap file
+ * Special cases:
+ * if there's no bitmap file, or if the bitmap file had been
+ * previously kicked from the array, we mark all the bits as
+ * 1's in order to cause a full resync.
+ *
+ * We ignore all bits for sectors that end earlier than 'start'.
+ * This is used when reading an out-of-date bitmap...
+ */
+static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
+{
+ unsigned long i, chunks, index, oldindex, bit, node_offset = 0;
+ struct page *page = NULL;
+ unsigned long bit_cnt = 0;
+ struct file *file;
+ unsigned long offset;
+ int outofdate;
+ int ret = -ENOSPC;
+ void *paddr;
+ struct bitmap_storage *store = &bitmap->storage;
+
+ chunks = bitmap->counts.chunks;
+ file = store->file;
+
+ if (!file && !bitmap->mddev->bitmap_info.offset) {
+ /* No permanent bitmap - fill with '1s'. */
+ store->filemap = NULL;
+ store->file_pages = 0;
+ for (i = 0; i < chunks ; i++) {
+ /* if the disk bit is set, set the memory bit */
+ int needed = ((sector_t)(i+1) << (bitmap->counts.chunkshift)
+ >= start);
+ bitmap_set_memory_bits(bitmap,
+ (sector_t)i << bitmap->counts.chunkshift,
+ needed);
+ }
+ return 0;
+ }
+
+ outofdate = test_bit(BITMAP_STALE, &bitmap->flags);
+ if (outofdate)
+ printk(KERN_INFO "%s: bitmap file is out of date, doing full "
+ "recovery\n", bmname(bitmap));
+
+ if (file && i_size_read(file->f_mapping->host) < store->bytes) {
+ printk(KERN_INFO "%s: bitmap file too short %lu < %lu\n",
+ bmname(bitmap),
+ (unsigned long) i_size_read(file->f_mapping->host),
+ store->bytes);
+ goto err;
+ }
+
+ oldindex = ~0L;
+ offset = 0;
+ if (!bitmap->mddev->bitmap_info.external)
+ offset = sizeof(bitmap_super_t);
+
+ if (mddev_is_clustered(bitmap->mddev))
+ node_offset = bitmap->cluster_slot * (DIV_ROUND_UP(store->bytes, PAGE_SIZE));
+
+ for (i = 0; i < chunks; i++) {
+ int b;
+ index = file_page_index(&bitmap->storage, i);
+ bit = file_page_offset(&bitmap->storage, i);
+ if (index != oldindex) { /* this is a new page, read it in */
+ int count;
+ /* unmap the old page, we're done with it */
+ if (index == store->file_pages-1)
+ count = store->bytes - index * PAGE_SIZE;
+ else
+ count = PAGE_SIZE;
+ page = store->filemap[index];
+ if (file)
+ ret = read_page(file, index, bitmap,
+ count, page);
+ else
+ ret = read_sb_page(
+ bitmap->mddev,
+ bitmap->mddev->bitmap_info.offset,
+ page,
+ index + node_offset, count);
+
+ if (ret)
+ goto err;
+
+ oldindex = index;
+
+ if (outofdate) {
+ /*
+ * if bitmap is out of date, dirty the
+ * whole page and write it out
+ */
+ paddr = kmap_atomic(page);
+ memset(paddr + offset, 0xff,
+ PAGE_SIZE - offset);
+ kunmap_atomic(paddr);
+ write_page(bitmap, page, 1);
+
+ ret = -EIO;
+ if (test_bit(BITMAP_WRITE_ERROR,
+ &bitmap->flags))
+ goto err;
+ }
+ }
+ paddr = kmap_atomic(page);
+ if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags))
+ b = test_bit(bit, paddr);
+ else
+ b = test_bit_le(bit, paddr);
+ kunmap_atomic(paddr);
+ if (b) {
+ /* if the disk bit is set, set the memory bit */
+ int needed = ((sector_t)(i+1) << bitmap->counts.chunkshift
+ >= start);
+ bitmap_set_memory_bits(bitmap,
+ (sector_t)i << bitmap->counts.chunkshift,
+ needed);
+ bit_cnt++;
+ }
+ offset = 0;
+ }
+
+ printk(KERN_INFO "%s: bitmap initialized from disk: "
+ "read %lu pages, set %lu of %lu bits\n",
+ bmname(bitmap), store->file_pages,
+ bit_cnt, chunks);
+
+ return 0;
+
+ err:
+ printk(KERN_INFO "%s: bitmap initialisation failed: %d\n",
+ bmname(bitmap), ret);
+ return ret;
+}
+
+void bitmap_write_all(struct bitmap *bitmap)
+{
+ /* We don't actually write all bitmap blocks here,
+ * just flag them as needing to be written
+ */
+ int i;
+
+ if (!bitmap || !bitmap->storage.filemap)
+ return;
+ if (bitmap->storage.file)
+ /* Only one copy, so nothing needed */
+ return;
+
+ for (i = 0; i < bitmap->storage.file_pages; i++)
+ set_page_attr(bitmap, i,
+ BITMAP_PAGE_NEEDWRITE);
+ bitmap->allclean = 0;
+}
+
+static void bitmap_count_page(struct bitmap_counts *bitmap,
+ sector_t offset, int inc)
+{
+ sector_t chunk = offset >> bitmap->chunkshift;
+ unsigned long page = chunk >> PAGE_COUNTER_SHIFT;
+ bitmap->bp[page].count += inc;
+ bitmap_checkfree(bitmap, page);
+}
+
+static void bitmap_set_pending(struct bitmap_counts *bitmap, sector_t offset)
+{
+ sector_t chunk = offset >> bitmap->chunkshift;
+ unsigned long page = chunk >> PAGE_COUNTER_SHIFT;
+ struct bitmap_page *bp = &bitmap->bp[page];
+
+ if (!bp->pending)
+ bp->pending = 1;
+}
+
+static bitmap_counter_t *bitmap_get_counter(struct bitmap_counts *bitmap,
+ sector_t offset, sector_t *blocks,
+ int create);
+
+/*
+ * bitmap daemon -- periodically wakes up to clean bits and flush pages
+ * out to disk
+ */
+
+void bitmap_daemon_work(struct mddev *mddev)
+{
+ struct bitmap *bitmap;
+ unsigned long j;
+ unsigned long nextpage;
+ sector_t blocks;
+ struct bitmap_counts *counts;
+
+ /* Use a mutex to guard daemon_work against
+ * bitmap_destroy.
+ */
+ mutex_lock(&mddev->bitmap_info.mutex);
+ bitmap = mddev->bitmap;
+ if (bitmap == NULL) {
+ mutex_unlock(&mddev->bitmap_info.mutex);
+ return;
+ }
+ if (time_before(jiffies, bitmap->daemon_lastrun
+ + mddev->bitmap_info.daemon_sleep))
+ goto done;
+
+ bitmap->daemon_lastrun = jiffies;
+ if (bitmap->allclean) {
+ mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT;
+ goto done;
+ }
+ bitmap->allclean = 1;
+
+ /* Any file-page which is PENDING now needs to be written.
+ * So set NEEDWRITE now, then after we make any last-minute changes
+ * we will write it.
+ */
+ for (j = 0; j < bitmap->storage.file_pages; j++)
+ if (test_and_clear_page_attr(bitmap, j,
+ BITMAP_PAGE_PENDING))
+ set_page_attr(bitmap, j,
+ BITMAP_PAGE_NEEDWRITE);
+
+ if (bitmap->need_sync &&
+ mddev->bitmap_info.external == 0) {
+ /* Arrange for superblock update as well as
+ * other changes */
+ bitmap_super_t *sb;
+ bitmap->need_sync = 0;
+ if (bitmap->storage.filemap) {
+ sb = kmap_atomic(bitmap->storage.sb_page);
+ sb->events_cleared =
+ cpu_to_le64(bitmap->events_cleared);
+ kunmap_atomic(sb);
+ set_page_attr(bitmap, 0,
+ BITMAP_PAGE_NEEDWRITE);
+ }
+ }
+ /* Now look at the bitmap counters and if any are '2' or '1',
+ * decrement and handle accordingly.
+ */
+ counts = &bitmap->counts;
+ spin_lock_irq(&counts->lock);
+ nextpage = 0;
+ for (j = 0; j < counts->chunks; j++) {
+ bitmap_counter_t *bmc;
+ sector_t block = (sector_t)j << counts->chunkshift;
+
+ if (j == nextpage) {
+ nextpage += PAGE_COUNTER_RATIO;
+ if (!counts->bp[j >> PAGE_COUNTER_SHIFT].pending) {
+ j |= PAGE_COUNTER_MASK;
+ continue;
+ }
+ counts->bp[j >> PAGE_COUNTER_SHIFT].pending = 0;
+ }
+ bmc = bitmap_get_counter(counts,
+ block,
+ &blocks, 0);
+
+ if (!bmc) {
+ j |= PAGE_COUNTER_MASK;
+ continue;
+ }
+ if (*bmc == 1 && !bitmap->need_sync) {
+ /* We can clear the bit */
+ *bmc = 0;
+ bitmap_count_page(counts, block, -1);
+ bitmap_file_clear_bit(bitmap, block);
+ } else if (*bmc && *bmc <= 2) {
+ *bmc = 1;
+ bitmap_set_pending(counts, block);
+ bitmap->allclean = 0;
+ }
+ }
+ spin_unlock_irq(&counts->lock);
+
+ /* Now start writeout on any page in NEEDWRITE that isn't DIRTY.
+ * DIRTY pages need to be written by bitmap_unplug so it can wait
+ * for them.
+ * If we find any DIRTY page we stop there and let bitmap_unplug
+ * handle all the rest. This is important in the case where
+ * the first blocking holds the superblock and it has been updated.
+ * We mustn't write any other blocks before the superblock.
+ */
+ for (j = 0;
+ j < bitmap->storage.file_pages
+ && !test_bit(BITMAP_STALE, &bitmap->flags);
+ j++) {
+ if (test_page_attr(bitmap, j,
+ BITMAP_PAGE_DIRTY))
+ /* bitmap_unplug will handle the rest */
+ break;
+ if (test_and_clear_page_attr(bitmap, j,
+ BITMAP_PAGE_NEEDWRITE)) {
+ write_page(bitmap, bitmap->storage.filemap[j], 0);
+ }
+ }
+
+ done:
+ if (bitmap->allclean == 0)
+ mddev->thread->timeout =
+ mddev->bitmap_info.daemon_sleep;
+ mutex_unlock(&mddev->bitmap_info.mutex);
+}
+
+static bitmap_counter_t *bitmap_get_counter(struct bitmap_counts *bitmap,
+ sector_t offset, sector_t *blocks,
+ int create)
+__releases(bitmap->lock)
+__acquires(bitmap->lock)
+{
+ /* If 'create', we might release the lock and reclaim it.
+ * The lock must have been taken with interrupts enabled.
+ * If !create, we don't release the lock.
+ */
+ sector_t chunk = offset >> bitmap->chunkshift;
+ unsigned long page = chunk >> PAGE_COUNTER_SHIFT;
+ unsigned long pageoff = (chunk & PAGE_COUNTER_MASK) << COUNTER_BYTE_SHIFT;
+ sector_t csize;
+ int err;
+
+ err = bitmap_checkpage(bitmap, page, create);
+
+ if (bitmap->bp[page].hijacked ||
+ bitmap->bp[page].map == NULL)
+ csize = ((sector_t)1) << (bitmap->chunkshift +
+ PAGE_COUNTER_SHIFT - 1);
+ else
+ csize = ((sector_t)1) << bitmap->chunkshift;
+ *blocks = csize - (offset & (csize - 1));
+
+ if (err < 0)
+ return NULL;
+
+ /* now locked ... */
+
+ if (bitmap->bp[page].hijacked) { /* hijacked pointer */
+ /* should we use the first or second counter field
+ * of the hijacked pointer? */
+ int hi = (pageoff > PAGE_COUNTER_MASK);
+ return &((bitmap_counter_t *)
+ &bitmap->bp[page].map)[hi];
+ } else /* page is allocated */
+ return (bitmap_counter_t *)
+ &(bitmap->bp[page].map[pageoff]);
+}
+
+int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors, int behind)
+{
+ if (!bitmap)
+ return 0;
+
+ if (behind) {
+ int bw;
+ atomic_inc(&bitmap->behind_writes);
+ bw = atomic_read(&bitmap->behind_writes);
+ if (bw > bitmap->behind_writes_used)
+ bitmap->behind_writes_used = bw;
+
+ pr_debug("inc write-behind count %d/%lu\n",
+ bw, bitmap->mddev->bitmap_info.max_write_behind);
+ }
+
+ while (sectors) {
+ sector_t blocks;
+ bitmap_counter_t *bmc;
+
+ spin_lock_irq(&bitmap->counts.lock);
+ bmc = bitmap_get_counter(&bitmap->counts, offset, &blocks, 1);
+ if (!bmc) {
+ spin_unlock_irq(&bitmap->counts.lock);
+ return 0;
+ }
+
+ if (unlikely(COUNTER(*bmc) == COUNTER_MAX)) {
+ DEFINE_WAIT(__wait);
+ /* note that it is safe to do the prepare_to_wait
+ * after the test as long as we do it before dropping
+ * the spinlock.
+ */
+ prepare_to_wait(&bitmap->overflow_wait, &__wait,
+ TASK_UNINTERRUPTIBLE);
+ spin_unlock_irq(&bitmap->counts.lock);
+ schedule();
+ finish_wait(&bitmap->overflow_wait, &__wait);
+ continue;
+ }
+
+ switch (*bmc) {
+ case 0:
+ bitmap_file_set_bit(bitmap, offset);
+ bitmap_count_page(&bitmap->counts, offset, 1);
+ /* fall through */
+ case 1:
+ *bmc = 2;
+ }
+
+ (*bmc)++;
+
+ spin_unlock_irq(&bitmap->counts.lock);
+
+ offset += blocks;
+ if (sectors > blocks)
+ sectors -= blocks;
+ else
+ sectors = 0;
+ }
+ return 0;
+}
+EXPORT_SYMBOL(bitmap_startwrite);
+
+void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors,
+ int success, int behind)
+{
+ if (!bitmap)
+ return;
+ if (behind) {
+ if (atomic_dec_and_test(&bitmap->behind_writes))
+ wake_up(&bitmap->behind_wait);
+ pr_debug("dec write-behind count %d/%lu\n",
+ atomic_read(&bitmap->behind_writes),
+ bitmap->mddev->bitmap_info.max_write_behind);
+ }
+
+ while (sectors) {
+ sector_t blocks;
+ unsigned long flags;
+ bitmap_counter_t *bmc;
+
+ spin_lock_irqsave(&bitmap->counts.lock, flags);
+ bmc = bitmap_get_counter(&bitmap->counts, offset, &blocks, 0);
+ if (!bmc) {
+ spin_unlock_irqrestore(&bitmap->counts.lock, flags);
+ return;
+ }
+
+ if (success && !bitmap->mddev->degraded &&
+ bitmap->events_cleared < bitmap->mddev->events) {
+ bitmap->events_cleared = bitmap->mddev->events;
+ bitmap->need_sync = 1;
+ sysfs_notify_dirent_safe(bitmap->sysfs_can_clear);
+ }
+
+ if (!success && !NEEDED(*bmc))
+ *bmc |= NEEDED_MASK;
+
+ if (COUNTER(*bmc) == COUNTER_MAX)
+ wake_up(&bitmap->overflow_wait);
+
+ (*bmc)--;
+ if (*bmc <= 2) {
+ bitmap_set_pending(&bitmap->counts, offset);
+ bitmap->allclean = 0;
+ }
+ spin_unlock_irqrestore(&bitmap->counts.lock, flags);
+ offset += blocks;
+ if (sectors > blocks)
+ sectors -= blocks;
+ else
+ sectors = 0;
+ }
+}
+EXPORT_SYMBOL(bitmap_endwrite);
+
+static int __bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks,
+ int degraded)
+{
+ bitmap_counter_t *bmc;
+ int rv;
+ if (bitmap == NULL) {/* FIXME or bitmap set as 'failed' */
+ *blocks = 1024;
+ return 1; /* always resync if no bitmap */
+ }
+ spin_lock_irq(&bitmap->counts.lock);
+ bmc = bitmap_get_counter(&bitmap->counts, offset, blocks, 0);
+ rv = 0;
+ if (bmc) {
+ /* locked */
+ if (RESYNC(*bmc))
+ rv = 1;
+ else if (NEEDED(*bmc)) {
+ rv = 1;
+ if (!degraded) { /* don't set/clear bits if degraded */
+ *bmc |= RESYNC_MASK;
+ *bmc &= ~NEEDED_MASK;
+ }
+ }
+ }
+ spin_unlock_irq(&bitmap->counts.lock);
+ return rv;
+}
+
+int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks,
+ int degraded)
+{
+ /* bitmap_start_sync must always report on multiples of whole
+ * pages, otherwise resync (which is very PAGE_SIZE based) will
+ * get confused.
+ * So call __bitmap_start_sync repeatedly (if needed) until
+ * At least PAGE_SIZE>>9 blocks are covered.
+ * Return the 'or' of the result.
+ */
+ int rv = 0;
+ sector_t blocks1;
+
+ *blocks = 0;
+ while (*blocks < (PAGE_SIZE>>9)) {
+ rv |= __bitmap_start_sync(bitmap, offset,
+ &blocks1, degraded);
+ offset += blocks1;
+ *blocks += blocks1;
+ }
+ return rv;
+}
+EXPORT_SYMBOL(bitmap_start_sync);
+
+void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int aborted)
+{
+ bitmap_counter_t *bmc;
+ unsigned long flags;
+
+ if (bitmap == NULL) {
+ *blocks = 1024;
+ return;
+ }
+ spin_lock_irqsave(&bitmap->counts.lock, flags);
+ bmc = bitmap_get_counter(&bitmap->counts, offset, blocks, 0);
+ if (bmc == NULL)
+ goto unlock;
+ /* locked */
+ if (RESYNC(*bmc)) {
+ *bmc &= ~RESYNC_MASK;
+
+ if (!NEEDED(*bmc) && aborted)
+ *bmc |= NEEDED_MASK;
+ else {
+ if (*bmc <= 2) {
+ bitmap_set_pending(&bitmap->counts, offset);
+ bitmap->allclean = 0;
+ }
+ }
+ }
+ unlock:
+ spin_unlock_irqrestore(&bitmap->counts.lock, flags);
+}
+EXPORT_SYMBOL(bitmap_end_sync);
+
+void bitmap_close_sync(struct bitmap *bitmap)
+{
+ /* Sync has finished, and any bitmap chunks that weren't synced
+ * properly have been aborted. It remains to us to clear the
+ * RESYNC bit wherever it is still on
+ */
+ sector_t sector = 0;
+ sector_t blocks;
+ if (!bitmap)
+ return;
+ while (sector < bitmap->mddev->resync_max_sectors) {
+ bitmap_end_sync(bitmap, sector, &blocks, 0);
+ sector += blocks;
+ }
+}
+EXPORT_SYMBOL(bitmap_close_sync);
+
+void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector)
+{
+ sector_t s = 0;
+ sector_t blocks;
+
+ if (!bitmap)
+ return;
+ if (sector == 0) {
+ bitmap->last_end_sync = jiffies;
+ return;
+ }
+ if (time_before(jiffies, (bitmap->last_end_sync
+ + bitmap->mddev->bitmap_info.daemon_sleep)))
+ return;
+ wait_event(bitmap->mddev->recovery_wait,
+ atomic_read(&bitmap->mddev->recovery_active) == 0);
+
+ bitmap->mddev->curr_resync_completed = sector;
+ set_bit(MD_CHANGE_CLEAN, &bitmap->mddev->flags);
+ sector &= ~((1ULL << bitmap->counts.chunkshift) - 1);
+ s = 0;
+ while (s < sector && s < bitmap->mddev->resync_max_sectors) {
+ bitmap_end_sync(bitmap, s, &blocks, 0);
+ s += blocks;
+ }
+ bitmap->last_end_sync = jiffies;
+ sysfs_notify(&bitmap->mddev->kobj, NULL, "sync_completed");
+}
+EXPORT_SYMBOL(bitmap_cond_end_sync);
+
+static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int needed)
+{
+ /* For each chunk covered by any of these sectors, set the
+ * counter to 2 and possibly set resync_needed. They should all
+ * be 0 at this point
+ */
+
+ sector_t secs;
+ bitmap_counter_t *bmc;
+ spin_lock_irq(&bitmap->counts.lock);
+ bmc = bitmap_get_counter(&bitmap->counts, offset, &secs, 1);
+ if (!bmc) {
+ spin_unlock_irq(&bitmap->counts.lock);
+ return;
+ }
+ if (!*bmc) {
+ *bmc = 2;
+ bitmap_count_page(&bitmap->counts, offset, 1);
+ bitmap_set_pending(&bitmap->counts, offset);
+ bitmap->allclean = 0;
+ }
+ if (needed)
+ *bmc |= NEEDED_MASK;
+ spin_unlock_irq(&bitmap->counts.lock);
+}
+
+/* dirty the memory and file bits for bitmap chunks "s" to "e" */
+void bitmap_dirty_bits(struct bitmap *bitmap, unsigned long s, unsigned long e)
+{
+ unsigned long chunk;
+
+ for (chunk = s; chunk <= e; chunk++) {
+ sector_t sec = (sector_t)chunk << bitmap->counts.chunkshift;
+ bitmap_set_memory_bits(bitmap, sec, 1);
+ bitmap_file_set_bit(bitmap, sec);
+ if (sec < bitmap->mddev->recovery_cp)
+ /* We are asserting that the array is dirty,
+ * so move the recovery_cp address back so
+ * that it is obvious that it is dirty
+ */
+ bitmap->mddev->recovery_cp = sec;
+ }
+}
+
+/*
+ * flush out any pending updates
+ */
+void bitmap_flush(struct mddev *mddev)
+{
+ struct bitmap *bitmap = mddev->bitmap;
+ long sleep;
+
+ if (!bitmap) /* there was no bitmap */
+ return;
+
+ /* run the daemon_work three time to ensure everything is flushed
+ * that can be
+ */
+ sleep = mddev->bitmap_info.daemon_sleep * 2;
+ bitmap->daemon_lastrun -= sleep;
+ bitmap_daemon_work(mddev);
+ bitmap->daemon_lastrun -= sleep;
+ bitmap_daemon_work(mddev);
+ bitmap->daemon_lastrun -= sleep;
+ bitmap_daemon_work(mddev);
+ bitmap_update_sb(bitmap);
+}
+
+/*
+ * free memory that was allocated
+ */
+static void bitmap_free(struct bitmap *bitmap)
+{
+ unsigned long k, pages;
+ struct bitmap_page *bp;
+
+ if (!bitmap) /* there was no bitmap */
+ return;
+
+ if (mddev_is_clustered(bitmap->mddev) && bitmap->mddev->cluster_info &&
+ bitmap->cluster_slot == md_cluster_ops->slot_number(bitmap->mddev))
+ md_cluster_stop(bitmap->mddev);
+
+ /* Shouldn't be needed - but just in case.... */
+ wait_event(bitmap->write_wait,
+ atomic_read(&bitmap->pending_writes) == 0);
+
+ /* release the bitmap file */
+ bitmap_file_unmap(&bitmap->storage);
+
+ bp = bitmap->counts.bp;
+ pages = bitmap->counts.pages;
+
+ /* free all allocated memory */
+
+ if (bp) /* deallocate the page memory */
+ for (k = 0; k < pages; k++)
+ if (bp[k].map && !bp[k].hijacked)
+ kfree(bp[k].map);
+ kfree(bp);
+ kfree(bitmap);
+}
+
+void bitmap_destroy(struct mddev *mddev)
+{
+ struct bitmap *bitmap = mddev->bitmap;
+
+ if (!bitmap) /* there was no bitmap */
+ return;
+
+ mutex_lock(&mddev->bitmap_info.mutex);
+ spin_lock(&mddev->lock);
+ mddev->bitmap = NULL; /* disconnect from the md device */
+ spin_unlock(&mddev->lock);
+ mutex_unlock(&mddev->bitmap_info.mutex);
+ if (mddev->thread)
+ mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT;
+
+ if (bitmap->sysfs_can_clear)
+ sysfs_put(bitmap->sysfs_can_clear);
+
+ bitmap_free(bitmap);
+}
+
+/*
+ * initialize the bitmap structure
+ * if this returns an error, bitmap_destroy must be called to do clean up
+ */
+struct bitmap *bitmap_create(struct mddev *mddev, int slot)
+{
+ struct bitmap *bitmap;
+ sector_t blocks = mddev->resync_max_sectors;
+ struct file *file = mddev->bitmap_info.file;
+ int err;
+ struct kernfs_node *bm = NULL;
+
+ BUILD_BUG_ON(sizeof(bitmap_super_t) != 256);
+
+ BUG_ON(file && mddev->bitmap_info.offset);
+
+ bitmap = kzalloc(sizeof(*bitmap), GFP_KERNEL);
+ if (!bitmap)
+ return ERR_PTR(-ENOMEM);
+
+ spin_lock_init(&bitmap->counts.lock);
+ atomic_set(&bitmap->pending_writes, 0);
+ init_waitqueue_head(&bitmap->write_wait);
+ init_waitqueue_head(&bitmap->overflow_wait);
+ init_waitqueue_head(&bitmap->behind_wait);
+
+ bitmap->mddev = mddev;
+ bitmap->cluster_slot = slot;
+
+ if (mddev->kobj.sd)
+ bm = sysfs_get_dirent(mddev->kobj.sd, "bitmap");
+ if (bm) {
+ bitmap->sysfs_can_clear = sysfs_get_dirent(bm, "can_clear");
+ sysfs_put(bm);
+ } else
+ bitmap->sysfs_can_clear = NULL;
+
+ bitmap->storage.file = file;
+ if (file) {
+ get_file(file);
+ /* As future accesses to this file will use bmap,
+ * and bypass the page cache, we must sync the file
+ * first.
+ */
+ vfs_fsync(file, 1);
+ }
+ /* read superblock from bitmap file (this sets mddev->bitmap_info.chunksize) */
+ if (!mddev->bitmap_info.external) {
+ /*
+ * If 'MD_ARRAY_FIRST_USE' is set, then device-mapper is
+ * instructing us to create a new on-disk bitmap instance.
+ */
+ if (test_and_clear_bit(MD_ARRAY_FIRST_USE, &mddev->flags))
+ err = bitmap_new_disk_sb(bitmap);
+ else
+ err = bitmap_read_sb(bitmap);
+ } else {
+ err = 0;
+ if (mddev->bitmap_info.chunksize == 0 ||
+ mddev->bitmap_info.daemon_sleep == 0)
+ /* chunksize and time_base need to be
+ * set first. */
+ err = -EINVAL;
+ }
+ if (err)
+ goto error;
+
+ bitmap->daemon_lastrun = jiffies;
+ err = bitmap_resize(bitmap, blocks, mddev->bitmap_info.chunksize, 1);
+ if (err)
+ goto error;
+
+ printk(KERN_INFO "created bitmap (%lu pages) for device %s\n",
+ bitmap->counts.pages, bmname(bitmap));
+
+ err = test_bit(BITMAP_WRITE_ERROR, &bitmap->flags) ? -EIO : 0;
+ if (err)
+ goto error;
+
+ return bitmap;
+ error:
+ bitmap_free(bitmap);
+ return ERR_PTR(err);
+}
+
+int bitmap_load(struct mddev *mddev)
+{
+ int err = 0;
+ sector_t start = 0;
+ sector_t sector = 0;
+ struct bitmap *bitmap = mddev->bitmap;
+
+ if (!bitmap)
+ goto out;
+
+ /* Clear out old bitmap info first: Either there is none, or we
+ * are resuming after someone else has possibly changed things,
+ * so we should forget old cached info.
+ * All chunks should be clean, but some might need_sync.
+ */
+ while (sector < mddev->resync_max_sectors) {
+ sector_t blocks;
+ bitmap_start_sync(bitmap, sector, &blocks, 0);
+ sector += blocks;
+ }
+ bitmap_close_sync(bitmap);
+
+ if (mddev->degraded == 0
+ || bitmap->events_cleared == mddev->events)
+ /* no need to keep dirty bits to optimise a
+ * re-add of a missing device */
+ start = mddev->recovery_cp;
+
+ mutex_lock(&mddev->bitmap_info.mutex);
+ err = bitmap_init_from_disk(bitmap, start);
+ mutex_unlock(&mddev->bitmap_info.mutex);
+
+ if (err)
+ goto out;
+ clear_bit(BITMAP_STALE, &bitmap->flags);
+
+ /* Kick recovery in case any bits were set */
+ set_bit(MD_RECOVERY_NEEDED, &bitmap->mddev->recovery);
+
+ mddev->thread->timeout = mddev->bitmap_info.daemon_sleep;
+ md_wakeup_thread(mddev->thread);
+
+ bitmap_update_sb(bitmap);
+
+ if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags))
+ err = -EIO;
+out:
+ return err;
+}
+EXPORT_SYMBOL_GPL(bitmap_load);
+
+/* Loads the bitmap associated with slot and copies the resync information
+ * to our bitmap
+ */
+int bitmap_copy_from_slot(struct mddev *mddev, int slot,
+ sector_t *low, sector_t *high, bool clear_bits)
+{
+ int rv = 0, i, j;
+ sector_t block, lo = 0, hi = 0;
+ struct bitmap_counts *counts;
+ struct bitmap *bitmap = bitmap_create(mddev, slot);
+
+ if (IS_ERR(bitmap))
+ return PTR_ERR(bitmap);
+
+ rv = bitmap_read_sb(bitmap);
+ if (rv)
+ goto err;
+
+ rv = bitmap_init_from_disk(bitmap, 0);
+ if (rv)
+ goto err;
+
+ counts = &bitmap->counts;
+ for (j = 0; j < counts->chunks; j++) {
+ block = (sector_t)j << counts->chunkshift;
+ if (bitmap_file_test_bit(bitmap, block)) {
+ if (!lo)
+ lo = block;
+ hi = block;
+ bitmap_file_clear_bit(bitmap, block);
+ bitmap_set_memory_bits(mddev->bitmap, block, 1);
+ bitmap_file_set_bit(mddev->bitmap, block);
+ }
+ }
+
+ if (clear_bits) {
+ bitmap_update_sb(bitmap);
+ /* Setting this for the ev_page should be enough.
+ * And we do not require both write_all and PAGE_DIRT either
+ */
+ for (i = 0; i < bitmap->storage.file_pages; i++)
+ set_page_attr(bitmap, i, BITMAP_PAGE_DIRTY);
+ bitmap_write_all(bitmap);
+ bitmap_unplug(bitmap);
+ }
+ *low = lo;
+ *high = hi;
+err:
+ bitmap_free(bitmap);
+ return rv;
+}
+EXPORT_SYMBOL_GPL(bitmap_copy_from_slot);
+
+
+void bitmap_status(struct seq_file *seq, struct bitmap *bitmap)
+{
+ unsigned long chunk_kb;
+ struct bitmap_counts *counts;
+
+ if (!bitmap)
+ return;
+
+ counts = &bitmap->counts;
+
+ chunk_kb = bitmap->mddev->bitmap_info.chunksize >> 10;
+ seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], "
+ "%lu%s chunk",
+ counts->pages - counts->missing_pages,
+ counts->pages,
+ (counts->pages - counts->missing_pages)
+ << (PAGE_SHIFT - 10),
+ chunk_kb ? chunk_kb : bitmap->mddev->bitmap_info.chunksize,
+ chunk_kb ? "KB" : "B");
+ if (bitmap->storage.file) {
+ seq_printf(seq, ", file: ");
+ seq_path(seq, &bitmap->storage.file->f_path, " \t\n");
+ }
+
+ seq_printf(seq, "\n");
+}
+
+int bitmap_resize(struct bitmap *bitmap, sector_t blocks,
+ int chunksize, int init)
+{
+ /* If chunk_size is 0, choose an appropriate chunk size.
+ * Then possibly allocate new storage space.
+ * Then quiesce, copy bits, replace bitmap, and re-start
+ *
+ * This function is called both to set up the initial bitmap
+ * and to resize the bitmap while the array is active.
+ * If this happens as a result of the array being resized,
+ * chunksize will be zero, and we need to choose a suitable
+ * chunksize, otherwise we use what we are given.
+ */
+ struct bitmap_storage store;
+ struct bitmap_counts old_counts;
+ unsigned long chunks;
+ sector_t block;
+ sector_t old_blocks, new_blocks;
+ int chunkshift;
+ int ret = 0;
+ long pages;
+ struct bitmap_page *new_bp;
+
+ if (chunksize == 0) {
+ /* If there is enough space, leave the chunk size unchanged,
+ * else increase by factor of two until there is enough space.
+ */
+ long bytes;
+ long space = bitmap->mddev->bitmap_info.space;
+
+ if (space == 0) {
+ /* We don't know how much space there is, so limit
+ * to current size - in sectors.
+ */
+ bytes = DIV_ROUND_UP(bitmap->counts.chunks, 8);
+ if (!bitmap->mddev->bitmap_info.external)
+ bytes += sizeof(bitmap_super_t);
+ space = DIV_ROUND_UP(bytes, 512);
+ bitmap->mddev->bitmap_info.space = space;
+ }
+ chunkshift = bitmap->counts.chunkshift;
+ chunkshift--;
+ do {
+ /* 'chunkshift' is shift from block size to chunk size */
+ chunkshift++;
+ chunks = DIV_ROUND_UP_SECTOR_T(blocks, 1 << chunkshift);
+ bytes = DIV_ROUND_UP(chunks, 8);
+ if (!bitmap->mddev->bitmap_info.external)
+ bytes += sizeof(bitmap_super_t);
+ } while (bytes > (space << 9));
+ } else
+ chunkshift = ffz(~chunksize) - BITMAP_BLOCK_SHIFT;
+
+ chunks = DIV_ROUND_UP_SECTOR_T(blocks, 1 << chunkshift);
+ memset(&store, 0, sizeof(store));
+ if (bitmap->mddev->bitmap_info.offset || bitmap->mddev->bitmap_info.file)
+ ret = bitmap_storage_alloc(&store, chunks,
+ !bitmap->mddev->bitmap_info.external,
+ bitmap->cluster_slot);
+ if (ret)
+ goto err;
+
+ pages = DIV_ROUND_UP(chunks, PAGE_COUNTER_RATIO);
+
+ new_bp = kzalloc(pages * sizeof(*new_bp), GFP_KERNEL);
+ ret = -ENOMEM;
+ if (!new_bp) {
+ bitmap_file_unmap(&store);
+ goto err;
+ }
+
+ if (!init)
+ bitmap->mddev->pers->quiesce(bitmap->mddev, 1);
+
+ store.file = bitmap->storage.file;
+ bitmap->storage.file = NULL;
+
+ if (store.sb_page && bitmap->storage.sb_page)
+ memcpy(page_address(store.sb_page),
+ page_address(bitmap->storage.sb_page),
+ sizeof(bitmap_super_t));
+ bitmap_file_unmap(&bitmap->storage);
+ bitmap->storage = store;
+
+ old_counts = bitmap->counts;
+ bitmap->counts.bp = new_bp;
+ bitmap->counts.pages = pages;
+ bitmap->counts.missing_pages = pages;
+ bitmap->counts.chunkshift = chunkshift;
+ bitmap->counts.chunks = chunks;
+ bitmap->mddev->bitmap_info.chunksize = 1 << (chunkshift +
+ BITMAP_BLOCK_SHIFT);
+
+ blocks = min(old_counts.chunks << old_counts.chunkshift,
+ chunks << chunkshift);
+
+ spin_lock_irq(&bitmap->counts.lock);
+ for (block = 0; block < blocks; ) {
+ bitmap_counter_t *bmc_old, *bmc_new;
+ int set;
+
+ bmc_old = bitmap_get_counter(&old_counts, block,
+ &old_blocks, 0);
+ set = bmc_old && NEEDED(*bmc_old);
+
+ if (set) {
+ bmc_new = bitmap_get_counter(&bitmap->counts, block,
+ &new_blocks, 1);
+ if (*bmc_new == 0) {
+ /* need to set on-disk bits too. */
+ sector_t end = block + new_blocks;
+ sector_t start = block >> chunkshift;
+ start <<= chunkshift;
+ while (start < end) {
+ bitmap_file_set_bit(bitmap, block);
+ start += 1 << chunkshift;
+ }
+ *bmc_new = 2;
+ bitmap_count_page(&bitmap->counts,
+ block, 1);
+ bitmap_set_pending(&bitmap->counts,
+ block);
+ }
+ *bmc_new |= NEEDED_MASK;
+ if (new_blocks < old_blocks)
+ old_blocks = new_blocks;
+ }
+ block += old_blocks;
+ }
+
+ if (!init) {
+ int i;
+ while (block < (chunks << chunkshift)) {
+ bitmap_counter_t *bmc;
+ bmc = bitmap_get_counter(&bitmap->counts, block,
+ &new_blocks, 1);
+ if (bmc) {
+ /* new space. It needs to be resynced, so
+ * we set NEEDED_MASK.
+ */
+ if (*bmc == 0) {
+ *bmc = NEEDED_MASK | 2;
+ bitmap_count_page(&bitmap->counts,
+ block, 1);
+ bitmap_set_pending(&bitmap->counts,
+ block);
+ }
+ }
+ block += new_blocks;
+ }
+ for (i = 0; i < bitmap->storage.file_pages; i++)
+ set_page_attr(bitmap, i, BITMAP_PAGE_DIRTY);
+ }
+ spin_unlock_irq(&bitmap->counts.lock);
+
+ if (!init) {
+ bitmap_unplug(bitmap);
+ bitmap->mddev->pers->quiesce(bitmap->mddev, 0);
+ }
+ ret = 0;
+err:
+ return ret;
+}
+EXPORT_SYMBOL_GPL(bitmap_resize);
+
+static ssize_t
+location_show(struct mddev *mddev, char *page)
+{
+ ssize_t len;
+ if (mddev->bitmap_info.file)
+ len = sprintf(page, "file");
+ else if (mddev->bitmap_info.offset)
+ len = sprintf(page, "%+lld", (long long)mddev->bitmap_info.offset);
+ else
+ len = sprintf(page, "none");
+ len += sprintf(page+len, "\n");
+ return len;
+}
+
+static ssize_t
+location_store(struct mddev *mddev, const char *buf, size_t len)
+{
+
+ if (mddev->pers) {
+ if (!mddev->pers->quiesce)
+ return -EBUSY;
+ if (mddev->recovery || mddev->sync_thread)
+ return -EBUSY;
+ }
+
+ if (mddev->bitmap || mddev->bitmap_info.file ||
+ mddev->bitmap_info.offset) {
+ /* bitmap already configured. Only option is to clear it */
+ if (strncmp(buf, "none", 4) != 0)
+ return -EBUSY;
+ if (mddev->pers) {
+ mddev->pers->quiesce(mddev, 1);
+ bitmap_destroy(mddev);
+ mddev->pers->quiesce(mddev, 0);
+ }
+ mddev->bitmap_info.offset = 0;
+ if (mddev->bitmap_info.file) {
+ struct file *f = mddev->bitmap_info.file;
+ mddev->bitmap_info.file = NULL;
+ fput(f);
+ }
+ } else {
+ /* No bitmap, OK to set a location */
+ long long offset;
+ if (strncmp(buf, "none", 4) == 0)
+ /* nothing to be done */;
+ else if (strncmp(buf, "file:", 5) == 0) {
+ /* Not supported yet */
+ return -EINVAL;
+ } else {
+ int rv;
+ if (buf[0] == '+')
+ rv = kstrtoll(buf+1, 10, &offset);
+ else
+ rv = kstrtoll(buf, 10, &offset);
+ if (rv)
+ return rv;
+ if (offset == 0)
+ return -EINVAL;
+ if (mddev->bitmap_info.external == 0 &&
+ mddev->major_version == 0 &&
+ offset != mddev->bitmap_info.default_offset)
+ return -EINVAL;
+ mddev->bitmap_info.offset = offset;
+ if (mddev->pers) {
+ struct bitmap *bitmap;
+ mddev->pers->quiesce(mddev, 1);
+ bitmap = bitmap_create(mddev, -1);
+ if (IS_ERR(bitmap))
+ rv = PTR_ERR(bitmap);
+ else {
+ mddev->bitmap = bitmap;
+ rv = bitmap_load(mddev);
+ if (rv) {
+ bitmap_destroy(mddev);
+ mddev->bitmap_info.offset = 0;
+ }
+ }
+ mddev->pers->quiesce(mddev, 0);
+ if (rv)
+ return rv;
+ }
+ }
+ }
+ if (!mddev->external) {
+ /* Ensure new bitmap info is stored in
+ * metadata promptly.
+ */
+ set_bit(MD_CHANGE_DEVS, &mddev->flags);
+ md_wakeup_thread(mddev->thread);
+ }
+ return len;
+}
+
+static struct md_sysfs_entry bitmap_location =
+__ATTR(location, S_IRUGO|S_IWUSR, location_show, location_store);
+
+/* 'bitmap/space' is the space available at 'location' for the
+ * bitmap. This allows the kernel to know when it is safe to
+ * resize the bitmap to match a resized array.
+ */
+static ssize_t
+space_show(struct mddev *mddev, char *page)
+{
+ return sprintf(page, "%lu\n", mddev->bitmap_info.space);
+}
+
+static ssize_t
+space_store(struct mddev *mddev, const char *buf, size_t len)
+{
+ unsigned long sectors;
+ int rv;
+
+ rv = kstrtoul(buf, 10, &sectors);
+ if (rv)
+ return rv;
+
+ if (sectors == 0)
+ return -EINVAL;
+
+ if (mddev->bitmap &&
+ sectors < (mddev->bitmap->storage.bytes + 511) >> 9)
+ return -EFBIG; /* Bitmap is too big for this small space */
+
+ /* could make sure it isn't too big, but that isn't really
+ * needed - user-space should be careful.
+ */
+ mddev->bitmap_info.space = sectors;
+ return len;
+}
+
+static struct md_sysfs_entry bitmap_space =
+__ATTR(space, S_IRUGO|S_IWUSR, space_show, space_store);
+
+static ssize_t
+timeout_show(struct mddev *mddev, char *page)
+{
+ ssize_t len;
+ unsigned long secs = mddev->bitmap_info.daemon_sleep / HZ;
+ unsigned long jifs = mddev->bitmap_info.daemon_sleep % HZ;
+
+ len = sprintf(page, "%lu", secs);
+ if (jifs)
+ len += sprintf(page+len, ".%03u", jiffies_to_msecs(jifs));
+ len += sprintf(page+len, "\n");
+ return len;
+}
+
+static ssize_t
+timeout_store(struct mddev *mddev, const char *buf, size_t len)
+{
+ /* timeout can be set at any time */
+ unsigned long timeout;
+ int rv = strict_strtoul_scaled(buf, &timeout, 4);
+ if (rv)
+ return rv;
+
+ /* just to make sure we don't overflow... */
+ if (timeout >= LONG_MAX / HZ)
+ return -EINVAL;
+
+ timeout = timeout * HZ / 10000;
+
+ if (timeout >= MAX_SCHEDULE_TIMEOUT)
+ timeout = MAX_SCHEDULE_TIMEOUT-1;
+ if (timeout < 1)
+ timeout = 1;
+ mddev->bitmap_info.daemon_sleep = timeout;
+ if (mddev->thread) {
+ /* if thread->timeout is MAX_SCHEDULE_TIMEOUT, then
+ * the bitmap is all clean and we don't need to
+ * adjust the timeout right now
+ */
+ if (mddev->thread->timeout < MAX_SCHEDULE_TIMEOUT) {
+ mddev->thread->timeout = timeout;
+ md_wakeup_thread(mddev->thread);
+ }
+ }
+ return len;
+}
+
+static struct md_sysfs_entry bitmap_timeout =
+__ATTR(time_base, S_IRUGO|S_IWUSR, timeout_show, timeout_store);
+
+static ssize_t
+backlog_show(struct mddev *mddev, char *page)
+{
+ return sprintf(page, "%lu\n", mddev->bitmap_info.max_write_behind);
+}
+
+static ssize_t
+backlog_store(struct mddev *mddev, const char *buf, size_t len)
+{
+ unsigned long backlog;
+ int rv = kstrtoul(buf, 10, &backlog);
+ if (rv)
+ return rv;
+ if (backlog > COUNTER_MAX)
+ return -EINVAL;
+ mddev->bitmap_info.max_write_behind = backlog;
+ return len;
+}
+
+static struct md_sysfs_entry bitmap_backlog =
+__ATTR(backlog, S_IRUGO|S_IWUSR, backlog_show, backlog_store);
+
+static ssize_t
+chunksize_show(struct mddev *mddev, char *page)
+{
+ return sprintf(page, "%lu\n", mddev->bitmap_info.chunksize);
+}
+
+static ssize_t
+chunksize_store(struct mddev *mddev, const char *buf, size_t len)
+{
+ /* Can only be changed when no bitmap is active */
+ int rv;
+ unsigned long csize;
+ if (mddev->bitmap)
+ return -EBUSY;
+ rv = kstrtoul(buf, 10, &csize);
+ if (rv)
+ return rv;
+ if (csize < 512 ||
+ !is_power_of_2(csize))
+ return -EINVAL;
+ mddev->bitmap_info.chunksize = csize;
+ return len;
+}
+
+static struct md_sysfs_entry bitmap_chunksize =
+__ATTR(chunksize, S_IRUGO|S_IWUSR, chunksize_show, chunksize_store);
+
+static ssize_t metadata_show(struct mddev *mddev, char *page)
+{
+ if (mddev_is_clustered(mddev))
+ return sprintf(page, "clustered\n");
+ return sprintf(page, "%s\n", (mddev->bitmap_info.external
+ ? "external" : "internal"));
+}
+
+static ssize_t metadata_store(struct mddev *mddev, const char *buf, size_t len)
+{
+ if (mddev->bitmap ||
+ mddev->bitmap_info.file ||
+ mddev->bitmap_info.offset)
+ return -EBUSY;
+ if (strncmp(buf, "external", 8) == 0)
+ mddev->bitmap_info.external = 1;
+ else if ((strncmp(buf, "internal", 8) == 0) ||
+ (strncmp(buf, "clustered", 9) == 0))
+ mddev->bitmap_info.external = 0;
+ else
+ return -EINVAL;
+ return len;
+}
+
+static struct md_sysfs_entry bitmap_metadata =
+__ATTR(metadata, S_IRUGO|S_IWUSR, metadata_show, metadata_store);
+
+static ssize_t can_clear_show(struct mddev *mddev, char *page)
+{
+ int len;
+ spin_lock(&mddev->lock);
+ if (mddev->bitmap)
+ len = sprintf(page, "%s\n", (mddev->bitmap->need_sync ?
+ "false" : "true"));
+ else
+ len = sprintf(page, "\n");
+ spin_unlock(&mddev->lock);
+ return len;
+}
+
+static ssize_t can_clear_store(struct mddev *mddev, const char *buf, size_t len)
+{
+ if (mddev->bitmap == NULL)
+ return -ENOENT;
+ if (strncmp(buf, "false", 5) == 0)
+ mddev->bitmap->need_sync = 1;
+ else if (strncmp(buf, "true", 4) == 0) {
+ if (mddev->degraded)
+ return -EBUSY;
+ mddev->bitmap->need_sync = 0;
+ } else
+ return -EINVAL;
+ return len;
+}
+
+static struct md_sysfs_entry bitmap_can_clear =
+__ATTR(can_clear, S_IRUGO|S_IWUSR, can_clear_show, can_clear_store);
+
+static ssize_t
+behind_writes_used_show(struct mddev *mddev, char *page)
+{
+ ssize_t ret;
+ spin_lock(&mddev->lock);
+ if (mddev->bitmap == NULL)
+ ret = sprintf(page, "0\n");
+ else
+ ret = sprintf(page, "%lu\n",
+ mddev->bitmap->behind_writes_used);
+ spin_unlock(&mddev->lock);
+ return ret;
+}
+
+static ssize_t
+behind_writes_used_reset(struct mddev *mddev, const char *buf, size_t len)
+{
+ if (mddev->bitmap)
+ mddev->bitmap->behind_writes_used = 0;
+ return len;
+}
+
+static struct md_sysfs_entry max_backlog_used =
+__ATTR(max_backlog_used, S_IRUGO | S_IWUSR,
+ behind_writes_used_show, behind_writes_used_reset);
+
+static struct attribute *md_bitmap_attrs[] = {
+ &bitmap_location.attr,
+ &bitmap_space.attr,
+ &bitmap_timeout.attr,
+ &bitmap_backlog.attr,
+ &bitmap_chunksize.attr,
+ &bitmap_metadata.attr,
+ &bitmap_can_clear.attr,
+ &max_backlog_used.attr,
+ NULL
+};
+struct attribute_group md_bitmap_group = {
+ .name = "bitmap",
+ .attrs = md_bitmap_attrs,
+};
+
diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h
new file mode 100644
index 000000000..f1f4dd010
--- /dev/null
+++ b/drivers/md/bitmap.h
@@ -0,0 +1,269 @@
+/*
+ * bitmap.h: Copyright (C) Peter T. Breuer (ptb@ot.uc3m.es) 2003
+ *
+ * additions: Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.
+ */
+#ifndef BITMAP_H
+#define BITMAP_H 1
+
+#define BITMAP_MAJOR_LO 3
+/* version 4 insists the bitmap is in little-endian order
+ * with version 3, it is host-endian which is non-portable
+ */
+#define BITMAP_MAJOR_HI 4
+#define BITMAP_MAJOR_HOSTENDIAN 3
+
+/*
+ * in-memory bitmap:
+ *
+ * Use 16 bit block counters to track pending writes to each "chunk".
+ * The 2 high order bits are special-purpose, the first is a flag indicating
+ * whether a resync is needed. The second is a flag indicating whether a
+ * resync is active.
+ * This means that the counter is actually 14 bits:
+ *
+ * +--------+--------+------------------------------------------------+
+ * | resync | resync | counter |
+ * | needed | active | |
+ * | (0-1) | (0-1) | (0-16383) |
+ * +--------+--------+------------------------------------------------+
+ *
+ * The "resync needed" bit is set when:
+ * a '1' bit is read from storage at startup.
+ * a write request fails on some drives
+ * a resync is aborted on a chunk with 'resync active' set
+ * It is cleared (and resync-active set) when a resync starts across all drives
+ * of the chunk.
+ *
+ *
+ * The "resync active" bit is set when:
+ * a resync is started on all drives, and resync_needed is set.
+ * resync_needed will be cleared (as long as resync_active wasn't already set).
+ * It is cleared when a resync completes.
+ *
+ * The counter counts pending write requests, plus the on-disk bit.
+ * When the counter is '1' and the resync bits are clear, the on-disk
+ * bit can be cleared as well, thus setting the counter to 0.
+ * When we set a bit, or in the counter (to start a write), if the fields is
+ * 0, we first set the disk bit and set the counter to 1.
+ *
+ * If the counter is 0, the on-disk bit is clear and the stipe is clean
+ * Anything that dirties the stipe pushes the counter to 2 (at least)
+ * and sets the on-disk bit (lazily).
+ * If a periodic sweep find the counter at 2, it is decremented to 1.
+ * If the sweep find the counter at 1, the on-disk bit is cleared and the
+ * counter goes to zero.
+ *
+ * Also, we'll hijack the "map" pointer itself and use it as two 16 bit block
+ * counters as a fallback when "page" memory cannot be allocated:
+ *
+ * Normal case (page memory allocated):
+ *
+ * page pointer (32-bit)
+ *
+ * [ ] ------+
+ * |
+ * +-------> [ ][ ]..[ ] (4096 byte page == 2048 counters)
+ * c1 c2 c2048
+ *
+ * Hijacked case (page memory allocation failed):
+ *
+ * hijacked page pointer (32-bit)
+ *
+ * [ ][ ] (no page memory allocated)
+ * counter #1 (16-bit) counter #2 (16-bit)
+ *
+ */
+
+#ifdef __KERNEL__
+
+#define PAGE_BITS (PAGE_SIZE << 3)
+#define PAGE_BIT_SHIFT (PAGE_SHIFT + 3)
+
+typedef __u16 bitmap_counter_t;
+#define COUNTER_BITS 16
+#define COUNTER_BIT_SHIFT 4
+#define COUNTER_BYTE_SHIFT (COUNTER_BIT_SHIFT - 3)
+
+#define NEEDED_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 1)))
+#define RESYNC_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 2)))
+#define COUNTER_MAX ((bitmap_counter_t) RESYNC_MASK - 1)
+#define NEEDED(x) (((bitmap_counter_t) x) & NEEDED_MASK)
+#define RESYNC(x) (((bitmap_counter_t) x) & RESYNC_MASK)
+#define COUNTER(x) (((bitmap_counter_t) x) & COUNTER_MAX)
+
+/* how many counters per page? */
+#define PAGE_COUNTER_RATIO (PAGE_BITS / COUNTER_BITS)
+/* same, except a shift value for more efficient bitops */
+#define PAGE_COUNTER_SHIFT (PAGE_BIT_SHIFT - COUNTER_BIT_SHIFT)
+/* same, except a mask value for more efficient bitops */
+#define PAGE_COUNTER_MASK (PAGE_COUNTER_RATIO - 1)
+
+#define BITMAP_BLOCK_SHIFT 9
+
+#endif
+
+/*
+ * bitmap structures:
+ */
+
+#define BITMAP_MAGIC 0x6d746962
+
+/* use these for bitmap->flags and bitmap->sb->state bit-fields */
+enum bitmap_state {
+ BITMAP_STALE = 1, /* the bitmap file is out of date or had -EIO */
+ BITMAP_WRITE_ERROR = 2, /* A write error has occurred */
+ BITMAP_HOSTENDIAN =15,
+};
+
+/* the superblock at the front of the bitmap file -- little endian */
+typedef struct bitmap_super_s {
+ __le32 magic; /* 0 BITMAP_MAGIC */
+ __le32 version; /* 4 the bitmap major for now, could change... */
+ __u8 uuid[16]; /* 8 128 bit uuid - must match md device uuid */
+ __le64 events; /* 24 event counter for the bitmap (1)*/
+ __le64 events_cleared;/*32 event counter when last bit cleared (2) */
+ __le64 sync_size; /* 40 the size of the md device's sync range(3) */
+ __le32 state; /* 48 bitmap state information */
+ __le32 chunksize; /* 52 the bitmap chunk size in bytes */
+ __le32 daemon_sleep; /* 56 seconds between disk flushes */
+ __le32 write_behind; /* 60 number of outstanding write-behind writes */
+ __le32 sectors_reserved; /* 64 number of 512-byte sectors that are
+ * reserved for the bitmap. */
+ __le32 nodes; /* 68 the maximum number of nodes in cluster. */
+ __u8 cluster_name[64]; /* 72 cluster name to which this md belongs */
+ __u8 pad[256 - 136]; /* set to zero */
+} bitmap_super_t;
+
+/* notes:
+ * (1) This event counter is updated before the eventcounter in the md superblock
+ * When a bitmap is loaded, it is only accepted if this event counter is equal
+ * to, or one greater than, the event counter in the superblock.
+ * (2) This event counter is updated when the other one is *if*and*only*if* the
+ * array is not degraded. As bits are not cleared when the array is degraded,
+ * this represents the last time that any bits were cleared.
+ * If a device is being added that has an event count with this value or
+ * higher, it is accepted as conforming to the bitmap.
+ * (3)This is the number of sectors represented by the bitmap, and is the range that
+ * resync happens across. For raid1 and raid5/6 it is the size of individual
+ * devices. For raid10 it is the size of the array.
+ */
+
+#ifdef __KERNEL__
+
+/* the in-memory bitmap is represented by bitmap_pages */
+struct bitmap_page {
+ /*
+ * map points to the actual memory page
+ */
+ char *map;
+ /*
+ * in emergencies (when map cannot be alloced), hijack the map
+ * pointer and use it as two counters itself
+ */
+ unsigned int hijacked:1;
+ /*
+ * If any counter in this page is '1' or '2' - and so could be
+ * cleared then that page is marked as 'pending'
+ */
+ unsigned int pending:1;
+ /*
+ * count of dirty bits on the page
+ */
+ unsigned int count:30;
+};
+
+/* the main bitmap structure - one per mddev */
+struct bitmap {
+
+ struct bitmap_counts {
+ spinlock_t lock;
+ struct bitmap_page *bp;
+ unsigned long pages; /* total number of pages
+ * in the bitmap */
+ unsigned long missing_pages; /* number of pages
+ * not yet allocated */
+ unsigned long chunkshift; /* chunksize = 2^chunkshift
+ * (for bitops) */
+ unsigned long chunks; /* Total number of data
+ * chunks for the array */
+ } counts;
+
+ struct mddev *mddev; /* the md device that the bitmap is for */
+
+ __u64 events_cleared;
+ int need_sync;
+
+ struct bitmap_storage {
+ struct file *file; /* backing disk file */
+ struct page *sb_page; /* cached copy of the bitmap
+ * file superblock */
+ struct page **filemap; /* list of cache pages for
+ * the file */
+ unsigned long *filemap_attr; /* attributes associated
+ * w/ filemap pages */
+ unsigned long file_pages; /* number of pages in the file*/
+ unsigned long bytes; /* total bytes in the bitmap */
+ } storage;
+
+ unsigned long flags;
+
+ int allclean;
+
+ atomic_t behind_writes;
+ unsigned long behind_writes_used; /* highest actual value at runtime */
+
+ /*
+ * the bitmap daemon - periodically wakes up and sweeps the bitmap
+ * file, cleaning up bits and flushing out pages to disk as necessary
+ */
+ unsigned long daemon_lastrun; /* jiffies of last run */
+ unsigned long last_end_sync; /* when we lasted called end_sync to
+ * update bitmap with resync progress */
+
+ atomic_t pending_writes; /* pending writes to the bitmap file */
+ wait_queue_head_t write_wait;
+ wait_queue_head_t overflow_wait;
+ wait_queue_head_t behind_wait;
+
+ struct kernfs_node *sysfs_can_clear;
+ int cluster_slot; /* Slot offset for clustered env */
+};
+
+/* the bitmap API */
+
+/* these are used only by md/bitmap */
+struct bitmap *bitmap_create(struct mddev *mddev, int slot);
+int bitmap_load(struct mddev *mddev);
+void bitmap_flush(struct mddev *mddev);
+void bitmap_destroy(struct mddev *mddev);
+
+void bitmap_print_sb(struct bitmap *bitmap);
+void bitmap_update_sb(struct bitmap *bitmap);
+void bitmap_status(struct seq_file *seq, struct bitmap *bitmap);
+
+int bitmap_setallbits(struct bitmap *bitmap);
+void bitmap_write_all(struct bitmap *bitmap);
+
+void bitmap_dirty_bits(struct bitmap *bitmap, unsigned long s, unsigned long e);
+
+/* these are exported */
+int bitmap_startwrite(struct bitmap *bitmap, sector_t offset,
+ unsigned long sectors, int behind);
+void bitmap_endwrite(struct bitmap *bitmap, sector_t offset,
+ unsigned long sectors, int success, int behind);
+int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int degraded);
+void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int aborted);
+void bitmap_close_sync(struct bitmap *bitmap);
+void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector);
+
+void bitmap_unplug(struct bitmap *bitmap);
+void bitmap_daemon_work(struct mddev *mddev);
+
+int bitmap_resize(struct bitmap *bitmap, sector_t blocks,
+ int chunksize, int init);
+int bitmap_copy_from_slot(struct mddev *mddev, int slot,
+ sector_t *lo, sector_t *hi, bool clear_bits);
+#endif
+
+#endif
diff --git a/drivers/md/dm-bio-prison.c b/drivers/md/dm-bio-prison.c
new file mode 100644
index 000000000..be065300e
--- /dev/null
+++ b/drivers/md/dm-bio-prison.c
@@ -0,0 +1,396 @@
+/*
+ * Copyright (C) 2012 Red Hat, Inc.
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm.h"
+#include "dm-bio-prison.h"
+
+#include <linux/spinlock.h>
+#include <linux/mempool.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+
+/*----------------------------------------------------------------*/
+
+#define MIN_CELLS 1024
+
+struct dm_bio_prison {
+ spinlock_t lock;
+ mempool_t *cell_pool;
+ struct rb_root cells;
+};
+
+static struct kmem_cache *_cell_cache;
+
+/*----------------------------------------------------------------*/
+
+/*
+ * @nr_cells should be the number of cells you want in use _concurrently_.
+ * Don't confuse it with the number of distinct keys.
+ */
+struct dm_bio_prison *dm_bio_prison_create(void)
+{
+ struct dm_bio_prison *prison = kmalloc(sizeof(*prison), GFP_KERNEL);
+
+ if (!prison)
+ return NULL;
+
+ spin_lock_init(&prison->lock);
+
+ prison->cell_pool = mempool_create_slab_pool(MIN_CELLS, _cell_cache);
+ if (!prison->cell_pool) {
+ kfree(prison);
+ return NULL;
+ }
+
+ prison->cells = RB_ROOT;
+
+ return prison;
+}
+EXPORT_SYMBOL_GPL(dm_bio_prison_create);
+
+void dm_bio_prison_destroy(struct dm_bio_prison *prison)
+{
+ mempool_destroy(prison->cell_pool);
+ kfree(prison);
+}
+EXPORT_SYMBOL_GPL(dm_bio_prison_destroy);
+
+struct dm_bio_prison_cell *dm_bio_prison_alloc_cell(struct dm_bio_prison *prison, gfp_t gfp)
+{
+ return mempool_alloc(prison->cell_pool, gfp);
+}
+EXPORT_SYMBOL_GPL(dm_bio_prison_alloc_cell);
+
+void dm_bio_prison_free_cell(struct dm_bio_prison *prison,
+ struct dm_bio_prison_cell *cell)
+{
+ mempool_free(cell, prison->cell_pool);
+}
+EXPORT_SYMBOL_GPL(dm_bio_prison_free_cell);
+
+static void __setup_new_cell(struct dm_cell_key *key,
+ struct bio *holder,
+ struct dm_bio_prison_cell *cell)
+{
+ memcpy(&cell->key, key, sizeof(cell->key));
+ cell->holder = holder;
+ bio_list_init(&cell->bios);
+}
+
+static int cmp_keys(struct dm_cell_key *lhs,
+ struct dm_cell_key *rhs)
+{
+ if (lhs->virtual < rhs->virtual)
+ return -1;
+
+ if (lhs->virtual > rhs->virtual)
+ return 1;
+
+ if (lhs->dev < rhs->dev)
+ return -1;
+
+ if (lhs->dev > rhs->dev)
+ return 1;
+
+ if (lhs->block_end <= rhs->block_begin)
+ return -1;
+
+ if (lhs->block_begin >= rhs->block_end)
+ return 1;
+
+ return 0;
+}
+
+static int __bio_detain(struct dm_bio_prison *prison,
+ struct dm_cell_key *key,
+ struct bio *inmate,
+ struct dm_bio_prison_cell *cell_prealloc,
+ struct dm_bio_prison_cell **cell_result)
+{
+ int r;
+ struct rb_node **new = &prison->cells.rb_node, *parent = NULL;
+
+ while (*new) {
+ struct dm_bio_prison_cell *cell =
+ container_of(*new, struct dm_bio_prison_cell, node);
+
+ r = cmp_keys(key, &cell->key);
+
+ parent = *new;
+ if (r < 0)
+ new = &((*new)->rb_left);
+ else if (r > 0)
+ new = &((*new)->rb_right);
+ else {
+ if (inmate)
+ bio_list_add(&cell->bios, inmate);
+ *cell_result = cell;
+ return 1;
+ }
+ }
+
+ __setup_new_cell(key, inmate, cell_prealloc);
+ *cell_result = cell_prealloc;
+
+ rb_link_node(&cell_prealloc->node, parent, new);
+ rb_insert_color(&cell_prealloc->node, &prison->cells);
+
+ return 0;
+}
+
+static int bio_detain(struct dm_bio_prison *prison,
+ struct dm_cell_key *key,
+ struct bio *inmate,
+ struct dm_bio_prison_cell *cell_prealloc,
+ struct dm_bio_prison_cell **cell_result)
+{
+ int r;
+ unsigned long flags;
+
+ spin_lock_irqsave(&prison->lock, flags);
+ r = __bio_detain(prison, key, inmate, cell_prealloc, cell_result);
+ spin_unlock_irqrestore(&prison->lock, flags);
+
+ return r;
+}
+
+int dm_bio_detain(struct dm_bio_prison *prison,
+ struct dm_cell_key *key,
+ struct bio *inmate,
+ struct dm_bio_prison_cell *cell_prealloc,
+ struct dm_bio_prison_cell **cell_result)
+{
+ return bio_detain(prison, key, inmate, cell_prealloc, cell_result);
+}
+EXPORT_SYMBOL_GPL(dm_bio_detain);
+
+int dm_get_cell(struct dm_bio_prison *prison,
+ struct dm_cell_key *key,
+ struct dm_bio_prison_cell *cell_prealloc,
+ struct dm_bio_prison_cell **cell_result)
+{
+ return bio_detain(prison, key, NULL, cell_prealloc, cell_result);
+}
+EXPORT_SYMBOL_GPL(dm_get_cell);
+
+/*
+ * @inmates must have been initialised prior to this call
+ */
+static void __cell_release(struct dm_bio_prison *prison,
+ struct dm_bio_prison_cell *cell,
+ struct bio_list *inmates)
+{
+ rb_erase(&cell->node, &prison->cells);
+
+ if (inmates) {
+ if (cell->holder)
+ bio_list_add(inmates, cell->holder);
+ bio_list_merge(inmates, &cell->bios);
+ }
+}
+
+void dm_cell_release(struct dm_bio_prison *prison,
+ struct dm_bio_prison_cell *cell,
+ struct bio_list *bios)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&prison->lock, flags);
+ __cell_release(prison, cell, bios);
+ spin_unlock_irqrestore(&prison->lock, flags);
+}
+EXPORT_SYMBOL_GPL(dm_cell_release);
+
+/*
+ * Sometimes we don't want the holder, just the additional bios.
+ */
+static void __cell_release_no_holder(struct dm_bio_prison *prison,
+ struct dm_bio_prison_cell *cell,
+ struct bio_list *inmates)
+{
+ rb_erase(&cell->node, &prison->cells);
+ bio_list_merge(inmates, &cell->bios);
+}
+
+void dm_cell_release_no_holder(struct dm_bio_prison *prison,
+ struct dm_bio_prison_cell *cell,
+ struct bio_list *inmates)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&prison->lock, flags);
+ __cell_release_no_holder(prison, cell, inmates);
+ spin_unlock_irqrestore(&prison->lock, flags);
+}
+EXPORT_SYMBOL_GPL(dm_cell_release_no_holder);
+
+void dm_cell_error(struct dm_bio_prison *prison,
+ struct dm_bio_prison_cell *cell, int error)
+{
+ struct bio_list bios;
+ struct bio *bio;
+
+ bio_list_init(&bios);
+ dm_cell_release(prison, cell, &bios);
+
+ while ((bio = bio_list_pop(&bios)))
+ bio_endio(bio, error);
+}
+EXPORT_SYMBOL_GPL(dm_cell_error);
+
+void dm_cell_visit_release(struct dm_bio_prison *prison,
+ void (*visit_fn)(void *, struct dm_bio_prison_cell *),
+ void *context,
+ struct dm_bio_prison_cell *cell)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&prison->lock, flags);
+ visit_fn(context, cell);
+ rb_erase(&cell->node, &prison->cells);
+ spin_unlock_irqrestore(&prison->lock, flags);
+}
+EXPORT_SYMBOL_GPL(dm_cell_visit_release);
+
+/*----------------------------------------------------------------*/
+
+#define DEFERRED_SET_SIZE 64
+
+struct dm_deferred_entry {
+ struct dm_deferred_set *ds;
+ unsigned count;
+ struct list_head work_items;
+};
+
+struct dm_deferred_set {
+ spinlock_t lock;
+ unsigned current_entry;
+ unsigned sweeper;
+ struct dm_deferred_entry entries[DEFERRED_SET_SIZE];
+};
+
+struct dm_deferred_set *dm_deferred_set_create(void)
+{
+ int i;
+ struct dm_deferred_set *ds;
+
+ ds = kmalloc(sizeof(*ds), GFP_KERNEL);
+ if (!ds)
+ return NULL;
+
+ spin_lock_init(&ds->lock);
+ ds->current_entry = 0;
+ ds->sweeper = 0;
+ for (i = 0; i < DEFERRED_SET_SIZE; i++) {
+ ds->entries[i].ds = ds;
+ ds->entries[i].count = 0;
+ INIT_LIST_HEAD(&ds->entries[i].work_items);
+ }
+
+ return ds;
+}
+EXPORT_SYMBOL_GPL(dm_deferred_set_create);
+
+void dm_deferred_set_destroy(struct dm_deferred_set *ds)
+{
+ kfree(ds);
+}
+EXPORT_SYMBOL_GPL(dm_deferred_set_destroy);
+
+struct dm_deferred_entry *dm_deferred_entry_inc(struct dm_deferred_set *ds)
+{
+ unsigned long flags;
+ struct dm_deferred_entry *entry;
+
+ spin_lock_irqsave(&ds->lock, flags);
+ entry = ds->entries + ds->current_entry;
+ entry->count++;
+ spin_unlock_irqrestore(&ds->lock, flags);
+
+ return entry;
+}
+EXPORT_SYMBOL_GPL(dm_deferred_entry_inc);
+
+static unsigned ds_next(unsigned index)
+{
+ return (index + 1) % DEFERRED_SET_SIZE;
+}
+
+static void __sweep(struct dm_deferred_set *ds, struct list_head *head)
+{
+ while ((ds->sweeper != ds->current_entry) &&
+ !ds->entries[ds->sweeper].count) {
+ list_splice_init(&ds->entries[ds->sweeper].work_items, head);
+ ds->sweeper = ds_next(ds->sweeper);
+ }
+
+ if ((ds->sweeper == ds->current_entry) && !ds->entries[ds->sweeper].count)
+ list_splice_init(&ds->entries[ds->sweeper].work_items, head);
+}
+
+void dm_deferred_entry_dec(struct dm_deferred_entry *entry, struct list_head *head)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&entry->ds->lock, flags);
+ BUG_ON(!entry->count);
+ --entry->count;
+ __sweep(entry->ds, head);
+ spin_unlock_irqrestore(&entry->ds->lock, flags);
+}
+EXPORT_SYMBOL_GPL(dm_deferred_entry_dec);
+
+/*
+ * Returns 1 if deferred or 0 if no pending items to delay job.
+ */
+int dm_deferred_set_add_work(struct dm_deferred_set *ds, struct list_head *work)
+{
+ int r = 1;
+ unsigned long flags;
+ unsigned next_entry;
+
+ spin_lock_irqsave(&ds->lock, flags);
+ if ((ds->sweeper == ds->current_entry) &&
+ !ds->entries[ds->current_entry].count)
+ r = 0;
+ else {
+ list_add(work, &ds->entries[ds->current_entry].work_items);
+ next_entry = ds_next(ds->current_entry);
+ if (!ds->entries[next_entry].count)
+ ds->current_entry = next_entry;
+ }
+ spin_unlock_irqrestore(&ds->lock, flags);
+
+ return r;
+}
+EXPORT_SYMBOL_GPL(dm_deferred_set_add_work);
+
+/*----------------------------------------------------------------*/
+
+static int __init dm_bio_prison_init(void)
+{
+ _cell_cache = KMEM_CACHE(dm_bio_prison_cell, 0);
+ if (!_cell_cache)
+ return -ENOMEM;
+
+ return 0;
+}
+
+static void __exit dm_bio_prison_exit(void)
+{
+ kmem_cache_destroy(_cell_cache);
+ _cell_cache = NULL;
+}
+
+/*
+ * module hooks
+ */
+module_init(dm_bio_prison_init);
+module_exit(dm_bio_prison_exit);
+
+MODULE_DESCRIPTION(DM_NAME " bio prison");
+MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
+MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm-bio-prison.h b/drivers/md/dm-bio-prison.h
new file mode 100644
index 000000000..74cf01144
--- /dev/null
+++ b/drivers/md/dm-bio-prison.h
@@ -0,0 +1,125 @@
+/*
+ * Copyright (C) 2011-2012 Red Hat, Inc.
+ *
+ * This file is released under the GPL.
+ */
+
+#ifndef DM_BIO_PRISON_H
+#define DM_BIO_PRISON_H
+
+#include "persistent-data/dm-block-manager.h" /* FIXME: for dm_block_t */
+#include "dm-thin-metadata.h" /* FIXME: for dm_thin_id */
+
+#include <linux/bio.h>
+#include <linux/rbtree.h>
+
+/*----------------------------------------------------------------*/
+
+/*
+ * Sometimes we can't deal with a bio straight away. We put them in prison
+ * where they can't cause any mischief. Bios are put in a cell identified
+ * by a key, multiple bios can be in the same cell. When the cell is
+ * subsequently unlocked the bios become available.
+ */
+struct dm_bio_prison;
+
+/*
+ * Keys define a range of blocks within either a virtual or physical
+ * device.
+ */
+struct dm_cell_key {
+ int virtual;
+ dm_thin_id dev;
+ dm_block_t block_begin, block_end;
+};
+
+/*
+ * Treat this as opaque, only in header so callers can manage allocation
+ * themselves.
+ */
+struct dm_bio_prison_cell {
+ struct list_head user_list; /* for client use */
+ struct rb_node node;
+
+ struct dm_cell_key key;
+ struct bio *holder;
+ struct bio_list bios;
+};
+
+struct dm_bio_prison *dm_bio_prison_create(void);
+void dm_bio_prison_destroy(struct dm_bio_prison *prison);
+
+/*
+ * These two functions just wrap a mempool. This is a transitory step:
+ * Eventually all bio prison clients should manage their own cell memory.
+ *
+ * Like mempool_alloc(), dm_bio_prison_alloc_cell() can only fail if called
+ * in interrupt context or passed GFP_NOWAIT.
+ */
+struct dm_bio_prison_cell *dm_bio_prison_alloc_cell(struct dm_bio_prison *prison,
+ gfp_t gfp);
+void dm_bio_prison_free_cell(struct dm_bio_prison *prison,
+ struct dm_bio_prison_cell *cell);
+
+/*
+ * Creates, or retrieves a cell that overlaps the given key.
+ *
+ * Returns 1 if pre-existing cell returned, zero if new cell created using
+ * @cell_prealloc.
+ */
+int dm_get_cell(struct dm_bio_prison *prison,
+ struct dm_cell_key *key,
+ struct dm_bio_prison_cell *cell_prealloc,
+ struct dm_bio_prison_cell **cell_result);
+
+/*
+ * An atomic op that combines retrieving or creating a cell, and adding a
+ * bio to it.
+ *
+ * Returns 1 if the cell was already held, 0 if @inmate is the new holder.
+ */
+int dm_bio_detain(struct dm_bio_prison *prison,
+ struct dm_cell_key *key,
+ struct bio *inmate,
+ struct dm_bio_prison_cell *cell_prealloc,
+ struct dm_bio_prison_cell **cell_result);
+
+void dm_cell_release(struct dm_bio_prison *prison,
+ struct dm_bio_prison_cell *cell,
+ struct bio_list *bios);
+void dm_cell_release_no_holder(struct dm_bio_prison *prison,
+ struct dm_bio_prison_cell *cell,
+ struct bio_list *inmates);
+void dm_cell_error(struct dm_bio_prison *prison,
+ struct dm_bio_prison_cell *cell, int error);
+
+/*
+ * Visits the cell and then releases. Guarantees no new inmates are
+ * inserted between the visit and release.
+ */
+void dm_cell_visit_release(struct dm_bio_prison *prison,
+ void (*visit_fn)(void *, struct dm_bio_prison_cell *),
+ void *context, struct dm_bio_prison_cell *cell);
+
+/*----------------------------------------------------------------*/
+
+/*
+ * We use the deferred set to keep track of pending reads to shared blocks.
+ * We do this to ensure the new mapping caused by a write isn't performed
+ * until these prior reads have completed. Otherwise the insertion of the
+ * new mapping could free the old block that the read bios are mapped to.
+ */
+
+struct dm_deferred_set;
+struct dm_deferred_entry;
+
+struct dm_deferred_set *dm_deferred_set_create(void);
+void dm_deferred_set_destroy(struct dm_deferred_set *ds);
+
+struct dm_deferred_entry *dm_deferred_entry_inc(struct dm_deferred_set *ds);
+void dm_deferred_entry_dec(struct dm_deferred_entry *entry, struct list_head *head);
+int dm_deferred_set_add_work(struct dm_deferred_set *ds, struct list_head *work);
+
+/*----------------------------------------------------------------*/
+
+#endif
diff --git a/drivers/md/dm-bio-record.h b/drivers/md/dm-bio-record.h
new file mode 100644
index 000000000..dd3646111
--- /dev/null
+++ b/drivers/md/dm-bio-record.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
+ *
+ * This file is released under the GPL.
+ */
+
+#ifndef DM_BIO_RECORD_H
+#define DM_BIO_RECORD_H
+
+#include <linux/bio.h>
+
+/*
+ * There are lots of mutable fields in the bio struct that get
+ * changed by the lower levels of the block layer. Some targets,
+ * such as multipath, may wish to resubmit a bio on error. The
+ * functions in this file help the target record and restore the
+ * original bio state.
+ */
+
+struct dm_bio_details {
+ struct block_device *bi_bdev;
+ unsigned long bi_flags;
+ struct bvec_iter bi_iter;
+};
+
+static inline void dm_bio_record(struct dm_bio_details *bd, struct bio *bio)
+{
+ bd->bi_bdev = bio->bi_bdev;
+ bd->bi_flags = bio->bi_flags;
+ bd->bi_iter = bio->bi_iter;
+}
+
+static inline void dm_bio_restore(struct dm_bio_details *bd, struct bio *bio)
+{
+ bio->bi_bdev = bd->bi_bdev;
+ bio->bi_flags = bd->bi_flags;
+ bio->bi_iter = bd->bi_iter;
+}
+
+#endif
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
new file mode 100644
index 000000000..86dbbc737
--- /dev/null
+++ b/drivers/md/dm-bufio.c
@@ -0,0 +1,1925 @@
+/*
+ * Copyright (C) 2009-2011 Red Hat, Inc.
+ *
+ * Author: Mikulas Patocka <mpatocka@redhat.com>
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm-bufio.h"
+
+#include <linux/device-mapper.h>
+#include <linux/dm-io.h>
+#include <linux/slab.h>
+#include <linux/jiffies.h>
+#include <linux/vmalloc.h>
+#include <linux/shrinker.h>
+#include <linux/module.h>
+#include <linux/rbtree.h>
+
+#define DM_MSG_PREFIX "bufio"
+
+/*
+ * Memory management policy:
+ * Limit the number of buffers to DM_BUFIO_MEMORY_PERCENT of main memory
+ * or DM_BUFIO_VMALLOC_PERCENT of vmalloc memory (whichever is lower).
+ * Always allocate at least DM_BUFIO_MIN_BUFFERS buffers.
+ * Start background writeback when there are DM_BUFIO_WRITEBACK_PERCENT
+ * dirty buffers.
+ */
+#define DM_BUFIO_MIN_BUFFERS 8
+
+#define DM_BUFIO_MEMORY_PERCENT 2
+#define DM_BUFIO_VMALLOC_PERCENT 25
+#define DM_BUFIO_WRITEBACK_PERCENT 75
+
+/*
+ * Check buffer ages in this interval (seconds)
+ */
+#define DM_BUFIO_WORK_TIMER_SECS 30
+
+/*
+ * Free buffers when they are older than this (seconds)
+ */
+#define DM_BUFIO_DEFAULT_AGE_SECS 300
+
+/*
+ * The nr of bytes of cached data to keep around.
+ */
+#define DM_BUFIO_DEFAULT_RETAIN_BYTES (256 * 1024)
+
+/*
+ * The number of bvec entries that are embedded directly in the buffer.
+ * If the chunk size is larger, dm-io is used to do the io.
+ */
+#define DM_BUFIO_INLINE_VECS 16
+
+/*
+ * Don't try to use kmem_cache_alloc for blocks larger than this.
+ * For explanation, see alloc_buffer_data below.
+ */
+#define DM_BUFIO_BLOCK_SIZE_SLAB_LIMIT (PAGE_SIZE >> 1)
+#define DM_BUFIO_BLOCK_SIZE_GFP_LIMIT (PAGE_SIZE << (MAX_ORDER - 1))
+
+/*
+ * dm_buffer->list_mode
+ */
+#define LIST_CLEAN 0
+#define LIST_DIRTY 1
+#define LIST_SIZE 2
+
+/*
+ * Linking of buffers:
+ * All buffers are linked to cache_hash with their hash_list field.
+ *
+ * Clean buffers that are not being written (B_WRITING not set)
+ * are linked to lru[LIST_CLEAN] with their lru_list field.
+ *
+ * Dirty and clean buffers that are being written are linked to
+ * lru[LIST_DIRTY] with their lru_list field. When the write
+ * finishes, the buffer cannot be relinked immediately (because we
+ * are in an interrupt context and relinking requires process
+ * context), so some clean-not-writing buffers can be held on
+ * dirty_lru too. They are later added to lru in the process
+ * context.
+ */
+struct dm_bufio_client {
+ struct mutex lock;
+
+ struct list_head lru[LIST_SIZE];
+ unsigned long n_buffers[LIST_SIZE];
+
+ struct block_device *bdev;
+ unsigned block_size;
+ unsigned char sectors_per_block_bits;
+ unsigned char pages_per_block_bits;
+ unsigned char blocks_per_page_bits;
+ unsigned aux_size;
+ void (*alloc_callback)(struct dm_buffer *);
+ void (*write_callback)(struct dm_buffer *);
+
+ struct dm_io_client *dm_io;
+
+ struct list_head reserved_buffers;
+ unsigned need_reserved_buffers;
+
+ unsigned minimum_buffers;
+
+ struct rb_root buffer_tree;
+ wait_queue_head_t free_buffer_wait;
+
+ int async_write_error;
+
+ struct list_head client_list;
+ struct shrinker shrinker;
+};
+
+/*
+ * Buffer state bits.
+ */
+#define B_READING 0
+#define B_WRITING 1
+#define B_DIRTY 2
+
+/*
+ * Describes how the block was allocated:
+ * kmem_cache_alloc(), __get_free_pages() or vmalloc().
+ * See the comment at alloc_buffer_data.
+ */
+enum data_mode {
+ DATA_MODE_SLAB = 0,
+ DATA_MODE_GET_FREE_PAGES = 1,
+ DATA_MODE_VMALLOC = 2,
+ DATA_MODE_LIMIT = 3
+};
+
+struct dm_buffer {
+ struct rb_node node;
+ struct list_head lru_list;
+ sector_t block;
+ void *data;
+ enum data_mode data_mode;
+ unsigned char list_mode; /* LIST_* */
+ unsigned hold_count;
+ int read_error;
+ int write_error;
+ unsigned long state;
+ unsigned long last_accessed;
+ struct dm_bufio_client *c;
+ struct list_head write_list;
+ struct bio bio;
+ struct bio_vec bio_vec[DM_BUFIO_INLINE_VECS];
+};
+
+/*----------------------------------------------------------------*/
+
+static struct kmem_cache *dm_bufio_caches[PAGE_SHIFT - SECTOR_SHIFT];
+static char *dm_bufio_cache_names[PAGE_SHIFT - SECTOR_SHIFT];
+
+static inline int dm_bufio_cache_index(struct dm_bufio_client *c)
+{
+ unsigned ret = c->blocks_per_page_bits - 1;
+
+ BUG_ON(ret >= ARRAY_SIZE(dm_bufio_caches));
+
+ return ret;
+}
+
+#define DM_BUFIO_CACHE(c) (dm_bufio_caches[dm_bufio_cache_index(c)])
+#define DM_BUFIO_CACHE_NAME(c) (dm_bufio_cache_names[dm_bufio_cache_index(c)])
+
+#define dm_bufio_in_request() (!!current->bio_list)
+
+static void dm_bufio_lock(struct dm_bufio_client *c)
+{
+ mutex_lock_nested(&c->lock, dm_bufio_in_request());
+}
+
+static int dm_bufio_trylock(struct dm_bufio_client *c)
+{
+ return mutex_trylock(&c->lock);
+}
+
+static void dm_bufio_unlock(struct dm_bufio_client *c)
+{
+ mutex_unlock(&c->lock);
+}
+
+/*
+ * FIXME Move to sched.h?
+ */
+#ifdef CONFIG_PREEMPT_VOLUNTARY
+# define dm_bufio_cond_resched() \
+do { \
+ if (unlikely(need_resched())) \
+ _cond_resched(); \
+} while (0)
+#else
+# define dm_bufio_cond_resched() do { } while (0)
+#endif
+
+/*----------------------------------------------------------------*/
+
+/*
+ * Default cache size: available memory divided by the ratio.
+ */
+static unsigned long dm_bufio_default_cache_size;
+
+/*
+ * Total cache size set by the user.
+ */
+static unsigned long dm_bufio_cache_size;
+
+/*
+ * A copy of dm_bufio_cache_size because dm_bufio_cache_size can change
+ * at any time. If it disagrees, the user has changed cache size.
+ */
+static unsigned long dm_bufio_cache_size_latch;
+
+static DEFINE_SPINLOCK(param_spinlock);
+
+/*
+ * Buffers are freed after this timeout
+ */
+static unsigned dm_bufio_max_age = DM_BUFIO_DEFAULT_AGE_SECS;
+static unsigned dm_bufio_retain_bytes = DM_BUFIO_DEFAULT_RETAIN_BYTES;
+
+static unsigned long dm_bufio_peak_allocated;
+static unsigned long dm_bufio_allocated_kmem_cache;
+static unsigned long dm_bufio_allocated_get_free_pages;
+static unsigned long dm_bufio_allocated_vmalloc;
+static unsigned long dm_bufio_current_allocated;
+
+/*----------------------------------------------------------------*/
+
+/*
+ * Per-client cache: dm_bufio_cache_size / dm_bufio_client_count
+ */
+static unsigned long dm_bufio_cache_size_per_client;
+
+/*
+ * The current number of clients.
+ */
+static int dm_bufio_client_count;
+
+/*
+ * The list of all clients.
+ */
+static LIST_HEAD(dm_bufio_all_clients);
+
+/*
+ * This mutex protects dm_bufio_cache_size_latch,
+ * dm_bufio_cache_size_per_client and dm_bufio_client_count
+ */
+static DEFINE_MUTEX(dm_bufio_clients_lock);
+
+/*----------------------------------------------------------------
+ * A red/black tree acts as an index for all the buffers.
+ *--------------------------------------------------------------*/
+static struct dm_buffer *__find(struct dm_bufio_client *c, sector_t block)
+{
+ struct rb_node *n = c->buffer_tree.rb_node;
+ struct dm_buffer *b;
+
+ while (n) {
+ b = container_of(n, struct dm_buffer, node);
+
+ if (b->block == block)
+ return b;
+
+ n = (b->block < block) ? n->rb_left : n->rb_right;
+ }
+
+ return NULL;
+}
+
+static void __insert(struct dm_bufio_client *c, struct dm_buffer *b)
+{
+ struct rb_node **new = &c->buffer_tree.rb_node, *parent = NULL;
+ struct dm_buffer *found;
+
+ while (*new) {
+ found = container_of(*new, struct dm_buffer, node);
+
+ if (found->block == b->block) {
+ BUG_ON(found != b);
+ return;
+ }
+
+ parent = *new;
+ new = (found->block < b->block) ?
+ &((*new)->rb_left) : &((*new)->rb_right);
+ }
+
+ rb_link_node(&b->node, parent, new);
+ rb_insert_color(&b->node, &c->buffer_tree);
+}
+
+static void __remove(struct dm_bufio_client *c, struct dm_buffer *b)
+{
+ rb_erase(&b->node, &c->buffer_tree);
+}
+
+/*----------------------------------------------------------------*/
+
+static void adjust_total_allocated(enum data_mode data_mode, long diff)
+{
+ static unsigned long * const class_ptr[DATA_MODE_LIMIT] = {
+ &dm_bufio_allocated_kmem_cache,
+ &dm_bufio_allocated_get_free_pages,
+ &dm_bufio_allocated_vmalloc,
+ };
+
+ spin_lock(&param_spinlock);
+
+ *class_ptr[data_mode] += diff;
+
+ dm_bufio_current_allocated += diff;
+
+ if (dm_bufio_current_allocated > dm_bufio_peak_allocated)
+ dm_bufio_peak_allocated = dm_bufio_current_allocated;
+
+ spin_unlock(&param_spinlock);
+}
+
+/*
+ * Change the number of clients and recalculate per-client limit.
+ */
+static void __cache_size_refresh(void)
+{
+ BUG_ON(!mutex_is_locked(&dm_bufio_clients_lock));
+ BUG_ON(dm_bufio_client_count < 0);
+
+ dm_bufio_cache_size_latch = ACCESS_ONCE(dm_bufio_cache_size);
+
+ /*
+ * Use default if set to 0 and report the actual cache size used.
+ */
+ if (!dm_bufio_cache_size_latch) {
+ (void)cmpxchg(&dm_bufio_cache_size, 0,
+ dm_bufio_default_cache_size);
+ dm_bufio_cache_size_latch = dm_bufio_default_cache_size;
+ }
+
+ dm_bufio_cache_size_per_client = dm_bufio_cache_size_latch /
+ (dm_bufio_client_count ? : 1);
+}
+
+/*
+ * Allocating buffer data.
+ *
+ * Small buffers are allocated with kmem_cache, to use space optimally.
+ *
+ * For large buffers, we choose between get_free_pages and vmalloc.
+ * Each has advantages and disadvantages.
+ *
+ * __get_free_pages can randomly fail if the memory is fragmented.
+ * __vmalloc won't randomly fail, but vmalloc space is limited (it may be
+ * as low as 128M) so using it for caching is not appropriate.
+ *
+ * If the allocation may fail we use __get_free_pages. Memory fragmentation
+ * won't have a fatal effect here, but it just causes flushes of some other
+ * buffers and more I/O will be performed. Don't use __get_free_pages if it
+ * always fails (i.e. order >= MAX_ORDER).
+ *
+ * If the allocation shouldn't fail we use __vmalloc. This is only for the
+ * initial reserve allocation, so there's no risk of wasting all vmalloc
+ * space.
+ */
+static void *alloc_buffer_data(struct dm_bufio_client *c, gfp_t gfp_mask,
+ enum data_mode *data_mode)
+{
+ unsigned noio_flag;
+ void *ptr;
+
+ if (c->block_size <= DM_BUFIO_BLOCK_SIZE_SLAB_LIMIT) {
+ *data_mode = DATA_MODE_SLAB;
+ return kmem_cache_alloc(DM_BUFIO_CACHE(c), gfp_mask);
+ }
+
+ if (c->block_size <= DM_BUFIO_BLOCK_SIZE_GFP_LIMIT &&
+ gfp_mask & __GFP_NORETRY) {
+ *data_mode = DATA_MODE_GET_FREE_PAGES;
+ return (void *)__get_free_pages(gfp_mask,
+ c->pages_per_block_bits);
+ }
+
+ *data_mode = DATA_MODE_VMALLOC;
+
+ /*
+ * __vmalloc allocates the data pages and auxiliary structures with
+ * gfp_flags that were specified, but pagetables are always allocated
+ * with GFP_KERNEL, no matter what was specified as gfp_mask.
+ *
+ * Consequently, we must set per-process flag PF_MEMALLOC_NOIO so that
+ * all allocations done by this process (including pagetables) are done
+ * as if GFP_NOIO was specified.
+ */
+
+ if (gfp_mask & __GFP_NORETRY)
+ noio_flag = memalloc_noio_save();
+
+ ptr = __vmalloc(c->block_size, gfp_mask | __GFP_HIGHMEM, PAGE_KERNEL);
+
+ if (gfp_mask & __GFP_NORETRY)
+ memalloc_noio_restore(noio_flag);
+
+ return ptr;
+}
+
+/*
+ * Free buffer's data.
+ */
+static void free_buffer_data(struct dm_bufio_client *c,
+ void *data, enum data_mode data_mode)
+{
+ switch (data_mode) {
+ case DATA_MODE_SLAB:
+ kmem_cache_free(DM_BUFIO_CACHE(c), data);
+ break;
+
+ case DATA_MODE_GET_FREE_PAGES:
+ free_pages((unsigned long)data, c->pages_per_block_bits);
+ break;
+
+ case DATA_MODE_VMALLOC:
+ vfree(data);
+ break;
+
+ default:
+ DMCRIT("dm_bufio_free_buffer_data: bad data mode: %d",
+ data_mode);
+ BUG();
+ }
+}
+
+/*
+ * Allocate buffer and its data.
+ */
+static struct dm_buffer *alloc_buffer(struct dm_bufio_client *c, gfp_t gfp_mask)
+{
+ struct dm_buffer *b = kmalloc(sizeof(struct dm_buffer) + c->aux_size,
+ gfp_mask);
+
+ if (!b)
+ return NULL;
+
+ b->c = c;
+
+ b->data = alloc_buffer_data(c, gfp_mask, &b->data_mode);
+ if (!b->data) {
+ kfree(b);
+ return NULL;
+ }
+
+ adjust_total_allocated(b->data_mode, (long)c->block_size);
+
+ return b;
+}
+
+/*
+ * Free buffer and its data.
+ */
+static void free_buffer(struct dm_buffer *b)
+{
+ struct dm_bufio_client *c = b->c;
+
+ adjust_total_allocated(b->data_mode, -(long)c->block_size);
+
+ free_buffer_data(c, b->data, b->data_mode);
+ kfree(b);
+}
+
+/*
+ * Link buffer to the hash list and clean or dirty queue.
+ */
+static void __link_buffer(struct dm_buffer *b, sector_t block, int dirty)
+{
+ struct dm_bufio_client *c = b->c;
+
+ c->n_buffers[dirty]++;
+ b->block = block;
+ b->list_mode = dirty;
+ list_add(&b->lru_list, &c->lru[dirty]);
+ __insert(b->c, b);
+ b->last_accessed = jiffies;
+}
+
+/*
+ * Unlink buffer from the hash list and dirty or clean queue.
+ */
+static void __unlink_buffer(struct dm_buffer *b)
+{
+ struct dm_bufio_client *c = b->c;
+
+ BUG_ON(!c->n_buffers[b->list_mode]);
+
+ c->n_buffers[b->list_mode]--;
+ __remove(b->c, b);
+ list_del(&b->lru_list);
+}
+
+/*
+ * Place the buffer to the head of dirty or clean LRU queue.
+ */
+static void __relink_lru(struct dm_buffer *b, int dirty)
+{
+ struct dm_bufio_client *c = b->c;
+
+ BUG_ON(!c->n_buffers[b->list_mode]);
+
+ c->n_buffers[b->list_mode]--;
+ c->n_buffers[dirty]++;
+ b->list_mode = dirty;
+ list_move(&b->lru_list, &c->lru[dirty]);
+ b->last_accessed = jiffies;
+}
+
+/*----------------------------------------------------------------
+ * Submit I/O on the buffer.
+ *
+ * Bio interface is faster but it has some problems:
+ * the vector list is limited (increasing this limit increases
+ * memory-consumption per buffer, so it is not viable);
+ *
+ * the memory must be direct-mapped, not vmalloced;
+ *
+ * the I/O driver can reject requests spuriously if it thinks that
+ * the requests are too big for the device or if they cross a
+ * controller-defined memory boundary.
+ *
+ * If the buffer is small enough (up to DM_BUFIO_INLINE_VECS pages) and
+ * it is not vmalloced, try using the bio interface.
+ *
+ * If the buffer is big, if it is vmalloced or if the underlying device
+ * rejects the bio because it is too large, use dm-io layer to do the I/O.
+ * The dm-io layer splits the I/O into multiple requests, avoiding the above
+ * shortcomings.
+ *--------------------------------------------------------------*/
+
+/*
+ * dm-io completion routine. It just calls b->bio.bi_end_io, pretending
+ * that the request was handled directly with bio interface.
+ */
+static void dmio_complete(unsigned long error, void *context)
+{
+ struct dm_buffer *b = context;
+
+ b->bio.bi_end_io(&b->bio, error ? -EIO : 0);
+}
+
+static void use_dmio(struct dm_buffer *b, int rw, sector_t block,
+ bio_end_io_t *end_io)
+{
+ int r;
+ struct dm_io_request io_req = {
+ .bi_rw = rw,
+ .notify.fn = dmio_complete,
+ .notify.context = b,
+ .client = b->c->dm_io,
+ };
+ struct dm_io_region region = {
+ .bdev = b->c->bdev,
+ .sector = block << b->c->sectors_per_block_bits,
+ .count = b->c->block_size >> SECTOR_SHIFT,
+ };
+
+ if (b->data_mode != DATA_MODE_VMALLOC) {
+ io_req.mem.type = DM_IO_KMEM;
+ io_req.mem.ptr.addr = b->data;
+ } else {
+ io_req.mem.type = DM_IO_VMA;
+ io_req.mem.ptr.vma = b->data;
+ }
+
+ b->bio.bi_end_io = end_io;
+
+ r = dm_io(&io_req, 1, &region, NULL);
+ if (r)
+ end_io(&b->bio, r);
+}
+
+static void inline_endio(struct bio *bio, int error)
+{
+ bio_end_io_t *end_fn = bio->bi_private;
+
+ /*
+ * Reset the bio to free any attached resources
+ * (e.g. bio integrity profiles).
+ */
+ bio_reset(bio);
+
+ end_fn(bio, error);
+}
+
+static void use_inline_bio(struct dm_buffer *b, int rw, sector_t block,
+ bio_end_io_t *end_io)
+{
+ char *ptr;
+ int len;
+
+ bio_init(&b->bio);
+ b->bio.bi_io_vec = b->bio_vec;
+ b->bio.bi_max_vecs = DM_BUFIO_INLINE_VECS;
+ b->bio.bi_iter.bi_sector = block << b->c->sectors_per_block_bits;
+ b->bio.bi_bdev = b->c->bdev;
+ b->bio.bi_end_io = inline_endio;
+ /*
+ * Use of .bi_private isn't a problem here because
+ * the dm_buffer's inline bio is local to bufio.
+ */
+ b->bio.bi_private = end_io;
+
+ /*
+ * We assume that if len >= PAGE_SIZE ptr is page-aligned.
+ * If len < PAGE_SIZE the buffer doesn't cross page boundary.
+ */
+ ptr = b->data;
+ len = b->c->block_size;
+
+ if (len >= PAGE_SIZE)
+ BUG_ON((unsigned long)ptr & (PAGE_SIZE - 1));
+ else
+ BUG_ON((unsigned long)ptr & (len - 1));
+
+ do {
+ if (!bio_add_page(&b->bio, virt_to_page(ptr),
+ len < PAGE_SIZE ? len : PAGE_SIZE,
+ virt_to_phys(ptr) & (PAGE_SIZE - 1))) {
+ BUG_ON(b->c->block_size <= PAGE_SIZE);
+ use_dmio(b, rw, block, end_io);
+ return;
+ }
+
+ len -= PAGE_SIZE;
+ ptr += PAGE_SIZE;
+ } while (len > 0);
+
+ submit_bio(rw, &b->bio);
+}
+
+static void submit_io(struct dm_buffer *b, int rw, sector_t block,
+ bio_end_io_t *end_io)
+{
+ if (rw == WRITE && b->c->write_callback)
+ b->c->write_callback(b);
+
+ if (b->c->block_size <= DM_BUFIO_INLINE_VECS * PAGE_SIZE &&
+ b->data_mode != DATA_MODE_VMALLOC)
+ use_inline_bio(b, rw, block, end_io);
+ else
+ use_dmio(b, rw, block, end_io);
+}
+
+/*----------------------------------------------------------------
+ * Writing dirty buffers
+ *--------------------------------------------------------------*/
+
+/*
+ * The endio routine for write.
+ *
+ * Set the error, clear B_WRITING bit and wake anyone who was waiting on
+ * it.
+ */
+static void write_endio(struct bio *bio, int error)
+{
+ struct dm_buffer *b = container_of(bio, struct dm_buffer, bio);
+
+ b->write_error = error;
+ if (unlikely(error)) {
+ struct dm_bufio_client *c = b->c;
+ (void)cmpxchg(&c->async_write_error, 0, error);
+ }
+
+ BUG_ON(!test_bit(B_WRITING, &b->state));
+
+ smp_mb__before_atomic();
+ clear_bit(B_WRITING, &b->state);
+ smp_mb__after_atomic();
+
+ wake_up_bit(&b->state, B_WRITING);
+}
+
+/*
+ * Initiate a write on a dirty buffer, but don't wait for it.
+ *
+ * - If the buffer is not dirty, exit.
+ * - If there some previous write going on, wait for it to finish (we can't
+ * have two writes on the same buffer simultaneously).
+ * - Submit our write and don't wait on it. We set B_WRITING indicating
+ * that there is a write in progress.
+ */
+static void __write_dirty_buffer(struct dm_buffer *b,
+ struct list_head *write_list)
+{
+ if (!test_bit(B_DIRTY, &b->state))
+ return;
+
+ clear_bit(B_DIRTY, &b->state);
+ wait_on_bit_lock_io(&b->state, B_WRITING, TASK_UNINTERRUPTIBLE);
+
+ if (!write_list)
+ submit_io(b, WRITE, b->block, write_endio);
+ else
+ list_add_tail(&b->write_list, write_list);
+}
+
+static void __flush_write_list(struct list_head *write_list)
+{
+ struct blk_plug plug;
+ blk_start_plug(&plug);
+ while (!list_empty(write_list)) {
+ struct dm_buffer *b =
+ list_entry(write_list->next, struct dm_buffer, write_list);
+ list_del(&b->write_list);
+ submit_io(b, WRITE, b->block, write_endio);
+ dm_bufio_cond_resched();
+ }
+ blk_finish_plug(&plug);
+}
+
+/*
+ * Wait until any activity on the buffer finishes. Possibly write the
+ * buffer if it is dirty. When this function finishes, there is no I/O
+ * running on the buffer and the buffer is not dirty.
+ */
+static void __make_buffer_clean(struct dm_buffer *b)
+{
+ BUG_ON(b->hold_count);
+
+ if (!b->state) /* fast case */
+ return;
+
+ wait_on_bit_io(&b->state, B_READING, TASK_UNINTERRUPTIBLE);
+ __write_dirty_buffer(b, NULL);
+ wait_on_bit_io(&b->state, B_WRITING, TASK_UNINTERRUPTIBLE);
+}
+
+/*
+ * Find some buffer that is not held by anybody, clean it, unlink it and
+ * return it.
+ */
+static struct dm_buffer *__get_unclaimed_buffer(struct dm_bufio_client *c)
+{
+ struct dm_buffer *b;
+
+ list_for_each_entry_reverse(b, &c->lru[LIST_CLEAN], lru_list) {
+ BUG_ON(test_bit(B_WRITING, &b->state));
+ BUG_ON(test_bit(B_DIRTY, &b->state));
+
+ if (!b->hold_count) {
+ __make_buffer_clean(b);
+ __unlink_buffer(b);
+ return b;
+ }
+ dm_bufio_cond_resched();
+ }
+
+ list_for_each_entry_reverse(b, &c->lru[LIST_DIRTY], lru_list) {
+ BUG_ON(test_bit(B_READING, &b->state));
+
+ if (!b->hold_count) {
+ __make_buffer_clean(b);
+ __unlink_buffer(b);
+ return b;
+ }
+ dm_bufio_cond_resched();
+ }
+
+ return NULL;
+}
+
+/*
+ * Wait until some other threads free some buffer or release hold count on
+ * some buffer.
+ *
+ * This function is entered with c->lock held, drops it and regains it
+ * before exiting.
+ */
+static void __wait_for_free_buffer(struct dm_bufio_client *c)
+{
+ DECLARE_WAITQUEUE(wait, current);
+
+ add_wait_queue(&c->free_buffer_wait, &wait);
+ set_task_state(current, TASK_UNINTERRUPTIBLE);
+ dm_bufio_unlock(c);
+
+ io_schedule();
+
+ remove_wait_queue(&c->free_buffer_wait, &wait);
+
+ dm_bufio_lock(c);
+}
+
+enum new_flag {
+ NF_FRESH = 0,
+ NF_READ = 1,
+ NF_GET = 2,
+ NF_PREFETCH = 3
+};
+
+/*
+ * Allocate a new buffer. If the allocation is not possible, wait until
+ * some other thread frees a buffer.
+ *
+ * May drop the lock and regain it.
+ */
+static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client *c, enum new_flag nf)
+{
+ struct dm_buffer *b;
+
+ /*
+ * dm-bufio is resistant to allocation failures (it just keeps
+ * one buffer reserved in cases all the allocations fail).
+ * So set flags to not try too hard:
+ * GFP_NOIO: don't recurse into the I/O layer
+ * __GFP_NORETRY: don't retry and rather return failure
+ * __GFP_NOMEMALLOC: don't use emergency reserves
+ * __GFP_NOWARN: don't print a warning in case of failure
+ *
+ * For debugging, if we set the cache size to 1, no new buffers will
+ * be allocated.
+ */
+ while (1) {
+ if (dm_bufio_cache_size_latch != 1) {
+ b = alloc_buffer(c, GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
+ if (b)
+ return b;
+ }
+
+ if (nf == NF_PREFETCH)
+ return NULL;
+
+ if (!list_empty(&c->reserved_buffers)) {
+ b = list_entry(c->reserved_buffers.next,
+ struct dm_buffer, lru_list);
+ list_del(&b->lru_list);
+ c->need_reserved_buffers++;
+
+ return b;
+ }
+
+ b = __get_unclaimed_buffer(c);
+ if (b)
+ return b;
+
+ __wait_for_free_buffer(c);
+ }
+}
+
+static struct dm_buffer *__alloc_buffer_wait(struct dm_bufio_client *c, enum new_flag nf)
+{
+ struct dm_buffer *b = __alloc_buffer_wait_no_callback(c, nf);
+
+ if (!b)
+ return NULL;
+
+ if (c->alloc_callback)
+ c->alloc_callback(b);
+
+ return b;
+}
+
+/*
+ * Free a buffer and wake other threads waiting for free buffers.
+ */
+static void __free_buffer_wake(struct dm_buffer *b)
+{
+ struct dm_bufio_client *c = b->c;
+
+ if (!c->need_reserved_buffers)
+ free_buffer(b);
+ else {
+ list_add(&b->lru_list, &c->reserved_buffers);
+ c->need_reserved_buffers--;
+ }
+
+ wake_up(&c->free_buffer_wait);
+}
+
+static void __write_dirty_buffers_async(struct dm_bufio_client *c, int no_wait,
+ struct list_head *write_list)
+{
+ struct dm_buffer *b, *tmp;
+
+ list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_DIRTY], lru_list) {
+ BUG_ON(test_bit(B_READING, &b->state));
+
+ if (!test_bit(B_DIRTY, &b->state) &&
+ !test_bit(B_WRITING, &b->state)) {
+ __relink_lru(b, LIST_CLEAN);
+ continue;
+ }
+
+ if (no_wait && test_bit(B_WRITING, &b->state))
+ return;
+
+ __write_dirty_buffer(b, write_list);
+ dm_bufio_cond_resched();
+ }
+}
+
+/*
+ * Get writeback threshold and buffer limit for a given client.
+ */
+static void __get_memory_limit(struct dm_bufio_client *c,
+ unsigned long *threshold_buffers,
+ unsigned long *limit_buffers)
+{
+ unsigned long buffers;
+
+ if (ACCESS_ONCE(dm_bufio_cache_size) != dm_bufio_cache_size_latch) {
+ mutex_lock(&dm_bufio_clients_lock);
+ __cache_size_refresh();
+ mutex_unlock(&dm_bufio_clients_lock);
+ }
+
+ buffers = dm_bufio_cache_size_per_client >>
+ (c->sectors_per_block_bits + SECTOR_SHIFT);
+
+ if (buffers < c->minimum_buffers)
+ buffers = c->minimum_buffers;
+
+ *limit_buffers = buffers;
+ *threshold_buffers = buffers * DM_BUFIO_WRITEBACK_PERCENT / 100;
+}
+
+/*
+ * Check if we're over watermark.
+ * If we are over threshold_buffers, start freeing buffers.
+ * If we're over "limit_buffers", block until we get under the limit.
+ */
+static void __check_watermark(struct dm_bufio_client *c,
+ struct list_head *write_list)
+{
+ unsigned long threshold_buffers, limit_buffers;
+
+ __get_memory_limit(c, &threshold_buffers, &limit_buffers);
+
+ while (c->n_buffers[LIST_CLEAN] + c->n_buffers[LIST_DIRTY] >
+ limit_buffers) {
+
+ struct dm_buffer *b = __get_unclaimed_buffer(c);
+
+ if (!b)
+ return;
+
+ __free_buffer_wake(b);
+ dm_bufio_cond_resched();
+ }
+
+ if (c->n_buffers[LIST_DIRTY] > threshold_buffers)
+ __write_dirty_buffers_async(c, 1, write_list);
+}
+
+/*----------------------------------------------------------------
+ * Getting a buffer
+ *--------------------------------------------------------------*/
+
+static struct dm_buffer *__bufio_new(struct dm_bufio_client *c, sector_t block,
+ enum new_flag nf, int *need_submit,
+ struct list_head *write_list)
+{
+ struct dm_buffer *b, *new_b = NULL;
+
+ *need_submit = 0;
+
+ b = __find(c, block);
+ if (b)
+ goto found_buffer;
+
+ if (nf == NF_GET)
+ return NULL;
+
+ new_b = __alloc_buffer_wait(c, nf);
+ if (!new_b)
+ return NULL;
+
+ /*
+ * We've had a period where the mutex was unlocked, so need to
+ * recheck the hash table.
+ */
+ b = __find(c, block);
+ if (b) {
+ __free_buffer_wake(new_b);
+ goto found_buffer;
+ }
+
+ __check_watermark(c, write_list);
+
+ b = new_b;
+ b->hold_count = 1;
+ b->read_error = 0;
+ b->write_error = 0;
+ __link_buffer(b, block, LIST_CLEAN);
+
+ if (nf == NF_FRESH) {
+ b->state = 0;
+ return b;
+ }
+
+ b->state = 1 << B_READING;
+ *need_submit = 1;
+
+ return b;
+
+found_buffer:
+ if (nf == NF_PREFETCH)
+ return NULL;
+ /*
+ * Note: it is essential that we don't wait for the buffer to be
+ * read if dm_bufio_get function is used. Both dm_bufio_get and
+ * dm_bufio_prefetch can be used in the driver request routine.
+ * If the user called both dm_bufio_prefetch and dm_bufio_get on
+ * the same buffer, it would deadlock if we waited.
+ */
+ if (nf == NF_GET && unlikely(test_bit(B_READING, &b->state)))
+ return NULL;
+
+ b->hold_count++;
+ __relink_lru(b, test_bit(B_DIRTY, &b->state) ||
+ test_bit(B_WRITING, &b->state));
+ return b;
+}
+
+/*
+ * The endio routine for reading: set the error, clear the bit and wake up
+ * anyone waiting on the buffer.
+ */
+static void read_endio(struct bio *bio, int error)
+{
+ struct dm_buffer *b = container_of(bio, struct dm_buffer, bio);
+
+ b->read_error = error;
+
+ BUG_ON(!test_bit(B_READING, &b->state));
+
+ smp_mb__before_atomic();
+ clear_bit(B_READING, &b->state);
+ smp_mb__after_atomic();
+
+ wake_up_bit(&b->state, B_READING);
+}
+
+/*
+ * A common routine for dm_bufio_new and dm_bufio_read. Operation of these
+ * functions is similar except that dm_bufio_new doesn't read the
+ * buffer from the disk (assuming that the caller overwrites all the data
+ * and uses dm_bufio_mark_buffer_dirty to write new data back).
+ */
+static void *new_read(struct dm_bufio_client *c, sector_t block,
+ enum new_flag nf, struct dm_buffer **bp)
+{
+ int need_submit;
+ struct dm_buffer *b;
+
+ LIST_HEAD(write_list);
+
+ dm_bufio_lock(c);
+ b = __bufio_new(c, block, nf, &need_submit, &write_list);
+ dm_bufio_unlock(c);
+
+ __flush_write_list(&write_list);
+
+ if (!b)
+ return b;
+
+ if (need_submit)
+ submit_io(b, READ, b->block, read_endio);
+
+ wait_on_bit_io(&b->state, B_READING, TASK_UNINTERRUPTIBLE);
+
+ if (b->read_error) {
+ int error = b->read_error;
+
+ dm_bufio_release(b);
+
+ return ERR_PTR(error);
+ }
+
+ *bp = b;
+
+ return b->data;
+}
+
+void *dm_bufio_get(struct dm_bufio_client *c, sector_t block,
+ struct dm_buffer **bp)
+{
+ return new_read(c, block, NF_GET, bp);
+}
+EXPORT_SYMBOL_GPL(dm_bufio_get);
+
+void *dm_bufio_read(struct dm_bufio_client *c, sector_t block,
+ struct dm_buffer **bp)
+{
+ BUG_ON(dm_bufio_in_request());
+
+ return new_read(c, block, NF_READ, bp);
+}
+EXPORT_SYMBOL_GPL(dm_bufio_read);
+
+void *dm_bufio_new(struct dm_bufio_client *c, sector_t block,
+ struct dm_buffer **bp)
+{
+ BUG_ON(dm_bufio_in_request());
+
+ return new_read(c, block, NF_FRESH, bp);
+}
+EXPORT_SYMBOL_GPL(dm_bufio_new);
+
+void dm_bufio_prefetch(struct dm_bufio_client *c,
+ sector_t block, unsigned n_blocks)
+{
+ struct blk_plug plug;
+
+ LIST_HEAD(write_list);
+
+ BUG_ON(dm_bufio_in_request());
+
+ blk_start_plug(&plug);
+ dm_bufio_lock(c);
+
+ for (; n_blocks--; block++) {
+ int need_submit;
+ struct dm_buffer *b;
+ b = __bufio_new(c, block, NF_PREFETCH, &need_submit,
+ &write_list);
+ if (unlikely(!list_empty(&write_list))) {
+ dm_bufio_unlock(c);
+ blk_finish_plug(&plug);
+ __flush_write_list(&write_list);
+ blk_start_plug(&plug);
+ dm_bufio_lock(c);
+ }
+ if (unlikely(b != NULL)) {
+ dm_bufio_unlock(c);
+
+ if (need_submit)
+ submit_io(b, READ, b->block, read_endio);
+ dm_bufio_release(b);
+
+ dm_bufio_cond_resched();
+
+ if (!n_blocks)
+ goto flush_plug;
+ dm_bufio_lock(c);
+ }
+ }
+
+ dm_bufio_unlock(c);
+
+flush_plug:
+ blk_finish_plug(&plug);
+}
+EXPORT_SYMBOL_GPL(dm_bufio_prefetch);
+
+void dm_bufio_release(struct dm_buffer *b)
+{
+ struct dm_bufio_client *c = b->c;
+
+ dm_bufio_lock(c);
+
+ BUG_ON(!b->hold_count);
+
+ b->hold_count--;
+ if (!b->hold_count) {
+ wake_up(&c->free_buffer_wait);
+
+ /*
+ * If there were errors on the buffer, and the buffer is not
+ * to be written, free the buffer. There is no point in caching
+ * invalid buffer.
+ */
+ if ((b->read_error || b->write_error) &&
+ !test_bit(B_READING, &b->state) &&
+ !test_bit(B_WRITING, &b->state) &&
+ !test_bit(B_DIRTY, &b->state)) {
+ __unlink_buffer(b);
+ __free_buffer_wake(b);
+ }
+ }
+
+ dm_bufio_unlock(c);
+}
+EXPORT_SYMBOL_GPL(dm_bufio_release);
+
+void dm_bufio_mark_buffer_dirty(struct dm_buffer *b)
+{
+ struct dm_bufio_client *c = b->c;
+
+ dm_bufio_lock(c);
+
+ BUG_ON(test_bit(B_READING, &b->state));
+
+ if (!test_and_set_bit(B_DIRTY, &b->state))
+ __relink_lru(b, LIST_DIRTY);
+
+ dm_bufio_unlock(c);
+}
+EXPORT_SYMBOL_GPL(dm_bufio_mark_buffer_dirty);
+
+void dm_bufio_write_dirty_buffers_async(struct dm_bufio_client *c)
+{
+ LIST_HEAD(write_list);
+
+ BUG_ON(dm_bufio_in_request());
+
+ dm_bufio_lock(c);
+ __write_dirty_buffers_async(c, 0, &write_list);
+ dm_bufio_unlock(c);
+ __flush_write_list(&write_list);
+}
+EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers_async);
+
+/*
+ * For performance, it is essential that the buffers are written asynchronously
+ * and simultaneously (so that the block layer can merge the writes) and then
+ * waited upon.
+ *
+ * Finally, we flush hardware disk cache.
+ */
+int dm_bufio_write_dirty_buffers(struct dm_bufio_client *c)
+{
+ int a, f;
+ unsigned long buffers_processed = 0;
+ struct dm_buffer *b, *tmp;
+
+ LIST_HEAD(write_list);
+
+ dm_bufio_lock(c);
+ __write_dirty_buffers_async(c, 0, &write_list);
+ dm_bufio_unlock(c);
+ __flush_write_list(&write_list);
+ dm_bufio_lock(c);
+
+again:
+ list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_DIRTY], lru_list) {
+ int dropped_lock = 0;
+
+ if (buffers_processed < c->n_buffers[LIST_DIRTY])
+ buffers_processed++;
+
+ BUG_ON(test_bit(B_READING, &b->state));
+
+ if (test_bit(B_WRITING, &b->state)) {
+ if (buffers_processed < c->n_buffers[LIST_DIRTY]) {
+ dropped_lock = 1;
+ b->hold_count++;
+ dm_bufio_unlock(c);
+ wait_on_bit_io(&b->state, B_WRITING,
+ TASK_UNINTERRUPTIBLE);
+ dm_bufio_lock(c);
+ b->hold_count--;
+ } else
+ wait_on_bit_io(&b->state, B_WRITING,
+ TASK_UNINTERRUPTIBLE);
+ }
+
+ if (!test_bit(B_DIRTY, &b->state) &&
+ !test_bit(B_WRITING, &b->state))
+ __relink_lru(b, LIST_CLEAN);
+
+ dm_bufio_cond_resched();
+
+ /*
+ * If we dropped the lock, the list is no longer consistent,
+ * so we must restart the search.
+ *
+ * In the most common case, the buffer just processed is
+ * relinked to the clean list, so we won't loop scanning the
+ * same buffer again and again.
+ *
+ * This may livelock if there is another thread simultaneously
+ * dirtying buffers, so we count the number of buffers walked
+ * and if it exceeds the total number of buffers, it means that
+ * someone is doing some writes simultaneously with us. In
+ * this case, stop, dropping the lock.
+ */
+ if (dropped_lock)
+ goto again;
+ }
+ wake_up(&c->free_buffer_wait);
+ dm_bufio_unlock(c);
+
+ a = xchg(&c->async_write_error, 0);
+ f = dm_bufio_issue_flush(c);
+ if (a)
+ return a;
+
+ return f;
+}
+EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers);
+
+/*
+ * Use dm-io to send and empty barrier flush the device.
+ */
+int dm_bufio_issue_flush(struct dm_bufio_client *c)
+{
+ struct dm_io_request io_req = {
+ .bi_rw = WRITE_FLUSH,
+ .mem.type = DM_IO_KMEM,
+ .mem.ptr.addr = NULL,
+ .client = c->dm_io,
+ };
+ struct dm_io_region io_reg = {
+ .bdev = c->bdev,
+ .sector = 0,
+ .count = 0,
+ };
+
+ BUG_ON(dm_bufio_in_request());
+
+ return dm_io(&io_req, 1, &io_reg, NULL);
+}
+EXPORT_SYMBOL_GPL(dm_bufio_issue_flush);
+
+/*
+ * We first delete any other buffer that may be at that new location.
+ *
+ * Then, we write the buffer to the original location if it was dirty.
+ *
+ * Then, if we are the only one who is holding the buffer, relink the buffer
+ * in the hash queue for the new location.
+ *
+ * If there was someone else holding the buffer, we write it to the new
+ * location but not relink it, because that other user needs to have the buffer
+ * at the same place.
+ */
+void dm_bufio_release_move(struct dm_buffer *b, sector_t new_block)
+{
+ struct dm_bufio_client *c = b->c;
+ struct dm_buffer *new;
+
+ BUG_ON(dm_bufio_in_request());
+
+ dm_bufio_lock(c);
+
+retry:
+ new = __find(c, new_block);
+ if (new) {
+ if (new->hold_count) {
+ __wait_for_free_buffer(c);
+ goto retry;
+ }
+
+ /*
+ * FIXME: Is there any point waiting for a write that's going
+ * to be overwritten in a bit?
+ */
+ __make_buffer_clean(new);
+ __unlink_buffer(new);
+ __free_buffer_wake(new);
+ }
+
+ BUG_ON(!b->hold_count);
+ BUG_ON(test_bit(B_READING, &b->state));
+
+ __write_dirty_buffer(b, NULL);
+ if (b->hold_count == 1) {
+ wait_on_bit_io(&b->state, B_WRITING,
+ TASK_UNINTERRUPTIBLE);
+ set_bit(B_DIRTY, &b->state);
+ __unlink_buffer(b);
+ __link_buffer(b, new_block, LIST_DIRTY);
+ } else {
+ sector_t old_block;
+ wait_on_bit_lock_io(&b->state, B_WRITING,
+ TASK_UNINTERRUPTIBLE);
+ /*
+ * Relink buffer to "new_block" so that write_callback
+ * sees "new_block" as a block number.
+ * After the write, link the buffer back to old_block.
+ * All this must be done in bufio lock, so that block number
+ * change isn't visible to other threads.
+ */
+ old_block = b->block;
+ __unlink_buffer(b);
+ __link_buffer(b, new_block, b->list_mode);
+ submit_io(b, WRITE, new_block, write_endio);
+ wait_on_bit_io(&b->state, B_WRITING,
+ TASK_UNINTERRUPTIBLE);
+ __unlink_buffer(b);
+ __link_buffer(b, old_block, b->list_mode);
+ }
+
+ dm_bufio_unlock(c);
+ dm_bufio_release(b);
+}
+EXPORT_SYMBOL_GPL(dm_bufio_release_move);
+
+/*
+ * Free the given buffer.
+ *
+ * This is just a hint, if the buffer is in use or dirty, this function
+ * does nothing.
+ */
+void dm_bufio_forget(struct dm_bufio_client *c, sector_t block)
+{
+ struct dm_buffer *b;
+
+ dm_bufio_lock(c);
+
+ b = __find(c, block);
+ if (b && likely(!b->hold_count) && likely(!b->state)) {
+ __unlink_buffer(b);
+ __free_buffer_wake(b);
+ }
+
+ dm_bufio_unlock(c);
+}
+EXPORT_SYMBOL(dm_bufio_forget);
+
+void dm_bufio_set_minimum_buffers(struct dm_bufio_client *c, unsigned n)
+{
+ c->minimum_buffers = n;
+}
+EXPORT_SYMBOL(dm_bufio_set_minimum_buffers);
+
+unsigned dm_bufio_get_block_size(struct dm_bufio_client *c)
+{
+ return c->block_size;
+}
+EXPORT_SYMBOL_GPL(dm_bufio_get_block_size);
+
+sector_t dm_bufio_get_device_size(struct dm_bufio_client *c)
+{
+ return i_size_read(c->bdev->bd_inode) >>
+ (SECTOR_SHIFT + c->sectors_per_block_bits);
+}
+EXPORT_SYMBOL_GPL(dm_bufio_get_device_size);
+
+sector_t dm_bufio_get_block_number(struct dm_buffer *b)
+{
+ return b->block;
+}
+EXPORT_SYMBOL_GPL(dm_bufio_get_block_number);
+
+void *dm_bufio_get_block_data(struct dm_buffer *b)
+{
+ return b->data;
+}
+EXPORT_SYMBOL_GPL(dm_bufio_get_block_data);
+
+void *dm_bufio_get_aux_data(struct dm_buffer *b)
+{
+ return b + 1;
+}
+EXPORT_SYMBOL_GPL(dm_bufio_get_aux_data);
+
+struct dm_bufio_client *dm_bufio_get_client(struct dm_buffer *b)
+{
+ return b->c;
+}
+EXPORT_SYMBOL_GPL(dm_bufio_get_client);
+
+static void drop_buffers(struct dm_bufio_client *c)
+{
+ struct dm_buffer *b;
+ int i;
+
+ BUG_ON(dm_bufio_in_request());
+
+ /*
+ * An optimization so that the buffers are not written one-by-one.
+ */
+ dm_bufio_write_dirty_buffers_async(c);
+
+ dm_bufio_lock(c);
+
+ while ((b = __get_unclaimed_buffer(c)))
+ __free_buffer_wake(b);
+
+ for (i = 0; i < LIST_SIZE; i++)
+ list_for_each_entry(b, &c->lru[i], lru_list)
+ DMERR("leaked buffer %llx, hold count %u, list %d",
+ (unsigned long long)b->block, b->hold_count, i);
+
+ for (i = 0; i < LIST_SIZE; i++)
+ BUG_ON(!list_empty(&c->lru[i]));
+
+ dm_bufio_unlock(c);
+}
+
+/*
+ * We may not be able to evict this buffer if IO pending or the client
+ * is still using it. Caller is expected to know buffer is too old.
+ *
+ * And if GFP_NOFS is used, we must not do any I/O because we hold
+ * dm_bufio_clients_lock and we would risk deadlock if the I/O gets
+ * rerouted to different bufio client.
+ */
+static bool __try_evict_buffer(struct dm_buffer *b, gfp_t gfp)
+{
+ if (!(gfp & __GFP_FS)) {
+ if (test_bit(B_READING, &b->state) ||
+ test_bit(B_WRITING, &b->state) ||
+ test_bit(B_DIRTY, &b->state))
+ return false;
+ }
+
+ if (b->hold_count)
+ return false;
+
+ __make_buffer_clean(b);
+ __unlink_buffer(b);
+ __free_buffer_wake(b);
+
+ return true;
+}
+
+static unsigned get_retain_buffers(struct dm_bufio_client *c)
+{
+ unsigned retain_bytes = ACCESS_ONCE(dm_bufio_retain_bytes);
+ return retain_bytes / c->block_size;
+}
+
+static unsigned long __scan(struct dm_bufio_client *c, unsigned long nr_to_scan,
+ gfp_t gfp_mask)
+{
+ int l;
+ struct dm_buffer *b, *tmp;
+ unsigned long freed = 0;
+ unsigned long count = nr_to_scan;
+ unsigned retain_target = get_retain_buffers(c);
+
+ for (l = 0; l < LIST_SIZE; l++) {
+ list_for_each_entry_safe_reverse(b, tmp, &c->lru[l], lru_list) {
+ if (__try_evict_buffer(b, gfp_mask))
+ freed++;
+ if (!--nr_to_scan || ((count - freed) <= retain_target))
+ return freed;
+ dm_bufio_cond_resched();
+ }
+ }
+ return freed;
+}
+
+static unsigned long
+dm_bufio_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
+{
+ struct dm_bufio_client *c;
+ unsigned long freed;
+
+ c = container_of(shrink, struct dm_bufio_client, shrinker);
+ if (sc->gfp_mask & __GFP_FS)
+ dm_bufio_lock(c);
+ else if (!dm_bufio_trylock(c))
+ return SHRINK_STOP;
+
+ freed = __scan(c, sc->nr_to_scan, sc->gfp_mask);
+ dm_bufio_unlock(c);
+ return freed;
+}
+
+static unsigned long
+dm_bufio_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
+{
+ struct dm_bufio_client *c;
+ unsigned long count;
+
+ c = container_of(shrink, struct dm_bufio_client, shrinker);
+ if (sc->gfp_mask & __GFP_FS)
+ dm_bufio_lock(c);
+ else if (!dm_bufio_trylock(c))
+ return 0;
+
+ count = c->n_buffers[LIST_CLEAN] + c->n_buffers[LIST_DIRTY];
+ dm_bufio_unlock(c);
+ return count;
+}
+
+/*
+ * Create the buffering interface
+ */
+struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsigned block_size,
+ unsigned reserved_buffers, unsigned aux_size,
+ void (*alloc_callback)(struct dm_buffer *),
+ void (*write_callback)(struct dm_buffer *))
+{
+ int r;
+ struct dm_bufio_client *c;
+ unsigned i;
+
+ BUG_ON(block_size < 1 << SECTOR_SHIFT ||
+ (block_size & (block_size - 1)));
+
+ c = kzalloc(sizeof(*c), GFP_KERNEL);
+ if (!c) {
+ r = -ENOMEM;
+ goto bad_client;
+ }
+ c->buffer_tree = RB_ROOT;
+
+ c->bdev = bdev;
+ c->block_size = block_size;
+ c->sectors_per_block_bits = ffs(block_size) - 1 - SECTOR_SHIFT;
+ c->pages_per_block_bits = (ffs(block_size) - 1 >= PAGE_SHIFT) ?
+ ffs(block_size) - 1 - PAGE_SHIFT : 0;
+ c->blocks_per_page_bits = (ffs(block_size) - 1 < PAGE_SHIFT ?
+ PAGE_SHIFT - (ffs(block_size) - 1) : 0);
+
+ c->aux_size = aux_size;
+ c->alloc_callback = alloc_callback;
+ c->write_callback = write_callback;
+
+ for (i = 0; i < LIST_SIZE; i++) {
+ INIT_LIST_HEAD(&c->lru[i]);
+ c->n_buffers[i] = 0;
+ }
+
+ mutex_init(&c->lock);
+ INIT_LIST_HEAD(&c->reserved_buffers);
+ c->need_reserved_buffers = reserved_buffers;
+
+ c->minimum_buffers = DM_BUFIO_MIN_BUFFERS;
+
+ init_waitqueue_head(&c->free_buffer_wait);
+ c->async_write_error = 0;
+
+ c->dm_io = dm_io_client_create();
+ if (IS_ERR(c->dm_io)) {
+ r = PTR_ERR(c->dm_io);
+ goto bad_dm_io;
+ }
+
+ mutex_lock(&dm_bufio_clients_lock);
+ if (c->blocks_per_page_bits) {
+ if (!DM_BUFIO_CACHE_NAME(c)) {
+ DM_BUFIO_CACHE_NAME(c) = kasprintf(GFP_KERNEL, "dm_bufio_cache-%u", c->block_size);
+ if (!DM_BUFIO_CACHE_NAME(c)) {
+ r = -ENOMEM;
+ mutex_unlock(&dm_bufio_clients_lock);
+ goto bad_cache;
+ }
+ }
+
+ if (!DM_BUFIO_CACHE(c)) {
+ DM_BUFIO_CACHE(c) = kmem_cache_create(DM_BUFIO_CACHE_NAME(c),
+ c->block_size,
+ c->block_size, 0, NULL);
+ if (!DM_BUFIO_CACHE(c)) {
+ r = -ENOMEM;
+ mutex_unlock(&dm_bufio_clients_lock);
+ goto bad_cache;
+ }
+ }
+ }
+ mutex_unlock(&dm_bufio_clients_lock);
+
+ while (c->need_reserved_buffers) {
+ struct dm_buffer *b = alloc_buffer(c, GFP_KERNEL);
+
+ if (!b) {
+ r = -ENOMEM;
+ goto bad_buffer;
+ }
+ __free_buffer_wake(b);
+ }
+
+ mutex_lock(&dm_bufio_clients_lock);
+ dm_bufio_client_count++;
+ list_add(&c->client_list, &dm_bufio_all_clients);
+ __cache_size_refresh();
+ mutex_unlock(&dm_bufio_clients_lock);
+
+ c->shrinker.count_objects = dm_bufio_shrink_count;
+ c->shrinker.scan_objects = dm_bufio_shrink_scan;
+ c->shrinker.seeks = 1;
+ c->shrinker.batch = 0;
+ register_shrinker(&c->shrinker);
+
+ return c;
+
+bad_buffer:
+bad_cache:
+ while (!list_empty(&c->reserved_buffers)) {
+ struct dm_buffer *b = list_entry(c->reserved_buffers.next,
+ struct dm_buffer, lru_list);
+ list_del(&b->lru_list);
+ free_buffer(b);
+ }
+ dm_io_client_destroy(c->dm_io);
+bad_dm_io:
+ kfree(c);
+bad_client:
+ return ERR_PTR(r);
+}
+EXPORT_SYMBOL_GPL(dm_bufio_client_create);
+
+/*
+ * Free the buffering interface.
+ * It is required that there are no references on any buffers.
+ */
+void dm_bufio_client_destroy(struct dm_bufio_client *c)
+{
+ unsigned i;
+
+ drop_buffers(c);
+
+ unregister_shrinker(&c->shrinker);
+
+ mutex_lock(&dm_bufio_clients_lock);
+
+ list_del(&c->client_list);
+ dm_bufio_client_count--;
+ __cache_size_refresh();
+
+ mutex_unlock(&dm_bufio_clients_lock);
+
+ BUG_ON(!RB_EMPTY_ROOT(&c->buffer_tree));
+ BUG_ON(c->need_reserved_buffers);
+
+ while (!list_empty(&c->reserved_buffers)) {
+ struct dm_buffer *b = list_entry(c->reserved_buffers.next,
+ struct dm_buffer, lru_list);
+ list_del(&b->lru_list);
+ free_buffer(b);
+ }
+
+ for (i = 0; i < LIST_SIZE; i++)
+ if (c->n_buffers[i])
+ DMERR("leaked buffer count %d: %ld", i, c->n_buffers[i]);
+
+ for (i = 0; i < LIST_SIZE; i++)
+ BUG_ON(c->n_buffers[i]);
+
+ dm_io_client_destroy(c->dm_io);
+ kfree(c);
+}
+EXPORT_SYMBOL_GPL(dm_bufio_client_destroy);
+
+static unsigned get_max_age_hz(void)
+{
+ unsigned max_age = ACCESS_ONCE(dm_bufio_max_age);
+
+ if (max_age > UINT_MAX / HZ)
+ max_age = UINT_MAX / HZ;
+
+ return max_age * HZ;
+}
+
+static bool older_than(struct dm_buffer *b, unsigned long age_hz)
+{
+ return time_after_eq(jiffies, b->last_accessed + age_hz);
+}
+
+static void __evict_old_buffers(struct dm_bufio_client *c, unsigned long age_hz)
+{
+ struct dm_buffer *b, *tmp;
+ unsigned retain_target = get_retain_buffers(c);
+ unsigned count;
+
+ dm_bufio_lock(c);
+
+ count = c->n_buffers[LIST_CLEAN] + c->n_buffers[LIST_DIRTY];
+ list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_CLEAN], lru_list) {
+ if (count <= retain_target)
+ break;
+
+ if (!older_than(b, age_hz))
+ break;
+
+ if (__try_evict_buffer(b, 0))
+ count--;
+
+ dm_bufio_cond_resched();
+ }
+
+ dm_bufio_unlock(c);
+}
+
+static void cleanup_old_buffers(void)
+{
+ unsigned long max_age_hz = get_max_age_hz();
+ struct dm_bufio_client *c;
+
+ mutex_lock(&dm_bufio_clients_lock);
+
+ list_for_each_entry(c, &dm_bufio_all_clients, client_list)
+ __evict_old_buffers(c, max_age_hz);
+
+ mutex_unlock(&dm_bufio_clients_lock);
+}
+
+static struct workqueue_struct *dm_bufio_wq;
+static struct delayed_work dm_bufio_work;
+
+static void work_fn(struct work_struct *w)
+{
+ cleanup_old_buffers();
+
+ queue_delayed_work(dm_bufio_wq, &dm_bufio_work,
+ DM_BUFIO_WORK_TIMER_SECS * HZ);
+}
+
+/*----------------------------------------------------------------
+ * Module setup
+ *--------------------------------------------------------------*/
+
+/*
+ * This is called only once for the whole dm_bufio module.
+ * It initializes memory limit.
+ */
+static int __init dm_bufio_init(void)
+{
+ __u64 mem;
+
+ dm_bufio_allocated_kmem_cache = 0;
+ dm_bufio_allocated_get_free_pages = 0;
+ dm_bufio_allocated_vmalloc = 0;
+ dm_bufio_current_allocated = 0;
+
+ memset(&dm_bufio_caches, 0, sizeof dm_bufio_caches);
+ memset(&dm_bufio_cache_names, 0, sizeof dm_bufio_cache_names);
+
+ mem = (__u64)((totalram_pages - totalhigh_pages) *
+ DM_BUFIO_MEMORY_PERCENT / 100) << PAGE_SHIFT;
+
+ if (mem > ULONG_MAX)
+ mem = ULONG_MAX;
+
+#ifdef CONFIG_MMU
+ /*
+ * Get the size of vmalloc space the same way as VMALLOC_TOTAL
+ * in fs/proc/internal.h
+ */
+ if (mem > (VMALLOC_END - VMALLOC_START) * DM_BUFIO_VMALLOC_PERCENT / 100)
+ mem = (VMALLOC_END - VMALLOC_START) * DM_BUFIO_VMALLOC_PERCENT / 100;
+#endif
+
+ dm_bufio_default_cache_size = mem;
+
+ mutex_lock(&dm_bufio_clients_lock);
+ __cache_size_refresh();
+ mutex_unlock(&dm_bufio_clients_lock);
+
+ dm_bufio_wq = create_singlethread_workqueue("dm_bufio_cache");
+ if (!dm_bufio_wq)
+ return -ENOMEM;
+
+ INIT_DELAYED_WORK(&dm_bufio_work, work_fn);
+ queue_delayed_work(dm_bufio_wq, &dm_bufio_work,
+ DM_BUFIO_WORK_TIMER_SECS * HZ);
+
+ return 0;
+}
+
+/*
+ * This is called once when unloading the dm_bufio module.
+ */
+static void __exit dm_bufio_exit(void)
+{
+ int bug = 0;
+ int i;
+
+ cancel_delayed_work_sync(&dm_bufio_work);
+ destroy_workqueue(dm_bufio_wq);
+
+ for (i = 0; i < ARRAY_SIZE(dm_bufio_caches); i++) {
+ struct kmem_cache *kc = dm_bufio_caches[i];
+
+ if (kc)
+ kmem_cache_destroy(kc);
+ }
+
+ for (i = 0; i < ARRAY_SIZE(dm_bufio_cache_names); i++)
+ kfree(dm_bufio_cache_names[i]);
+
+ if (dm_bufio_client_count) {
+ DMCRIT("%s: dm_bufio_client_count leaked: %d",
+ __func__, dm_bufio_client_count);
+ bug = 1;
+ }
+
+ if (dm_bufio_current_allocated) {
+ DMCRIT("%s: dm_bufio_current_allocated leaked: %lu",
+ __func__, dm_bufio_current_allocated);
+ bug = 1;
+ }
+
+ if (dm_bufio_allocated_get_free_pages) {
+ DMCRIT("%s: dm_bufio_allocated_get_free_pages leaked: %lu",
+ __func__, dm_bufio_allocated_get_free_pages);
+ bug = 1;
+ }
+
+ if (dm_bufio_allocated_vmalloc) {
+ DMCRIT("%s: dm_bufio_vmalloc leaked: %lu",
+ __func__, dm_bufio_allocated_vmalloc);
+ bug = 1;
+ }
+
+ if (bug)
+ BUG();
+}
+
+module_init(dm_bufio_init)
+module_exit(dm_bufio_exit)
+
+module_param_named(max_cache_size_bytes, dm_bufio_cache_size, ulong, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(max_cache_size_bytes, "Size of metadata cache");
+
+module_param_named(max_age_seconds, dm_bufio_max_age, uint, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(max_age_seconds, "Max age of a buffer in seconds");
+
+module_param_named(retain_bytes, dm_bufio_retain_bytes, uint, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(retain_bytes, "Try to keep at least this many bytes cached in memory");
+
+module_param_named(peak_allocated_bytes, dm_bufio_peak_allocated, ulong, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(peak_allocated_bytes, "Tracks the maximum allocated memory");
+
+module_param_named(allocated_kmem_cache_bytes, dm_bufio_allocated_kmem_cache, ulong, S_IRUGO);
+MODULE_PARM_DESC(allocated_kmem_cache_bytes, "Memory allocated with kmem_cache_alloc");
+
+module_param_named(allocated_get_free_pages_bytes, dm_bufio_allocated_get_free_pages, ulong, S_IRUGO);
+MODULE_PARM_DESC(allocated_get_free_pages_bytes, "Memory allocated with get_free_pages");
+
+module_param_named(allocated_vmalloc_bytes, dm_bufio_allocated_vmalloc, ulong, S_IRUGO);
+MODULE_PARM_DESC(allocated_vmalloc_bytes, "Memory allocated with vmalloc");
+
+module_param_named(current_allocated_bytes, dm_bufio_current_allocated, ulong, S_IRUGO);
+MODULE_PARM_DESC(current_allocated_bytes, "Memory currently used by the cache");
+
+MODULE_AUTHOR("Mikulas Patocka <dm-devel@redhat.com>");
+MODULE_DESCRIPTION(DM_NAME " buffered I/O library");
+MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm-bufio.h b/drivers/md/dm-bufio.h
new file mode 100644
index 000000000..c096779a7
--- /dev/null
+++ b/drivers/md/dm-bufio.h
@@ -0,0 +1,132 @@
+/*
+ * Copyright (C) 2009-2011 Red Hat, Inc.
+ *
+ * Author: Mikulas Patocka <mpatocka@redhat.com>
+ *
+ * This file is released under the GPL.
+ */
+
+#ifndef DM_BUFIO_H
+#define DM_BUFIO_H
+
+#include <linux/blkdev.h>
+#include <linux/types.h>
+
+/*----------------------------------------------------------------*/
+
+struct dm_bufio_client;
+struct dm_buffer;
+
+/*
+ * Create a buffered IO cache on a given device
+ */
+struct dm_bufio_client *
+dm_bufio_client_create(struct block_device *bdev, unsigned block_size,
+ unsigned reserved_buffers, unsigned aux_size,
+ void (*alloc_callback)(struct dm_buffer *),
+ void (*write_callback)(struct dm_buffer *));
+
+/*
+ * Release a buffered IO cache.
+ */
+void dm_bufio_client_destroy(struct dm_bufio_client *c);
+
+/*
+ * WARNING: to avoid deadlocks, these conditions are observed:
+ *
+ * - At most one thread can hold at most "reserved_buffers" simultaneously.
+ * - Each other threads can hold at most one buffer.
+ * - Threads which call only dm_bufio_get can hold unlimited number of
+ * buffers.
+ */
+
+/*
+ * Read a given block from disk. Returns pointer to data. Returns a
+ * pointer to dm_buffer that can be used to release the buffer or to make
+ * it dirty.
+ */
+void *dm_bufio_read(struct dm_bufio_client *c, sector_t block,
+ struct dm_buffer **bp);
+
+/*
+ * Like dm_bufio_read, but return buffer from cache, don't read
+ * it. If the buffer is not in the cache, return NULL.
+ */
+void *dm_bufio_get(struct dm_bufio_client *c, sector_t block,
+ struct dm_buffer **bp);
+
+/*
+ * Like dm_bufio_read, but don't read anything from the disk. It is
+ * expected that the caller initializes the buffer and marks it dirty.
+ */
+void *dm_bufio_new(struct dm_bufio_client *c, sector_t block,
+ struct dm_buffer **bp);
+
+/*
+ * Prefetch the specified blocks to the cache.
+ * The function starts to read the blocks and returns without waiting for
+ * I/O to finish.
+ */
+void dm_bufio_prefetch(struct dm_bufio_client *c,
+ sector_t block, unsigned n_blocks);
+
+/*
+ * Release a reference obtained with dm_bufio_{read,get,new}. The data
+ * pointer and dm_buffer pointer is no longer valid after this call.
+ */
+void dm_bufio_release(struct dm_buffer *b);
+
+/*
+ * Mark a buffer dirty. It should be called after the buffer is modified.
+ *
+ * In case of memory pressure, the buffer may be written after
+ * dm_bufio_mark_buffer_dirty, but before dm_bufio_write_dirty_buffers. So
+ * dm_bufio_write_dirty_buffers guarantees that the buffer is on-disk but
+ * the actual writing may occur earlier.
+ */
+void dm_bufio_mark_buffer_dirty(struct dm_buffer *b);
+
+/*
+ * Initiate writing of dirty buffers, without waiting for completion.
+ */
+void dm_bufio_write_dirty_buffers_async(struct dm_bufio_client *c);
+
+/*
+ * Write all dirty buffers. Guarantees that all dirty buffers created prior
+ * to this call are on disk when this call exits.
+ */
+int dm_bufio_write_dirty_buffers(struct dm_bufio_client *c);
+
+/*
+ * Send an empty write barrier to the device to flush hardware disk cache.
+ */
+int dm_bufio_issue_flush(struct dm_bufio_client *c);
+
+/*
+ * Like dm_bufio_release but also move the buffer to the new
+ * block. dm_bufio_write_dirty_buffers is needed to commit the new block.
+ */
+void dm_bufio_release_move(struct dm_buffer *b, sector_t new_block);
+
+/*
+ * Free the given buffer.
+ * This is just a hint, if the buffer is in use or dirty, this function
+ * does nothing.
+ */
+void dm_bufio_forget(struct dm_bufio_client *c, sector_t block);
+
+/*
+ * Set the minimum number of buffers before cleanup happens.
+ */
+void dm_bufio_set_minimum_buffers(struct dm_bufio_client *c, unsigned n);
+
+unsigned dm_bufio_get_block_size(struct dm_bufio_client *c);
+sector_t dm_bufio_get_device_size(struct dm_bufio_client *c);
+sector_t dm_bufio_get_block_number(struct dm_buffer *b);
+void *dm_bufio_get_block_data(struct dm_buffer *b);
+void *dm_bufio_get_aux_data(struct dm_buffer *b);
+struct dm_bufio_client *dm_bufio_get_client(struct dm_buffer *b);
+
+/*----------------------------------------------------------------*/
+
+#endif
diff --git a/drivers/md/dm-builtin.c b/drivers/md/dm-builtin.c
new file mode 100644
index 000000000..6c9049c51
--- /dev/null
+++ b/drivers/md/dm-builtin.c
@@ -0,0 +1,48 @@
+#include "dm.h"
+
+/*
+ * The kobject release method must not be placed in the module itself,
+ * otherwise we are subject to module unload races.
+ *
+ * The release method is called when the last reference to the kobject is
+ * dropped. It may be called by any other kernel code that drops the last
+ * reference.
+ *
+ * The release method suffers from module unload race. We may prevent the
+ * module from being unloaded at the start of the release method (using
+ * increased module reference count or synchronizing against the release
+ * method), however there is no way to prevent the module from being
+ * unloaded at the end of the release method.
+ *
+ * If this code were placed in the dm module, the following race may
+ * happen:
+ * 1. Some other process takes a reference to dm kobject
+ * 2. The user issues ioctl function to unload the dm device
+ * 3. dm_sysfs_exit calls kobject_put, however the object is not released
+ * because of the other reference taken at step 1
+ * 4. dm_sysfs_exit waits on the completion
+ * 5. The other process that took the reference in step 1 drops it,
+ * dm_kobject_release is called from this process
+ * 6. dm_kobject_release calls complete()
+ * 7. a reschedule happens before dm_kobject_release returns
+ * 8. dm_sysfs_exit continues, the dm device is unloaded, module reference
+ * count is decremented
+ * 9. The user unloads the dm module
+ * 10. The other process that was rescheduled in step 7 continues to run,
+ * it is now executing code in unloaded module, so it crashes
+ *
+ * Note that if the process that takes the foreign reference to dm kobject
+ * has a low priority and the system is sufficiently loaded with
+ * higher-priority processes that prevent the low-priority process from
+ * being scheduled long enough, this bug may really happen.
+ *
+ * In order to fix this module unload race, we place the release method
+ * into a helper code that is compiled directly into the kernel.
+ */
+
+void dm_kobject_release(struct kobject *kobj)
+{
+ complete(dm_get_completion_from_kobject(kobj));
+}
+
+EXPORT_SYMBOL(dm_kobject_release);
diff --git a/drivers/md/dm-cache-block-types.h b/drivers/md/dm-cache-block-types.h
new file mode 100644
index 000000000..bed4ad4e1
--- /dev/null
+++ b/drivers/md/dm-cache-block-types.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (C) 2012 Red Hat, Inc.
+ *
+ * This file is released under the GPL.
+ */
+
+#ifndef DM_CACHE_BLOCK_TYPES_H
+#define DM_CACHE_BLOCK_TYPES_H
+
+#include "persistent-data/dm-block-manager.h"
+
+/*----------------------------------------------------------------*/
+
+/*
+ * It's helpful to get sparse to differentiate between indexes into the
+ * origin device, indexes into the cache device, and indexes into the
+ * discard bitset.
+ */
+
+typedef dm_block_t __bitwise__ dm_oblock_t;
+typedef uint32_t __bitwise__ dm_cblock_t;
+typedef dm_block_t __bitwise__ dm_dblock_t;
+
+static inline dm_oblock_t to_oblock(dm_block_t b)
+{
+ return (__force dm_oblock_t) b;
+}
+
+static inline dm_block_t from_oblock(dm_oblock_t b)
+{
+ return (__force dm_block_t) b;
+}
+
+static inline dm_cblock_t to_cblock(uint32_t b)
+{
+ return (__force dm_cblock_t) b;
+}
+
+static inline uint32_t from_cblock(dm_cblock_t b)
+{
+ return (__force uint32_t) b;
+}
+
+static inline dm_dblock_t to_dblock(dm_block_t b)
+{
+ return (__force dm_dblock_t) b;
+}
+
+static inline dm_block_t from_dblock(dm_dblock_t b)
+{
+ return (__force dm_block_t) b;
+}
+
+#endif /* DM_CACHE_BLOCK_TYPES_H */
diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c
new file mode 100644
index 000000000..c1c010498
--- /dev/null
+++ b/drivers/md/dm-cache-metadata.c
@@ -0,0 +1,1389 @@
+/*
+ * Copyright (C) 2012 Red Hat, Inc.
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm-cache-metadata.h"
+
+#include "persistent-data/dm-array.h"
+#include "persistent-data/dm-bitset.h"
+#include "persistent-data/dm-space-map.h"
+#include "persistent-data/dm-space-map-disk.h"
+#include "persistent-data/dm-transaction-manager.h"
+
+#include <linux/device-mapper.h>
+
+/*----------------------------------------------------------------*/
+
+#define DM_MSG_PREFIX "cache metadata"
+
+#define CACHE_SUPERBLOCK_MAGIC 06142003
+#define CACHE_SUPERBLOCK_LOCATION 0
+
+/*
+ * defines a range of metadata versions that this module can handle.
+ */
+#define MIN_CACHE_VERSION 1
+#define MAX_CACHE_VERSION 1
+
+#define CACHE_METADATA_CACHE_SIZE 64
+
+/*
+ * 3 for btree insert +
+ * 2 for btree lookup used within space map
+ */
+#define CACHE_MAX_CONCURRENT_LOCKS 5
+#define SPACE_MAP_ROOT_SIZE 128
+
+enum superblock_flag_bits {
+ /* for spotting crashes that would invalidate the dirty bitset */
+ CLEAN_SHUTDOWN,
+};
+
+/*
+ * Each mapping from cache block -> origin block carries a set of flags.
+ */
+enum mapping_bits {
+ /*
+ * A valid mapping. Because we're using an array we clear this
+ * flag for an non existant mapping.
+ */
+ M_VALID = 1,
+
+ /*
+ * The data on the cache is different from that on the origin.
+ */
+ M_DIRTY = 2
+};
+
+struct cache_disk_superblock {
+ __le32 csum;
+ __le32 flags;
+ __le64 blocknr;
+
+ __u8 uuid[16];
+ __le64 magic;
+ __le32 version;
+
+ __u8 policy_name[CACHE_POLICY_NAME_SIZE];
+ __le32 policy_hint_size;
+
+ __u8 metadata_space_map_root[SPACE_MAP_ROOT_SIZE];
+ __le64 mapping_root;
+ __le64 hint_root;
+
+ __le64 discard_root;
+ __le64 discard_block_size;
+ __le64 discard_nr_blocks;
+
+ __le32 data_block_size;
+ __le32 metadata_block_size;
+ __le32 cache_blocks;
+
+ __le32 compat_flags;
+ __le32 compat_ro_flags;
+ __le32 incompat_flags;
+
+ __le32 read_hits;
+ __le32 read_misses;
+ __le32 write_hits;
+ __le32 write_misses;
+
+ __le32 policy_version[CACHE_POLICY_VERSION_SIZE];
+} __packed;
+
+struct dm_cache_metadata {
+ atomic_t ref_count;
+ struct list_head list;
+
+ struct block_device *bdev;
+ struct dm_block_manager *bm;
+ struct dm_space_map *metadata_sm;
+ struct dm_transaction_manager *tm;
+
+ struct dm_array_info info;
+ struct dm_array_info hint_info;
+ struct dm_disk_bitset discard_info;
+
+ struct rw_semaphore root_lock;
+ dm_block_t root;
+ dm_block_t hint_root;
+ dm_block_t discard_root;
+
+ sector_t discard_block_size;
+ dm_dblock_t discard_nr_blocks;
+
+ sector_t data_block_size;
+ dm_cblock_t cache_blocks;
+ bool changed:1;
+ bool clean_when_opened:1;
+
+ char policy_name[CACHE_POLICY_NAME_SIZE];
+ unsigned policy_version[CACHE_POLICY_VERSION_SIZE];
+ size_t policy_hint_size;
+ struct dm_cache_statistics stats;
+
+ /*
+ * Reading the space map root can fail, so we read it into this
+ * buffer before the superblock is locked and updated.
+ */
+ __u8 metadata_space_map_root[SPACE_MAP_ROOT_SIZE];
+};
+
+/*-------------------------------------------------------------------
+ * superblock validator
+ *-----------------------------------------------------------------*/
+
+#define SUPERBLOCK_CSUM_XOR 9031977
+
+static void sb_prepare_for_write(struct dm_block_validator *v,
+ struct dm_block *b,
+ size_t sb_block_size)
+{
+ struct cache_disk_superblock *disk_super = dm_block_data(b);
+
+ disk_super->blocknr = cpu_to_le64(dm_block_location(b));
+ disk_super->csum = cpu_to_le32(dm_bm_checksum(&disk_super->flags,
+ sb_block_size - sizeof(__le32),
+ SUPERBLOCK_CSUM_XOR));
+}
+
+static int check_metadata_version(struct cache_disk_superblock *disk_super)
+{
+ uint32_t metadata_version = le32_to_cpu(disk_super->version);
+ if (metadata_version < MIN_CACHE_VERSION || metadata_version > MAX_CACHE_VERSION) {
+ DMERR("Cache metadata version %u found, but only versions between %u and %u supported.",
+ metadata_version, MIN_CACHE_VERSION, MAX_CACHE_VERSION);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int sb_check(struct dm_block_validator *v,
+ struct dm_block *b,
+ size_t sb_block_size)
+{
+ struct cache_disk_superblock *disk_super = dm_block_data(b);
+ __le32 csum_le;
+
+ if (dm_block_location(b) != le64_to_cpu(disk_super->blocknr)) {
+ DMERR("sb_check failed: blocknr %llu: wanted %llu",
+ le64_to_cpu(disk_super->blocknr),
+ (unsigned long long)dm_block_location(b));
+ return -ENOTBLK;
+ }
+
+ if (le64_to_cpu(disk_super->magic) != CACHE_SUPERBLOCK_MAGIC) {
+ DMERR("sb_check failed: magic %llu: wanted %llu",
+ le64_to_cpu(disk_super->magic),
+ (unsigned long long)CACHE_SUPERBLOCK_MAGIC);
+ return -EILSEQ;
+ }
+
+ csum_le = cpu_to_le32(dm_bm_checksum(&disk_super->flags,
+ sb_block_size - sizeof(__le32),
+ SUPERBLOCK_CSUM_XOR));
+ if (csum_le != disk_super->csum) {
+ DMERR("sb_check failed: csum %u: wanted %u",
+ le32_to_cpu(csum_le), le32_to_cpu(disk_super->csum));
+ return -EILSEQ;
+ }
+
+ return check_metadata_version(disk_super);
+}
+
+static struct dm_block_validator sb_validator = {
+ .name = "superblock",
+ .prepare_for_write = sb_prepare_for_write,
+ .check = sb_check
+};
+
+/*----------------------------------------------------------------*/
+
+static int superblock_read_lock(struct dm_cache_metadata *cmd,
+ struct dm_block **sblock)
+{
+ return dm_bm_read_lock(cmd->bm, CACHE_SUPERBLOCK_LOCATION,
+ &sb_validator, sblock);
+}
+
+static int superblock_lock_zero(struct dm_cache_metadata *cmd,
+ struct dm_block **sblock)
+{
+ return dm_bm_write_lock_zero(cmd->bm, CACHE_SUPERBLOCK_LOCATION,
+ &sb_validator, sblock);
+}
+
+static int superblock_lock(struct dm_cache_metadata *cmd,
+ struct dm_block **sblock)
+{
+ return dm_bm_write_lock(cmd->bm, CACHE_SUPERBLOCK_LOCATION,
+ &sb_validator, sblock);
+}
+
+/*----------------------------------------------------------------*/
+
+static int __superblock_all_zeroes(struct dm_block_manager *bm, bool *result)
+{
+ int r;
+ unsigned i;
+ struct dm_block *b;
+ __le64 *data_le, zero = cpu_to_le64(0);
+ unsigned sb_block_size = dm_bm_block_size(bm) / sizeof(__le64);
+
+ /*
+ * We can't use a validator here - it may be all zeroes.
+ */
+ r = dm_bm_read_lock(bm, CACHE_SUPERBLOCK_LOCATION, NULL, &b);
+ if (r)
+ return r;
+
+ data_le = dm_block_data(b);
+ *result = true;
+ for (i = 0; i < sb_block_size; i++) {
+ if (data_le[i] != zero) {
+ *result = false;
+ break;
+ }
+ }
+
+ return dm_bm_unlock(b);
+}
+
+static void __setup_mapping_info(struct dm_cache_metadata *cmd)
+{
+ struct dm_btree_value_type vt;
+
+ vt.context = NULL;
+ vt.size = sizeof(__le64);
+ vt.inc = NULL;
+ vt.dec = NULL;
+ vt.equal = NULL;
+ dm_array_info_init(&cmd->info, cmd->tm, &vt);
+
+ if (cmd->policy_hint_size) {
+ vt.size = sizeof(__le32);
+ dm_array_info_init(&cmd->hint_info, cmd->tm, &vt);
+ }
+}
+
+static int __save_sm_root(struct dm_cache_metadata *cmd)
+{
+ int r;
+ size_t metadata_len;
+
+ r = dm_sm_root_size(cmd->metadata_sm, &metadata_len);
+ if (r < 0)
+ return r;
+
+ return dm_sm_copy_root(cmd->metadata_sm, &cmd->metadata_space_map_root,
+ metadata_len);
+}
+
+static void __copy_sm_root(struct dm_cache_metadata *cmd,
+ struct cache_disk_superblock *disk_super)
+{
+ memcpy(&disk_super->metadata_space_map_root,
+ &cmd->metadata_space_map_root,
+ sizeof(cmd->metadata_space_map_root));
+}
+
+static int __write_initial_superblock(struct dm_cache_metadata *cmd)
+{
+ int r;
+ struct dm_block *sblock;
+ struct cache_disk_superblock *disk_super;
+ sector_t bdev_size = i_size_read(cmd->bdev->bd_inode) >> SECTOR_SHIFT;
+
+ /* FIXME: see if we can lose the max sectors limit */
+ if (bdev_size > DM_CACHE_METADATA_MAX_SECTORS)
+ bdev_size = DM_CACHE_METADATA_MAX_SECTORS;
+
+ r = dm_tm_pre_commit(cmd->tm);
+ if (r < 0)
+ return r;
+
+ /*
+ * dm_sm_copy_root() can fail. So we need to do it before we start
+ * updating the superblock.
+ */
+ r = __save_sm_root(cmd);
+ if (r)
+ return r;
+
+ r = superblock_lock_zero(cmd, &sblock);
+ if (r)
+ return r;
+
+ disk_super = dm_block_data(sblock);
+ disk_super->flags = 0;
+ memset(disk_super->uuid, 0, sizeof(disk_super->uuid));
+ disk_super->magic = cpu_to_le64(CACHE_SUPERBLOCK_MAGIC);
+ disk_super->version = cpu_to_le32(MAX_CACHE_VERSION);
+ memset(disk_super->policy_name, 0, sizeof(disk_super->policy_name));
+ memset(disk_super->policy_version, 0, sizeof(disk_super->policy_version));
+ disk_super->policy_hint_size = 0;
+
+ __copy_sm_root(cmd, disk_super);
+
+ disk_super->mapping_root = cpu_to_le64(cmd->root);
+ disk_super->hint_root = cpu_to_le64(cmd->hint_root);
+ disk_super->discard_root = cpu_to_le64(cmd->discard_root);
+ disk_super->discard_block_size = cpu_to_le64(cmd->discard_block_size);
+ disk_super->discard_nr_blocks = cpu_to_le64(from_dblock(cmd->discard_nr_blocks));
+ disk_super->metadata_block_size = cpu_to_le32(DM_CACHE_METADATA_BLOCK_SIZE);
+ disk_super->data_block_size = cpu_to_le32(cmd->data_block_size);
+ disk_super->cache_blocks = cpu_to_le32(0);
+
+ disk_super->read_hits = cpu_to_le32(0);
+ disk_super->read_misses = cpu_to_le32(0);
+ disk_super->write_hits = cpu_to_le32(0);
+ disk_super->write_misses = cpu_to_le32(0);
+
+ return dm_tm_commit(cmd->tm, sblock);
+}
+
+static int __format_metadata(struct dm_cache_metadata *cmd)
+{
+ int r;
+
+ r = dm_tm_create_with_sm(cmd->bm, CACHE_SUPERBLOCK_LOCATION,
+ &cmd->tm, &cmd->metadata_sm);
+ if (r < 0) {
+ DMERR("tm_create_with_sm failed");
+ return r;
+ }
+
+ __setup_mapping_info(cmd);
+
+ r = dm_array_empty(&cmd->info, &cmd->root);
+ if (r < 0)
+ goto bad;
+
+ dm_disk_bitset_init(cmd->tm, &cmd->discard_info);
+
+ r = dm_bitset_empty(&cmd->discard_info, &cmd->discard_root);
+ if (r < 0)
+ goto bad;
+
+ cmd->discard_block_size = 0;
+ cmd->discard_nr_blocks = 0;
+
+ r = __write_initial_superblock(cmd);
+ if (r)
+ goto bad;
+
+ cmd->clean_when_opened = true;
+ return 0;
+
+bad:
+ dm_tm_destroy(cmd->tm);
+ dm_sm_destroy(cmd->metadata_sm);
+
+ return r;
+}
+
+static int __check_incompat_features(struct cache_disk_superblock *disk_super,
+ struct dm_cache_metadata *cmd)
+{
+ uint32_t features;
+
+ features = le32_to_cpu(disk_super->incompat_flags) & ~DM_CACHE_FEATURE_INCOMPAT_SUPP;
+ if (features) {
+ DMERR("could not access metadata due to unsupported optional features (%lx).",
+ (unsigned long)features);
+ return -EINVAL;
+ }
+
+ /*
+ * Check for read-only metadata to skip the following RDWR checks.
+ */
+ if (get_disk_ro(cmd->bdev->bd_disk))
+ return 0;
+
+ features = le32_to_cpu(disk_super->compat_ro_flags) & ~DM_CACHE_FEATURE_COMPAT_RO_SUPP;
+ if (features) {
+ DMERR("could not access metadata RDWR due to unsupported optional features (%lx).",
+ (unsigned long)features);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int __open_metadata(struct dm_cache_metadata *cmd)
+{
+ int r;
+ struct dm_block *sblock;
+ struct cache_disk_superblock *disk_super;
+ unsigned long sb_flags;
+
+ r = superblock_read_lock(cmd, &sblock);
+ if (r < 0) {
+ DMERR("couldn't read lock superblock");
+ return r;
+ }
+
+ disk_super = dm_block_data(sblock);
+
+ /* Verify the data block size hasn't changed */
+ if (le32_to_cpu(disk_super->data_block_size) != cmd->data_block_size) {
+ DMERR("changing the data block size (from %u to %llu) is not supported",
+ le32_to_cpu(disk_super->data_block_size),
+ (unsigned long long)cmd->data_block_size);
+ r = -EINVAL;
+ goto bad;
+ }
+
+ r = __check_incompat_features(disk_super, cmd);
+ if (r < 0)
+ goto bad;
+
+ r = dm_tm_open_with_sm(cmd->bm, CACHE_SUPERBLOCK_LOCATION,
+ disk_super->metadata_space_map_root,
+ sizeof(disk_super->metadata_space_map_root),
+ &cmd->tm, &cmd->metadata_sm);
+ if (r < 0) {
+ DMERR("tm_open_with_sm failed");
+ goto bad;
+ }
+
+ __setup_mapping_info(cmd);
+ dm_disk_bitset_init(cmd->tm, &cmd->discard_info);
+ sb_flags = le32_to_cpu(disk_super->flags);
+ cmd->clean_when_opened = test_bit(CLEAN_SHUTDOWN, &sb_flags);
+ return dm_bm_unlock(sblock);
+
+bad:
+ dm_bm_unlock(sblock);
+ return r;
+}
+
+static int __open_or_format_metadata(struct dm_cache_metadata *cmd,
+ bool format_device)
+{
+ int r;
+ bool unformatted = false;
+
+ r = __superblock_all_zeroes(cmd->bm, &unformatted);
+ if (r)
+ return r;
+
+ if (unformatted)
+ return format_device ? __format_metadata(cmd) : -EPERM;
+
+ return __open_metadata(cmd);
+}
+
+static int __create_persistent_data_objects(struct dm_cache_metadata *cmd,
+ bool may_format_device)
+{
+ int r;
+ cmd->bm = dm_block_manager_create(cmd->bdev, DM_CACHE_METADATA_BLOCK_SIZE << SECTOR_SHIFT,
+ CACHE_METADATA_CACHE_SIZE,
+ CACHE_MAX_CONCURRENT_LOCKS);
+ if (IS_ERR(cmd->bm)) {
+ DMERR("could not create block manager");
+ return PTR_ERR(cmd->bm);
+ }
+
+ r = __open_or_format_metadata(cmd, may_format_device);
+ if (r)
+ dm_block_manager_destroy(cmd->bm);
+
+ return r;
+}
+
+static void __destroy_persistent_data_objects(struct dm_cache_metadata *cmd)
+{
+ dm_sm_destroy(cmd->metadata_sm);
+ dm_tm_destroy(cmd->tm);
+ dm_block_manager_destroy(cmd->bm);
+}
+
+typedef unsigned long (*flags_mutator)(unsigned long);
+
+static void update_flags(struct cache_disk_superblock *disk_super,
+ flags_mutator mutator)
+{
+ uint32_t sb_flags = mutator(le32_to_cpu(disk_super->flags));
+ disk_super->flags = cpu_to_le32(sb_flags);
+}
+
+static unsigned long set_clean_shutdown(unsigned long flags)
+{
+ set_bit(CLEAN_SHUTDOWN, &flags);
+ return flags;
+}
+
+static unsigned long clear_clean_shutdown(unsigned long flags)
+{
+ clear_bit(CLEAN_SHUTDOWN, &flags);
+ return flags;
+}
+
+static void read_superblock_fields(struct dm_cache_metadata *cmd,
+ struct cache_disk_superblock *disk_super)
+{
+ cmd->root = le64_to_cpu(disk_super->mapping_root);
+ cmd->hint_root = le64_to_cpu(disk_super->hint_root);
+ cmd->discard_root = le64_to_cpu(disk_super->discard_root);
+ cmd->discard_block_size = le64_to_cpu(disk_super->discard_block_size);
+ cmd->discard_nr_blocks = to_dblock(le64_to_cpu(disk_super->discard_nr_blocks));
+ cmd->data_block_size = le32_to_cpu(disk_super->data_block_size);
+ cmd->cache_blocks = to_cblock(le32_to_cpu(disk_super->cache_blocks));
+ strncpy(cmd->policy_name, disk_super->policy_name, sizeof(cmd->policy_name));
+ cmd->policy_version[0] = le32_to_cpu(disk_super->policy_version[0]);
+ cmd->policy_version[1] = le32_to_cpu(disk_super->policy_version[1]);
+ cmd->policy_version[2] = le32_to_cpu(disk_super->policy_version[2]);
+ cmd->policy_hint_size = le32_to_cpu(disk_super->policy_hint_size);
+
+ cmd->stats.read_hits = le32_to_cpu(disk_super->read_hits);
+ cmd->stats.read_misses = le32_to_cpu(disk_super->read_misses);
+ cmd->stats.write_hits = le32_to_cpu(disk_super->write_hits);
+ cmd->stats.write_misses = le32_to_cpu(disk_super->write_misses);
+
+ cmd->changed = false;
+}
+
+/*
+ * The mutator updates the superblock flags.
+ */
+static int __begin_transaction_flags(struct dm_cache_metadata *cmd,
+ flags_mutator mutator)
+{
+ int r;
+ struct cache_disk_superblock *disk_super;
+ struct dm_block *sblock;
+
+ r = superblock_lock(cmd, &sblock);
+ if (r)
+ return r;
+
+ disk_super = dm_block_data(sblock);
+ update_flags(disk_super, mutator);
+ read_superblock_fields(cmd, disk_super);
+ dm_bm_unlock(sblock);
+
+ return dm_bm_flush(cmd->bm);
+}
+
+static int __begin_transaction(struct dm_cache_metadata *cmd)
+{
+ int r;
+ struct cache_disk_superblock *disk_super;
+ struct dm_block *sblock;
+
+ /*
+ * We re-read the superblock every time. Shouldn't need to do this
+ * really.
+ */
+ r = superblock_read_lock(cmd, &sblock);
+ if (r)
+ return r;
+
+ disk_super = dm_block_data(sblock);
+ read_superblock_fields(cmd, disk_super);
+ dm_bm_unlock(sblock);
+
+ return 0;
+}
+
+static int __commit_transaction(struct dm_cache_metadata *cmd,
+ flags_mutator mutator)
+{
+ int r;
+ struct cache_disk_superblock *disk_super;
+ struct dm_block *sblock;
+
+ /*
+ * We need to know if the cache_disk_superblock exceeds a 512-byte sector.
+ */
+ BUILD_BUG_ON(sizeof(struct cache_disk_superblock) > 512);
+
+ r = dm_bitset_flush(&cmd->discard_info, cmd->discard_root,
+ &cmd->discard_root);
+ if (r)
+ return r;
+
+ r = dm_tm_pre_commit(cmd->tm);
+ if (r < 0)
+ return r;
+
+ r = __save_sm_root(cmd);
+ if (r)
+ return r;
+
+ r = superblock_lock(cmd, &sblock);
+ if (r)
+ return r;
+
+ disk_super = dm_block_data(sblock);
+
+ if (mutator)
+ update_flags(disk_super, mutator);
+
+ disk_super->mapping_root = cpu_to_le64(cmd->root);
+ disk_super->hint_root = cpu_to_le64(cmd->hint_root);
+ disk_super->discard_root = cpu_to_le64(cmd->discard_root);
+ disk_super->discard_block_size = cpu_to_le64(cmd->discard_block_size);
+ disk_super->discard_nr_blocks = cpu_to_le64(from_dblock(cmd->discard_nr_blocks));
+ disk_super->cache_blocks = cpu_to_le32(from_cblock(cmd->cache_blocks));
+ strncpy(disk_super->policy_name, cmd->policy_name, sizeof(disk_super->policy_name));
+ disk_super->policy_version[0] = cpu_to_le32(cmd->policy_version[0]);
+ disk_super->policy_version[1] = cpu_to_le32(cmd->policy_version[1]);
+ disk_super->policy_version[2] = cpu_to_le32(cmd->policy_version[2]);
+
+ disk_super->read_hits = cpu_to_le32(cmd->stats.read_hits);
+ disk_super->read_misses = cpu_to_le32(cmd->stats.read_misses);
+ disk_super->write_hits = cpu_to_le32(cmd->stats.write_hits);
+ disk_super->write_misses = cpu_to_le32(cmd->stats.write_misses);
+ __copy_sm_root(cmd, disk_super);
+
+ return dm_tm_commit(cmd->tm, sblock);
+}
+
+/*----------------------------------------------------------------*/
+
+/*
+ * The mappings are held in a dm-array that has 64-bit values stored in
+ * little-endian format. The index is the cblock, the high 48bits of the
+ * value are the oblock and the low 16 bit the flags.
+ */
+#define FLAGS_MASK ((1 << 16) - 1)
+
+static __le64 pack_value(dm_oblock_t block, unsigned flags)
+{
+ uint64_t value = from_oblock(block);
+ value <<= 16;
+ value = value | (flags & FLAGS_MASK);
+ return cpu_to_le64(value);
+}
+
+static void unpack_value(__le64 value_le, dm_oblock_t *block, unsigned *flags)
+{
+ uint64_t value = le64_to_cpu(value_le);
+ uint64_t b = value >> 16;
+ *block = to_oblock(b);
+ *flags = value & FLAGS_MASK;
+}
+
+/*----------------------------------------------------------------*/
+
+static struct dm_cache_metadata *metadata_open(struct block_device *bdev,
+ sector_t data_block_size,
+ bool may_format_device,
+ size_t policy_hint_size)
+{
+ int r;
+ struct dm_cache_metadata *cmd;
+
+ cmd = kzalloc(sizeof(*cmd), GFP_KERNEL);
+ if (!cmd) {
+ DMERR("could not allocate metadata struct");
+ return ERR_PTR(-ENOMEM);
+ }
+
+ atomic_set(&cmd->ref_count, 1);
+ init_rwsem(&cmd->root_lock);
+ cmd->bdev = bdev;
+ cmd->data_block_size = data_block_size;
+ cmd->cache_blocks = 0;
+ cmd->policy_hint_size = policy_hint_size;
+ cmd->changed = true;
+
+ r = __create_persistent_data_objects(cmd, may_format_device);
+ if (r) {
+ kfree(cmd);
+ return ERR_PTR(r);
+ }
+
+ r = __begin_transaction_flags(cmd, clear_clean_shutdown);
+ if (r < 0) {
+ dm_cache_metadata_close(cmd);
+ return ERR_PTR(r);
+ }
+
+ return cmd;
+}
+
+/*
+ * We keep a little list of ref counted metadata objects to prevent two
+ * different target instances creating separate bufio instances. This is
+ * an issue if a table is reloaded before the suspend.
+ */
+static DEFINE_MUTEX(table_lock);
+static LIST_HEAD(table);
+
+static struct dm_cache_metadata *lookup(struct block_device *bdev)
+{
+ struct dm_cache_metadata *cmd;
+
+ list_for_each_entry(cmd, &table, list)
+ if (cmd->bdev == bdev) {
+ atomic_inc(&cmd->ref_count);
+ return cmd;
+ }
+
+ return NULL;
+}
+
+static struct dm_cache_metadata *lookup_or_open(struct block_device *bdev,
+ sector_t data_block_size,
+ bool may_format_device,
+ size_t policy_hint_size)
+{
+ struct dm_cache_metadata *cmd, *cmd2;
+
+ mutex_lock(&table_lock);
+ cmd = lookup(bdev);
+ mutex_unlock(&table_lock);
+
+ if (cmd)
+ return cmd;
+
+ cmd = metadata_open(bdev, data_block_size, may_format_device, policy_hint_size);
+ if (!IS_ERR(cmd)) {
+ mutex_lock(&table_lock);
+ cmd2 = lookup(bdev);
+ if (cmd2) {
+ mutex_unlock(&table_lock);
+ __destroy_persistent_data_objects(cmd);
+ kfree(cmd);
+ return cmd2;
+ }
+ list_add(&cmd->list, &table);
+ mutex_unlock(&table_lock);
+ }
+
+ return cmd;
+}
+
+static bool same_params(struct dm_cache_metadata *cmd, sector_t data_block_size)
+{
+ if (cmd->data_block_size != data_block_size) {
+ DMERR("data_block_size (%llu) different from that in metadata (%llu)\n",
+ (unsigned long long) data_block_size,
+ (unsigned long long) cmd->data_block_size);
+ return false;
+ }
+
+ return true;
+}
+
+struct dm_cache_metadata *dm_cache_metadata_open(struct block_device *bdev,
+ sector_t data_block_size,
+ bool may_format_device,
+ size_t policy_hint_size)
+{
+ struct dm_cache_metadata *cmd = lookup_or_open(bdev, data_block_size,
+ may_format_device, policy_hint_size);
+
+ if (!IS_ERR(cmd) && !same_params(cmd, data_block_size)) {
+ dm_cache_metadata_close(cmd);
+ return ERR_PTR(-EINVAL);
+ }
+
+ return cmd;
+}
+
+void dm_cache_metadata_close(struct dm_cache_metadata *cmd)
+{
+ if (atomic_dec_and_test(&cmd->ref_count)) {
+ mutex_lock(&table_lock);
+ list_del(&cmd->list);
+ mutex_unlock(&table_lock);
+
+ __destroy_persistent_data_objects(cmd);
+ kfree(cmd);
+ }
+}
+
+/*
+ * Checks that the given cache block is either unmapped or clean.
+ */
+static int block_unmapped_or_clean(struct dm_cache_metadata *cmd, dm_cblock_t b,
+ bool *result)
+{
+ int r;
+ __le64 value;
+ dm_oblock_t ob;
+ unsigned flags;
+
+ r = dm_array_get_value(&cmd->info, cmd->root, from_cblock(b), &value);
+ if (r) {
+ DMERR("block_unmapped_or_clean failed");
+ return r;
+ }
+
+ unpack_value(value, &ob, &flags);
+ *result = !((flags & M_VALID) && (flags & M_DIRTY));
+
+ return 0;
+}
+
+static int blocks_are_unmapped_or_clean(struct dm_cache_metadata *cmd,
+ dm_cblock_t begin, dm_cblock_t end,
+ bool *result)
+{
+ int r;
+ *result = true;
+
+ while (begin != end) {
+ r = block_unmapped_or_clean(cmd, begin, result);
+ if (r)
+ return r;
+
+ if (!*result) {
+ DMERR("cache block %llu is dirty",
+ (unsigned long long) from_cblock(begin));
+ return 0;
+ }
+
+ begin = to_cblock(from_cblock(begin) + 1);
+ }
+
+ return 0;
+}
+
+int dm_cache_resize(struct dm_cache_metadata *cmd, dm_cblock_t new_cache_size)
+{
+ int r;
+ bool clean;
+ __le64 null_mapping = pack_value(0, 0);
+
+ down_write(&cmd->root_lock);
+ __dm_bless_for_disk(&null_mapping);
+
+ if (from_cblock(new_cache_size) < from_cblock(cmd->cache_blocks)) {
+ r = blocks_are_unmapped_or_clean(cmd, new_cache_size, cmd->cache_blocks, &clean);
+ if (r) {
+ __dm_unbless_for_disk(&null_mapping);
+ goto out;
+ }
+
+ if (!clean) {
+ DMERR("unable to shrink cache due to dirty blocks");
+ r = -EINVAL;
+ __dm_unbless_for_disk(&null_mapping);
+ goto out;
+ }
+ }
+
+ r = dm_array_resize(&cmd->info, cmd->root, from_cblock(cmd->cache_blocks),
+ from_cblock(new_cache_size),
+ &null_mapping, &cmd->root);
+ if (!r)
+ cmd->cache_blocks = new_cache_size;
+ cmd->changed = true;
+
+out:
+ up_write(&cmd->root_lock);
+
+ return r;
+}
+
+int dm_cache_discard_bitset_resize(struct dm_cache_metadata *cmd,
+ sector_t discard_block_size,
+ dm_dblock_t new_nr_entries)
+{
+ int r;
+
+ down_write(&cmd->root_lock);
+ r = dm_bitset_resize(&cmd->discard_info,
+ cmd->discard_root,
+ from_dblock(cmd->discard_nr_blocks),
+ from_dblock(new_nr_entries),
+ false, &cmd->discard_root);
+ if (!r) {
+ cmd->discard_block_size = discard_block_size;
+ cmd->discard_nr_blocks = new_nr_entries;
+ }
+
+ cmd->changed = true;
+ up_write(&cmd->root_lock);
+
+ return r;
+}
+
+static int __set_discard(struct dm_cache_metadata *cmd, dm_dblock_t b)
+{
+ return dm_bitset_set_bit(&cmd->discard_info, cmd->discard_root,
+ from_dblock(b), &cmd->discard_root);
+}
+
+static int __clear_discard(struct dm_cache_metadata *cmd, dm_dblock_t b)
+{
+ return dm_bitset_clear_bit(&cmd->discard_info, cmd->discard_root,
+ from_dblock(b), &cmd->discard_root);
+}
+
+static int __is_discarded(struct dm_cache_metadata *cmd, dm_dblock_t b,
+ bool *is_discarded)
+{
+ return dm_bitset_test_bit(&cmd->discard_info, cmd->discard_root,
+ from_dblock(b), &cmd->discard_root,
+ is_discarded);
+}
+
+static int __discard(struct dm_cache_metadata *cmd,
+ dm_dblock_t dblock, bool discard)
+{
+ int r;
+
+ r = (discard ? __set_discard : __clear_discard)(cmd, dblock);
+ if (r)
+ return r;
+
+ cmd->changed = true;
+ return 0;
+}
+
+int dm_cache_set_discard(struct dm_cache_metadata *cmd,
+ dm_dblock_t dblock, bool discard)
+{
+ int r;
+
+ down_write(&cmd->root_lock);
+ r = __discard(cmd, dblock, discard);
+ up_write(&cmd->root_lock);
+
+ return r;
+}
+
+static int __load_discards(struct dm_cache_metadata *cmd,
+ load_discard_fn fn, void *context)
+{
+ int r = 0;
+ dm_block_t b;
+ bool discard;
+
+ for (b = 0; b < from_dblock(cmd->discard_nr_blocks); b++) {
+ dm_dblock_t dblock = to_dblock(b);
+
+ if (cmd->clean_when_opened) {
+ r = __is_discarded(cmd, dblock, &discard);
+ if (r)
+ return r;
+ } else
+ discard = false;
+
+ r = fn(context, cmd->discard_block_size, dblock, discard);
+ if (r)
+ break;
+ }
+
+ return r;
+}
+
+int dm_cache_load_discards(struct dm_cache_metadata *cmd,
+ load_discard_fn fn, void *context)
+{
+ int r;
+
+ down_read(&cmd->root_lock);
+ r = __load_discards(cmd, fn, context);
+ up_read(&cmd->root_lock);
+
+ return r;
+}
+
+dm_cblock_t dm_cache_size(struct dm_cache_metadata *cmd)
+{
+ dm_cblock_t r;
+
+ down_read(&cmd->root_lock);
+ r = cmd->cache_blocks;
+ up_read(&cmd->root_lock);
+
+ return r;
+}
+
+static int __remove(struct dm_cache_metadata *cmd, dm_cblock_t cblock)
+{
+ int r;
+ __le64 value = pack_value(0, 0);
+
+ __dm_bless_for_disk(&value);
+ r = dm_array_set_value(&cmd->info, cmd->root, from_cblock(cblock),
+ &value, &cmd->root);
+ if (r)
+ return r;
+
+ cmd->changed = true;
+ return 0;
+}
+
+int dm_cache_remove_mapping(struct dm_cache_metadata *cmd, dm_cblock_t cblock)
+{
+ int r;
+
+ down_write(&cmd->root_lock);
+ r = __remove(cmd, cblock);
+ up_write(&cmd->root_lock);
+
+ return r;
+}
+
+static int __insert(struct dm_cache_metadata *cmd,
+ dm_cblock_t cblock, dm_oblock_t oblock)
+{
+ int r;
+ __le64 value = pack_value(oblock, M_VALID);
+ __dm_bless_for_disk(&value);
+
+ r = dm_array_set_value(&cmd->info, cmd->root, from_cblock(cblock),
+ &value, &cmd->root);
+ if (r)
+ return r;
+
+ cmd->changed = true;
+ return 0;
+}
+
+int dm_cache_insert_mapping(struct dm_cache_metadata *cmd,
+ dm_cblock_t cblock, dm_oblock_t oblock)
+{
+ int r;
+
+ down_write(&cmd->root_lock);
+ r = __insert(cmd, cblock, oblock);
+ up_write(&cmd->root_lock);
+
+ return r;
+}
+
+struct thunk {
+ load_mapping_fn fn;
+ void *context;
+
+ struct dm_cache_metadata *cmd;
+ bool respect_dirty_flags;
+ bool hints_valid;
+};
+
+static bool policy_unchanged(struct dm_cache_metadata *cmd,
+ struct dm_cache_policy *policy)
+{
+ const char *policy_name = dm_cache_policy_get_name(policy);
+ const unsigned *policy_version = dm_cache_policy_get_version(policy);
+ size_t policy_hint_size = dm_cache_policy_get_hint_size(policy);
+
+ /*
+ * Ensure policy names match.
+ */
+ if (strncmp(cmd->policy_name, policy_name, sizeof(cmd->policy_name)))
+ return false;
+
+ /*
+ * Ensure policy major versions match.
+ */
+ if (cmd->policy_version[0] != policy_version[0])
+ return false;
+
+ /*
+ * Ensure policy hint sizes match.
+ */
+ if (cmd->policy_hint_size != policy_hint_size)
+ return false;
+
+ return true;
+}
+
+static bool hints_array_initialized(struct dm_cache_metadata *cmd)
+{
+ return cmd->hint_root && cmd->policy_hint_size;
+}
+
+static bool hints_array_available(struct dm_cache_metadata *cmd,
+ struct dm_cache_policy *policy)
+{
+ return cmd->clean_when_opened && policy_unchanged(cmd, policy) &&
+ hints_array_initialized(cmd);
+}
+
+static int __load_mapping(void *context, uint64_t cblock, void *leaf)
+{
+ int r = 0;
+ bool dirty;
+ __le64 value;
+ __le32 hint_value = 0;
+ dm_oblock_t oblock;
+ unsigned flags;
+ struct thunk *thunk = context;
+ struct dm_cache_metadata *cmd = thunk->cmd;
+
+ memcpy(&value, leaf, sizeof(value));
+ unpack_value(value, &oblock, &flags);
+
+ if (flags & M_VALID) {
+ if (thunk->hints_valid) {
+ r = dm_array_get_value(&cmd->hint_info, cmd->hint_root,
+ cblock, &hint_value);
+ if (r && r != -ENODATA)
+ return r;
+ }
+
+ dirty = thunk->respect_dirty_flags ? (flags & M_DIRTY) : true;
+ r = thunk->fn(thunk->context, oblock, to_cblock(cblock),
+ dirty, le32_to_cpu(hint_value), thunk->hints_valid);
+ }
+
+ return r;
+}
+
+static int __load_mappings(struct dm_cache_metadata *cmd,
+ struct dm_cache_policy *policy,
+ load_mapping_fn fn, void *context)
+{
+ struct thunk thunk;
+
+ thunk.fn = fn;
+ thunk.context = context;
+
+ thunk.cmd = cmd;
+ thunk.respect_dirty_flags = cmd->clean_when_opened;
+ thunk.hints_valid = hints_array_available(cmd, policy);
+
+ return dm_array_walk(&cmd->info, cmd->root, __load_mapping, &thunk);
+}
+
+int dm_cache_load_mappings(struct dm_cache_metadata *cmd,
+ struct dm_cache_policy *policy,
+ load_mapping_fn fn, void *context)
+{
+ int r;
+
+ down_read(&cmd->root_lock);
+ r = __load_mappings(cmd, policy, fn, context);
+ up_read(&cmd->root_lock);
+
+ return r;
+}
+
+static int __dump_mapping(void *context, uint64_t cblock, void *leaf)
+{
+ int r = 0;
+ __le64 value;
+ dm_oblock_t oblock;
+ unsigned flags;
+
+ memcpy(&value, leaf, sizeof(value));
+ unpack_value(value, &oblock, &flags);
+
+ return r;
+}
+
+static int __dump_mappings(struct dm_cache_metadata *cmd)
+{
+ return dm_array_walk(&cmd->info, cmd->root, __dump_mapping, NULL);
+}
+
+void dm_cache_dump(struct dm_cache_metadata *cmd)
+{
+ down_read(&cmd->root_lock);
+ __dump_mappings(cmd);
+ up_read(&cmd->root_lock);
+}
+
+int dm_cache_changed_this_transaction(struct dm_cache_metadata *cmd)
+{
+ int r;
+
+ down_read(&cmd->root_lock);
+ r = cmd->changed;
+ up_read(&cmd->root_lock);
+
+ return r;
+}
+
+static int __dirty(struct dm_cache_metadata *cmd, dm_cblock_t cblock, bool dirty)
+{
+ int r;
+ unsigned flags;
+ dm_oblock_t oblock;
+ __le64 value;
+
+ r = dm_array_get_value(&cmd->info, cmd->root, from_cblock(cblock), &value);
+ if (r)
+ return r;
+
+ unpack_value(value, &oblock, &flags);
+
+ if (((flags & M_DIRTY) && dirty) || (!(flags & M_DIRTY) && !dirty))
+ /* nothing to be done */
+ return 0;
+
+ value = pack_value(oblock, (flags & ~M_DIRTY) | (dirty ? M_DIRTY : 0));
+ __dm_bless_for_disk(&value);
+
+ r = dm_array_set_value(&cmd->info, cmd->root, from_cblock(cblock),
+ &value, &cmd->root);
+ if (r)
+ return r;
+
+ cmd->changed = true;
+ return 0;
+
+}
+
+int dm_cache_set_dirty(struct dm_cache_metadata *cmd,
+ dm_cblock_t cblock, bool dirty)
+{
+ int r;
+
+ down_write(&cmd->root_lock);
+ r = __dirty(cmd, cblock, dirty);
+ up_write(&cmd->root_lock);
+
+ return r;
+}
+
+void dm_cache_metadata_get_stats(struct dm_cache_metadata *cmd,
+ struct dm_cache_statistics *stats)
+{
+ down_read(&cmd->root_lock);
+ *stats = cmd->stats;
+ up_read(&cmd->root_lock);
+}
+
+void dm_cache_metadata_set_stats(struct dm_cache_metadata *cmd,
+ struct dm_cache_statistics *stats)
+{
+ down_write(&cmd->root_lock);
+ cmd->stats = *stats;
+ up_write(&cmd->root_lock);
+}
+
+int dm_cache_commit(struct dm_cache_metadata *cmd, bool clean_shutdown)
+{
+ int r;
+ flags_mutator mutator = (clean_shutdown ? set_clean_shutdown :
+ clear_clean_shutdown);
+
+ down_write(&cmd->root_lock);
+ r = __commit_transaction(cmd, mutator);
+ if (r)
+ goto out;
+
+ r = __begin_transaction(cmd);
+
+out:
+ up_write(&cmd->root_lock);
+ return r;
+}
+
+int dm_cache_get_free_metadata_block_count(struct dm_cache_metadata *cmd,
+ dm_block_t *result)
+{
+ int r = -EINVAL;
+
+ down_read(&cmd->root_lock);
+ r = dm_sm_get_nr_free(cmd->metadata_sm, result);
+ up_read(&cmd->root_lock);
+
+ return r;
+}
+
+int dm_cache_get_metadata_dev_size(struct dm_cache_metadata *cmd,
+ dm_block_t *result)
+{
+ int r = -EINVAL;
+
+ down_read(&cmd->root_lock);
+ r = dm_sm_get_nr_blocks(cmd->metadata_sm, result);
+ up_read(&cmd->root_lock);
+
+ return r;
+}
+
+/*----------------------------------------------------------------*/
+
+static int begin_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy *policy)
+{
+ int r;
+ __le32 value;
+ size_t hint_size;
+ const char *policy_name = dm_cache_policy_get_name(policy);
+ const unsigned *policy_version = dm_cache_policy_get_version(policy);
+
+ if (!policy_name[0] ||
+ (strlen(policy_name) > sizeof(cmd->policy_name) - 1))
+ return -EINVAL;
+
+ if (!policy_unchanged(cmd, policy)) {
+ strncpy(cmd->policy_name, policy_name, sizeof(cmd->policy_name));
+ memcpy(cmd->policy_version, policy_version, sizeof(cmd->policy_version));
+
+ hint_size = dm_cache_policy_get_hint_size(policy);
+ if (!hint_size)
+ return 0; /* short-circuit hints initialization */
+ cmd->policy_hint_size = hint_size;
+
+ if (cmd->hint_root) {
+ r = dm_array_del(&cmd->hint_info, cmd->hint_root);
+ if (r)
+ return r;
+ }
+
+ r = dm_array_empty(&cmd->hint_info, &cmd->hint_root);
+ if (r)
+ return r;
+
+ value = cpu_to_le32(0);
+ __dm_bless_for_disk(&value);
+ r = dm_array_resize(&cmd->hint_info, cmd->hint_root, 0,
+ from_cblock(cmd->cache_blocks),
+ &value, &cmd->hint_root);
+ if (r)
+ return r;
+ }
+
+ return 0;
+}
+
+static int save_hint(void *context, dm_cblock_t cblock, dm_oblock_t oblock, uint32_t hint)
+{
+ struct dm_cache_metadata *cmd = context;
+ __le32 value = cpu_to_le32(hint);
+ int r;
+
+ __dm_bless_for_disk(&value);
+
+ r = dm_array_set_value(&cmd->hint_info, cmd->hint_root,
+ from_cblock(cblock), &value, &cmd->hint_root);
+ cmd->changed = true;
+
+ return r;
+}
+
+static int write_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy *policy)
+{
+ int r;
+
+ r = begin_hints(cmd, policy);
+ if (r) {
+ DMERR("begin_hints failed");
+ return r;
+ }
+
+ return policy_walk_mappings(policy, save_hint, cmd);
+}
+
+int dm_cache_write_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy *policy)
+{
+ int r;
+
+ down_write(&cmd->root_lock);
+ r = write_hints(cmd, policy);
+ up_write(&cmd->root_lock);
+
+ return r;
+}
+
+int dm_cache_metadata_all_clean(struct dm_cache_metadata *cmd, bool *result)
+{
+ return blocks_are_unmapped_or_clean(cmd, 0, cmd->cache_blocks, result);
+}
diff --git a/drivers/md/dm-cache-metadata.h b/drivers/md/dm-cache-metadata.h
new file mode 100644
index 000000000..4ecc403be
--- /dev/null
+++ b/drivers/md/dm-cache-metadata.h
@@ -0,0 +1,138 @@
+/*
+ * Copyright (C) 2012 Red Hat, Inc.
+ *
+ * This file is released under the GPL.
+ */
+
+#ifndef DM_CACHE_METADATA_H
+#define DM_CACHE_METADATA_H
+
+#include "dm-cache-block-types.h"
+#include "dm-cache-policy-internal.h"
+#include "persistent-data/dm-space-map-metadata.h"
+
+/*----------------------------------------------------------------*/
+
+#define DM_CACHE_METADATA_BLOCK_SIZE DM_SM_METADATA_BLOCK_SIZE
+
+/* FIXME: remove this restriction */
+/*
+ * The metadata device is currently limited in size.
+ */
+#define DM_CACHE_METADATA_MAX_SECTORS DM_SM_METADATA_MAX_SECTORS
+
+/*
+ * A metadata device larger than 16GB triggers a warning.
+ */
+#define DM_CACHE_METADATA_MAX_SECTORS_WARNING (16 * (1024 * 1024 * 1024 >> SECTOR_SHIFT))
+
+/*----------------------------------------------------------------*/
+
+/*
+ * Ext[234]-style compat feature flags.
+ *
+ * A new feature which old metadata will still be compatible with should
+ * define a DM_CACHE_FEATURE_COMPAT_* flag (rarely useful).
+ *
+ * A new feature that is not compatible with old code should define a
+ * DM_CACHE_FEATURE_INCOMPAT_* flag and guard the relevant code with
+ * that flag.
+ *
+ * A new feature that is not compatible with old code accessing the
+ * metadata RDWR should define a DM_CACHE_FEATURE_RO_COMPAT_* flag and
+ * guard the relevant code with that flag.
+ *
+ * As these various flags are defined they should be added to the
+ * following masks.
+ */
+#define DM_CACHE_FEATURE_COMPAT_SUPP 0UL
+#define DM_CACHE_FEATURE_COMPAT_RO_SUPP 0UL
+#define DM_CACHE_FEATURE_INCOMPAT_SUPP 0UL
+
+/*
+ * Reopens or creates a new, empty metadata volume.
+ * Returns an ERR_PTR on failure.
+ */
+struct dm_cache_metadata *dm_cache_metadata_open(struct block_device *bdev,
+ sector_t data_block_size,
+ bool may_format_device,
+ size_t policy_hint_size);
+
+void dm_cache_metadata_close(struct dm_cache_metadata *cmd);
+
+/*
+ * The metadata needs to know how many cache blocks there are. We don't
+ * care about the origin, assuming the core target is giving us valid
+ * origin blocks to map to.
+ */
+int dm_cache_resize(struct dm_cache_metadata *cmd, dm_cblock_t new_cache_size);
+dm_cblock_t dm_cache_size(struct dm_cache_metadata *cmd);
+
+int dm_cache_discard_bitset_resize(struct dm_cache_metadata *cmd,
+ sector_t discard_block_size,
+ dm_dblock_t new_nr_entries);
+
+typedef int (*load_discard_fn)(void *context, sector_t discard_block_size,
+ dm_dblock_t dblock, bool discarded);
+int dm_cache_load_discards(struct dm_cache_metadata *cmd,
+ load_discard_fn fn, void *context);
+
+int dm_cache_set_discard(struct dm_cache_metadata *cmd, dm_dblock_t dblock, bool discard);
+
+int dm_cache_remove_mapping(struct dm_cache_metadata *cmd, dm_cblock_t cblock);
+int dm_cache_insert_mapping(struct dm_cache_metadata *cmd, dm_cblock_t cblock, dm_oblock_t oblock);
+int dm_cache_changed_this_transaction(struct dm_cache_metadata *cmd);
+
+typedef int (*load_mapping_fn)(void *context, dm_oblock_t oblock,
+ dm_cblock_t cblock, bool dirty,
+ uint32_t hint, bool hint_valid);
+int dm_cache_load_mappings(struct dm_cache_metadata *cmd,
+ struct dm_cache_policy *policy,
+ load_mapping_fn fn,
+ void *context);
+
+int dm_cache_set_dirty(struct dm_cache_metadata *cmd, dm_cblock_t cblock, bool dirty);
+
+struct dm_cache_statistics {
+ uint32_t read_hits;
+ uint32_t read_misses;
+ uint32_t write_hits;
+ uint32_t write_misses;
+};
+
+void dm_cache_metadata_get_stats(struct dm_cache_metadata *cmd,
+ struct dm_cache_statistics *stats);
+void dm_cache_metadata_set_stats(struct dm_cache_metadata *cmd,
+ struct dm_cache_statistics *stats);
+
+int dm_cache_commit(struct dm_cache_metadata *cmd, bool clean_shutdown);
+
+int dm_cache_get_free_metadata_block_count(struct dm_cache_metadata *cmd,
+ dm_block_t *result);
+
+int dm_cache_get_metadata_dev_size(struct dm_cache_metadata *cmd,
+ dm_block_t *result);
+
+void dm_cache_dump(struct dm_cache_metadata *cmd);
+
+/*
+ * The policy is invited to save a 32bit hint value for every cblock (eg,
+ * for a hit count). These are stored against the policy name. If
+ * policies are changed, then hints will be lost. If the machine crashes,
+ * hints will be lost.
+ *
+ * The hints are indexed by the cblock, but many policies will not
+ * neccessarily have a fast way of accessing efficiently via cblock. So
+ * rather than querying the policy for each cblock, we let it walk its data
+ * structures and fill in the hints in whatever order it wishes.
+ */
+int dm_cache_write_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy *p);
+
+/*
+ * Query method. Are all the blocks in the cache clean?
+ */
+int dm_cache_metadata_all_clean(struct dm_cache_metadata *cmd, bool *result);
+
+/*----------------------------------------------------------------*/
+
+#endif /* DM_CACHE_METADATA_H */
diff --git a/drivers/md/dm-cache-policy-cleaner.c b/drivers/md/dm-cache-policy-cleaner.c
new file mode 100644
index 000000000..004e463c9
--- /dev/null
+++ b/drivers/md/dm-cache-policy-cleaner.c
@@ -0,0 +1,468 @@
+/*
+ * Copyright (C) 2012 Red Hat. All rights reserved.
+ *
+ * writeback cache policy supporting flushing out dirty cache blocks.
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm-cache-policy.h"
+#include "dm.h"
+
+#include <linux/hash.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+
+/*----------------------------------------------------------------*/
+
+#define DM_MSG_PREFIX "cache cleaner"
+
+/* Cache entry struct. */
+struct wb_cache_entry {
+ struct list_head list;
+ struct hlist_node hlist;
+
+ dm_oblock_t oblock;
+ dm_cblock_t cblock;
+ bool dirty:1;
+ bool pending:1;
+};
+
+struct hash {
+ struct hlist_head *table;
+ dm_block_t hash_bits;
+ unsigned nr_buckets;
+};
+
+struct policy {
+ struct dm_cache_policy policy;
+ spinlock_t lock;
+
+ struct list_head free;
+ struct list_head clean;
+ struct list_head clean_pending;
+ struct list_head dirty;
+
+ /*
+ * We know exactly how many cblocks will be needed,
+ * so we can allocate them up front.
+ */
+ dm_cblock_t cache_size, nr_cblocks_allocated;
+ struct wb_cache_entry *cblocks;
+ struct hash chash;
+};
+
+/*----------------------------------------------------------------------------*/
+
+/*
+ * Low-level functions.
+ */
+static unsigned next_power(unsigned n, unsigned min)
+{
+ return roundup_pow_of_two(max(n, min));
+}
+
+static struct policy *to_policy(struct dm_cache_policy *p)
+{
+ return container_of(p, struct policy, policy);
+}
+
+static struct list_head *list_pop(struct list_head *q)
+{
+ struct list_head *r = q->next;
+
+ list_del(r);
+
+ return r;
+}
+
+/*----------------------------------------------------------------------------*/
+
+/* Allocate/free various resources. */
+static int alloc_hash(struct hash *hash, unsigned elts)
+{
+ hash->nr_buckets = next_power(elts >> 4, 16);
+ hash->hash_bits = ffs(hash->nr_buckets) - 1;
+ hash->table = vzalloc(sizeof(*hash->table) * hash->nr_buckets);
+
+ return hash->table ? 0 : -ENOMEM;
+}
+
+static void free_hash(struct hash *hash)
+{
+ vfree(hash->table);
+}
+
+static int alloc_cache_blocks_with_hash(struct policy *p, dm_cblock_t cache_size)
+{
+ int r = -ENOMEM;
+
+ p->cblocks = vzalloc(sizeof(*p->cblocks) * from_cblock(cache_size));
+ if (p->cblocks) {
+ unsigned u = from_cblock(cache_size);
+
+ while (u--)
+ list_add(&p->cblocks[u].list, &p->free);
+
+ p->nr_cblocks_allocated = 0;
+
+ /* Cache entries hash. */
+ r = alloc_hash(&p->chash, from_cblock(cache_size));
+ if (r)
+ vfree(p->cblocks);
+ }
+
+ return r;
+}
+
+static void free_cache_blocks_and_hash(struct policy *p)
+{
+ free_hash(&p->chash);
+ vfree(p->cblocks);
+}
+
+static struct wb_cache_entry *alloc_cache_entry(struct policy *p)
+{
+ struct wb_cache_entry *e;
+
+ BUG_ON(from_cblock(p->nr_cblocks_allocated) >= from_cblock(p->cache_size));
+
+ e = list_entry(list_pop(&p->free), struct wb_cache_entry, list);
+ p->nr_cblocks_allocated = to_cblock(from_cblock(p->nr_cblocks_allocated) + 1);
+
+ return e;
+}
+
+/*----------------------------------------------------------------------------*/
+
+/* Hash functions (lookup, insert, remove). */
+static struct wb_cache_entry *lookup_cache_entry(struct policy *p, dm_oblock_t oblock)
+{
+ struct hash *hash = &p->chash;
+ unsigned h = hash_64(from_oblock(oblock), hash->hash_bits);
+ struct wb_cache_entry *cur;
+ struct hlist_head *bucket = &hash->table[h];
+
+ hlist_for_each_entry(cur, bucket, hlist) {
+ if (cur->oblock == oblock) {
+ /* Move upfront bucket for faster access. */
+ hlist_del(&cur->hlist);
+ hlist_add_head(&cur->hlist, bucket);
+ return cur;
+ }
+ }
+
+ return NULL;
+}
+
+static void insert_cache_hash_entry(struct policy *p, struct wb_cache_entry *e)
+{
+ unsigned h = hash_64(from_oblock(e->oblock), p->chash.hash_bits);
+
+ hlist_add_head(&e->hlist, &p->chash.table[h]);
+}
+
+static void remove_cache_hash_entry(struct wb_cache_entry *e)
+{
+ hlist_del(&e->hlist);
+}
+
+/* Public interface (see dm-cache-policy.h */
+static int wb_map(struct dm_cache_policy *pe, dm_oblock_t oblock,
+ bool can_block, bool can_migrate, bool discarded_oblock,
+ struct bio *bio, struct policy_locker *locker,
+ struct policy_result *result)
+{
+ struct policy *p = to_policy(pe);
+ struct wb_cache_entry *e;
+ unsigned long flags;
+
+ result->op = POLICY_MISS;
+
+ if (can_block)
+ spin_lock_irqsave(&p->lock, flags);
+
+ else if (!spin_trylock_irqsave(&p->lock, flags))
+ return -EWOULDBLOCK;
+
+ e = lookup_cache_entry(p, oblock);
+ if (e) {
+ result->op = POLICY_HIT;
+ result->cblock = e->cblock;
+
+ }
+
+ spin_unlock_irqrestore(&p->lock, flags);
+
+ return 0;
+}
+
+static int wb_lookup(struct dm_cache_policy *pe, dm_oblock_t oblock, dm_cblock_t *cblock)
+{
+ int r;
+ struct policy *p = to_policy(pe);
+ struct wb_cache_entry *e;
+ unsigned long flags;
+
+ if (!spin_trylock_irqsave(&p->lock, flags))
+ return -EWOULDBLOCK;
+
+ e = lookup_cache_entry(p, oblock);
+ if (e) {
+ *cblock = e->cblock;
+ r = 0;
+
+ } else
+ r = -ENOENT;
+
+ spin_unlock_irqrestore(&p->lock, flags);
+
+ return r;
+}
+
+static void __set_clear_dirty(struct dm_cache_policy *pe, dm_oblock_t oblock, bool set)
+{
+ struct policy *p = to_policy(pe);
+ struct wb_cache_entry *e;
+
+ e = lookup_cache_entry(p, oblock);
+ BUG_ON(!e);
+
+ if (set) {
+ if (!e->dirty) {
+ e->dirty = true;
+ list_move(&e->list, &p->dirty);
+ }
+
+ } else {
+ if (e->dirty) {
+ e->pending = false;
+ e->dirty = false;
+ list_move(&e->list, &p->clean);
+ }
+ }
+}
+
+static void wb_set_dirty(struct dm_cache_policy *pe, dm_oblock_t oblock)
+{
+ struct policy *p = to_policy(pe);
+ unsigned long flags;
+
+ spin_lock_irqsave(&p->lock, flags);
+ __set_clear_dirty(pe, oblock, true);
+ spin_unlock_irqrestore(&p->lock, flags);
+}
+
+static void wb_clear_dirty(struct dm_cache_policy *pe, dm_oblock_t oblock)
+{
+ struct policy *p = to_policy(pe);
+ unsigned long flags;
+
+ spin_lock_irqsave(&p->lock, flags);
+ __set_clear_dirty(pe, oblock, false);
+ spin_unlock_irqrestore(&p->lock, flags);
+}
+
+static void add_cache_entry(struct policy *p, struct wb_cache_entry *e)
+{
+ insert_cache_hash_entry(p, e);
+ if (e->dirty)
+ list_add(&e->list, &p->dirty);
+ else
+ list_add(&e->list, &p->clean);
+}
+
+static int wb_load_mapping(struct dm_cache_policy *pe,
+ dm_oblock_t oblock, dm_cblock_t cblock,
+ uint32_t hint, bool hint_valid)
+{
+ int r;
+ struct policy *p = to_policy(pe);
+ struct wb_cache_entry *e = alloc_cache_entry(p);
+
+ if (e) {
+ e->cblock = cblock;
+ e->oblock = oblock;
+ e->dirty = false; /* blocks default to clean */
+ add_cache_entry(p, e);
+ r = 0;
+
+ } else
+ r = -ENOMEM;
+
+ return r;
+}
+
+static void wb_destroy(struct dm_cache_policy *pe)
+{
+ struct policy *p = to_policy(pe);
+
+ free_cache_blocks_and_hash(p);
+ kfree(p);
+}
+
+static struct wb_cache_entry *__wb_force_remove_mapping(struct policy *p, dm_oblock_t oblock)
+{
+ struct wb_cache_entry *r = lookup_cache_entry(p, oblock);
+
+ BUG_ON(!r);
+
+ remove_cache_hash_entry(r);
+ list_del(&r->list);
+
+ return r;
+}
+
+static void wb_remove_mapping(struct dm_cache_policy *pe, dm_oblock_t oblock)
+{
+ struct policy *p = to_policy(pe);
+ struct wb_cache_entry *e;
+ unsigned long flags;
+
+ spin_lock_irqsave(&p->lock, flags);
+ e = __wb_force_remove_mapping(p, oblock);
+ list_add_tail(&e->list, &p->free);
+ BUG_ON(!from_cblock(p->nr_cblocks_allocated));
+ p->nr_cblocks_allocated = to_cblock(from_cblock(p->nr_cblocks_allocated) - 1);
+ spin_unlock_irqrestore(&p->lock, flags);
+}
+
+static void wb_force_mapping(struct dm_cache_policy *pe,
+ dm_oblock_t current_oblock, dm_oblock_t oblock)
+{
+ struct policy *p = to_policy(pe);
+ struct wb_cache_entry *e;
+ unsigned long flags;
+
+ spin_lock_irqsave(&p->lock, flags);
+ e = __wb_force_remove_mapping(p, current_oblock);
+ e->oblock = oblock;
+ add_cache_entry(p, e);
+ spin_unlock_irqrestore(&p->lock, flags);
+}
+
+static struct wb_cache_entry *get_next_dirty_entry(struct policy *p)
+{
+ struct list_head *l;
+ struct wb_cache_entry *r;
+
+ if (list_empty(&p->dirty))
+ return NULL;
+
+ l = list_pop(&p->dirty);
+ r = container_of(l, struct wb_cache_entry, list);
+ list_add(l, &p->clean_pending);
+
+ return r;
+}
+
+static int wb_writeback_work(struct dm_cache_policy *pe,
+ dm_oblock_t *oblock,
+ dm_cblock_t *cblock)
+{
+ int r = -ENOENT;
+ struct policy *p = to_policy(pe);
+ struct wb_cache_entry *e;
+ unsigned long flags;
+
+ spin_lock_irqsave(&p->lock, flags);
+
+ e = get_next_dirty_entry(p);
+ if (e) {
+ *oblock = e->oblock;
+ *cblock = e->cblock;
+ r = 0;
+ }
+
+ spin_unlock_irqrestore(&p->lock, flags);
+
+ return r;
+}
+
+static dm_cblock_t wb_residency(struct dm_cache_policy *pe)
+{
+ return to_policy(pe)->nr_cblocks_allocated;
+}
+
+/* Init the policy plugin interface function pointers. */
+static void init_policy_functions(struct policy *p)
+{
+ p->policy.destroy = wb_destroy;
+ p->policy.map = wb_map;
+ p->policy.lookup = wb_lookup;
+ p->policy.set_dirty = wb_set_dirty;
+ p->policy.clear_dirty = wb_clear_dirty;
+ p->policy.load_mapping = wb_load_mapping;
+ p->policy.walk_mappings = NULL;
+ p->policy.remove_mapping = wb_remove_mapping;
+ p->policy.writeback_work = wb_writeback_work;
+ p->policy.force_mapping = wb_force_mapping;
+ p->policy.residency = wb_residency;
+ p->policy.tick = NULL;
+}
+
+static struct dm_cache_policy *wb_create(dm_cblock_t cache_size,
+ sector_t origin_size,
+ sector_t cache_block_size)
+{
+ int r;
+ struct policy *p = kzalloc(sizeof(*p), GFP_KERNEL);
+
+ if (!p)
+ return NULL;
+
+ init_policy_functions(p);
+ INIT_LIST_HEAD(&p->free);
+ INIT_LIST_HEAD(&p->clean);
+ INIT_LIST_HEAD(&p->clean_pending);
+ INIT_LIST_HEAD(&p->dirty);
+
+ p->cache_size = cache_size;
+ spin_lock_init(&p->lock);
+
+ /* Allocate cache entry structs and add them to free list. */
+ r = alloc_cache_blocks_with_hash(p, cache_size);
+ if (!r)
+ return &p->policy;
+
+ kfree(p);
+
+ return NULL;
+}
+/*----------------------------------------------------------------------------*/
+
+static struct dm_cache_policy_type wb_policy_type = {
+ .name = "cleaner",
+ .version = {1, 0, 0},
+ .hint_size = 0,
+ .owner = THIS_MODULE,
+ .create = wb_create
+};
+
+static int __init wb_init(void)
+{
+ int r = dm_cache_policy_register(&wb_policy_type);
+
+ if (r < 0)
+ DMERR("register failed %d", r);
+ else
+ DMINFO("version %u.%u.%u loaded",
+ wb_policy_type.version[0],
+ wb_policy_type.version[1],
+ wb_policy_type.version[2]);
+
+ return r;
+}
+
+static void __exit wb_exit(void)
+{
+ dm_cache_policy_unregister(&wb_policy_type);
+}
+
+module_init(wb_init);
+module_exit(wb_exit);
+
+MODULE_AUTHOR("Heinz Mauelshagen <dm-devel@redhat.com>");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("cleaner cache policy");
diff --git a/drivers/md/dm-cache-policy-internal.h b/drivers/md/dm-cache-policy-internal.h
new file mode 100644
index 000000000..c198e6def
--- /dev/null
+++ b/drivers/md/dm-cache-policy-internal.h
@@ -0,0 +1,132 @@
+/*
+ * Copyright (C) 2012 Red Hat. All rights reserved.
+ *
+ * This file is released under the GPL.
+ */
+
+#ifndef DM_CACHE_POLICY_INTERNAL_H
+#define DM_CACHE_POLICY_INTERNAL_H
+
+#include "dm-cache-policy.h"
+
+/*----------------------------------------------------------------*/
+
+/*
+ * Little inline functions that simplify calling the policy methods.
+ */
+static inline int policy_map(struct dm_cache_policy *p, dm_oblock_t oblock,
+ bool can_block, bool can_migrate, bool discarded_oblock,
+ struct bio *bio, struct policy_locker *locker,
+ struct policy_result *result)
+{
+ return p->map(p, oblock, can_block, can_migrate, discarded_oblock, bio, locker, result);
+}
+
+static inline int policy_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock)
+{
+ BUG_ON(!p->lookup);
+ return p->lookup(p, oblock, cblock);
+}
+
+static inline void policy_set_dirty(struct dm_cache_policy *p, dm_oblock_t oblock)
+{
+ if (p->set_dirty)
+ p->set_dirty(p, oblock);
+}
+
+static inline void policy_clear_dirty(struct dm_cache_policy *p, dm_oblock_t oblock)
+{
+ if (p->clear_dirty)
+ p->clear_dirty(p, oblock);
+}
+
+static inline int policy_load_mapping(struct dm_cache_policy *p,
+ dm_oblock_t oblock, dm_cblock_t cblock,
+ uint32_t hint, bool hint_valid)
+{
+ return p->load_mapping(p, oblock, cblock, hint, hint_valid);
+}
+
+static inline int policy_walk_mappings(struct dm_cache_policy *p,
+ policy_walk_fn fn, void *context)
+{
+ return p->walk_mappings ? p->walk_mappings(p, fn, context) : 0;
+}
+
+static inline int policy_writeback_work(struct dm_cache_policy *p,
+ dm_oblock_t *oblock,
+ dm_cblock_t *cblock)
+{
+ return p->writeback_work ? p->writeback_work(p, oblock, cblock) : -ENOENT;
+}
+
+static inline void policy_remove_mapping(struct dm_cache_policy *p, dm_oblock_t oblock)
+{
+ p->remove_mapping(p, oblock);
+}
+
+static inline int policy_remove_cblock(struct dm_cache_policy *p, dm_cblock_t cblock)
+{
+ return p->remove_cblock(p, cblock);
+}
+
+static inline void policy_force_mapping(struct dm_cache_policy *p,
+ dm_oblock_t current_oblock, dm_oblock_t new_oblock)
+{
+ return p->force_mapping(p, current_oblock, new_oblock);
+}
+
+static inline dm_cblock_t policy_residency(struct dm_cache_policy *p)
+{
+ return p->residency(p);
+}
+
+static inline void policy_tick(struct dm_cache_policy *p)
+{
+ if (p->tick)
+ return p->tick(p);
+}
+
+static inline int policy_emit_config_values(struct dm_cache_policy *p, char *result, unsigned maxlen)
+{
+ ssize_t sz = 0;
+ if (p->emit_config_values)
+ return p->emit_config_values(p, result, maxlen);
+
+ DMEMIT("0");
+ return 0;
+}
+
+static inline int policy_set_config_value(struct dm_cache_policy *p,
+ const char *key, const char *value)
+{
+ return p->set_config_value ? p->set_config_value(p, key, value) : -EINVAL;
+}
+
+/*----------------------------------------------------------------*/
+
+/*
+ * Creates a new cache policy given a policy name, a cache size, an origin size and the block size.
+ */
+struct dm_cache_policy *dm_cache_policy_create(const char *name, dm_cblock_t cache_size,
+ sector_t origin_size, sector_t block_size);
+
+/*
+ * Destroys the policy. This drops references to the policy module as well
+ * as calling it's destroy method. So always use this rather than calling
+ * the policy->destroy method directly.
+ */
+void dm_cache_policy_destroy(struct dm_cache_policy *p);
+
+/*
+ * In case we've forgotten.
+ */
+const char *dm_cache_policy_get_name(struct dm_cache_policy *p);
+
+const unsigned *dm_cache_policy_get_version(struct dm_cache_policy *p);
+
+size_t dm_cache_policy_get_hint_size(struct dm_cache_policy *p);
+
+/*----------------------------------------------------------------*/
+
+#endif /* DM_CACHE_POLICY_INTERNAL_H */
diff --git a/drivers/md/dm-cache-policy-mq.c b/drivers/md/dm-cache-policy-mq.c
new file mode 100644
index 000000000..515d44bf2
--- /dev/null
+++ b/drivers/md/dm-cache-policy-mq.c
@@ -0,0 +1,1491 @@
+/*
+ * Copyright (C) 2012 Red Hat. All rights reserved.
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm-cache-policy.h"
+#include "dm.h"
+
+#include <linux/hash.h>
+#include <linux/jiffies.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+
+#define DM_MSG_PREFIX "cache-policy-mq"
+
+static struct kmem_cache *mq_entry_cache;
+
+/*----------------------------------------------------------------*/
+
+static unsigned next_power(unsigned n, unsigned min)
+{
+ return roundup_pow_of_two(max(n, min));
+}
+
+/*----------------------------------------------------------------*/
+
+/*
+ * Large, sequential ios are probably better left on the origin device since
+ * spindles tend to have good bandwidth.
+ *
+ * The io_tracker tries to spot when the io is in one of these sequential
+ * modes.
+ *
+ * Two thresholds to switch between random and sequential io mode are defaulting
+ * as follows and can be adjusted via the constructor and message interfaces.
+ */
+#define RANDOM_THRESHOLD_DEFAULT 4
+#define SEQUENTIAL_THRESHOLD_DEFAULT 512
+
+enum io_pattern {
+ PATTERN_SEQUENTIAL,
+ PATTERN_RANDOM
+};
+
+struct io_tracker {
+ enum io_pattern pattern;
+
+ unsigned nr_seq_samples;
+ unsigned nr_rand_samples;
+ unsigned thresholds[2];
+
+ dm_oblock_t last_end_oblock;
+};
+
+static void iot_init(struct io_tracker *t,
+ int sequential_threshold, int random_threshold)
+{
+ t->pattern = PATTERN_RANDOM;
+ t->nr_seq_samples = 0;
+ t->nr_rand_samples = 0;
+ t->last_end_oblock = 0;
+ t->thresholds[PATTERN_RANDOM] = random_threshold;
+ t->thresholds[PATTERN_SEQUENTIAL] = sequential_threshold;
+}
+
+static enum io_pattern iot_pattern(struct io_tracker *t)
+{
+ return t->pattern;
+}
+
+static void iot_update_stats(struct io_tracker *t, struct bio *bio)
+{
+ if (bio->bi_iter.bi_sector == from_oblock(t->last_end_oblock) + 1)
+ t->nr_seq_samples++;
+ else {
+ /*
+ * Just one non-sequential IO is enough to reset the
+ * counters.
+ */
+ if (t->nr_seq_samples) {
+ t->nr_seq_samples = 0;
+ t->nr_rand_samples = 0;
+ }
+
+ t->nr_rand_samples++;
+ }
+
+ t->last_end_oblock = to_oblock(bio_end_sector(bio) - 1);
+}
+
+static void iot_check_for_pattern_switch(struct io_tracker *t)
+{
+ switch (t->pattern) {
+ case PATTERN_SEQUENTIAL:
+ if (t->nr_rand_samples >= t->thresholds[PATTERN_RANDOM]) {
+ t->pattern = PATTERN_RANDOM;
+ t->nr_seq_samples = t->nr_rand_samples = 0;
+ }
+ break;
+
+ case PATTERN_RANDOM:
+ if (t->nr_seq_samples >= t->thresholds[PATTERN_SEQUENTIAL]) {
+ t->pattern = PATTERN_SEQUENTIAL;
+ t->nr_seq_samples = t->nr_rand_samples = 0;
+ }
+ break;
+ }
+}
+
+static void iot_examine_bio(struct io_tracker *t, struct bio *bio)
+{
+ iot_update_stats(t, bio);
+ iot_check_for_pattern_switch(t);
+}
+
+/*----------------------------------------------------------------*/
+
+
+/*
+ * This queue is divided up into different levels. Allowing us to push
+ * entries to the back of any of the levels. Think of it as a partially
+ * sorted queue.
+ */
+#define NR_QUEUE_LEVELS 16u
+#define NR_SENTINELS NR_QUEUE_LEVELS * 3
+
+#define WRITEBACK_PERIOD HZ
+
+struct queue {
+ unsigned nr_elts;
+ bool current_writeback_sentinels;
+ unsigned long next_writeback;
+ struct list_head qs[NR_QUEUE_LEVELS];
+ struct list_head sentinels[NR_SENTINELS];
+};
+
+static void queue_init(struct queue *q)
+{
+ unsigned i;
+
+ q->nr_elts = 0;
+ q->current_writeback_sentinels = false;
+ q->next_writeback = 0;
+ for (i = 0; i < NR_QUEUE_LEVELS; i++) {
+ INIT_LIST_HEAD(q->qs + i);
+ INIT_LIST_HEAD(q->sentinels + i);
+ INIT_LIST_HEAD(q->sentinels + NR_QUEUE_LEVELS + i);
+ INIT_LIST_HEAD(q->sentinels + (2 * NR_QUEUE_LEVELS) + i);
+ }
+}
+
+static unsigned queue_size(struct queue *q)
+{
+ return q->nr_elts;
+}
+
+static bool queue_empty(struct queue *q)
+{
+ return q->nr_elts == 0;
+}
+
+/*
+ * Insert an entry to the back of the given level.
+ */
+static void queue_push(struct queue *q, unsigned level, struct list_head *elt)
+{
+ q->nr_elts++;
+ list_add_tail(elt, q->qs + level);
+}
+
+static void queue_remove(struct queue *q, struct list_head *elt)
+{
+ q->nr_elts--;
+ list_del(elt);
+}
+
+static bool is_sentinel(struct queue *q, struct list_head *h)
+{
+ return (h >= q->sentinels) && (h < (q->sentinels + NR_SENTINELS));
+}
+
+/*
+ * Gives us the oldest entry of the lowest popoulated level. If the first
+ * level is emptied then we shift down one level.
+ */
+static struct list_head *queue_peek(struct queue *q)
+{
+ unsigned level;
+ struct list_head *h;
+
+ for (level = 0; level < NR_QUEUE_LEVELS; level++)
+ list_for_each(h, q->qs + level)
+ if (!is_sentinel(q, h))
+ return h;
+
+ return NULL;
+}
+
+static struct list_head *queue_pop(struct queue *q)
+{
+ struct list_head *r = queue_peek(q);
+
+ if (r) {
+ q->nr_elts--;
+ list_del(r);
+ }
+
+ return r;
+}
+
+/*
+ * Pops an entry from a level that is not past a sentinel.
+ */
+static struct list_head *queue_pop_old(struct queue *q)
+{
+ unsigned level;
+ struct list_head *h;
+
+ for (level = 0; level < NR_QUEUE_LEVELS; level++)
+ list_for_each(h, q->qs + level) {
+ if (is_sentinel(q, h))
+ break;
+
+ q->nr_elts--;
+ list_del(h);
+ return h;
+ }
+
+ return NULL;
+}
+
+static struct list_head *list_pop(struct list_head *lh)
+{
+ struct list_head *r = lh->next;
+
+ BUG_ON(!r);
+ list_del_init(r);
+
+ return r;
+}
+
+static struct list_head *writeback_sentinel(struct queue *q, unsigned level)
+{
+ if (q->current_writeback_sentinels)
+ return q->sentinels + NR_QUEUE_LEVELS + level;
+ else
+ return q->sentinels + 2 * NR_QUEUE_LEVELS + level;
+}
+
+static void queue_update_writeback_sentinels(struct queue *q)
+{
+ unsigned i;
+ struct list_head *h;
+
+ if (time_after(jiffies, q->next_writeback)) {
+ for (i = 0; i < NR_QUEUE_LEVELS; i++) {
+ h = writeback_sentinel(q, i);
+ list_del(h);
+ list_add_tail(h, q->qs + i);
+ }
+
+ q->next_writeback = jiffies + WRITEBACK_PERIOD;
+ q->current_writeback_sentinels = !q->current_writeback_sentinels;
+ }
+}
+
+/*
+ * Sometimes we want to iterate through entries that have been pushed since
+ * a certain event. We use sentinel entries on the queues to delimit these
+ * 'tick' events.
+ */
+static void queue_tick(struct queue *q)
+{
+ unsigned i;
+
+ for (i = 0; i < NR_QUEUE_LEVELS; i++) {
+ list_del(q->sentinels + i);
+ list_add_tail(q->sentinels + i, q->qs + i);
+ }
+}
+
+typedef void (*iter_fn)(struct list_head *, void *);
+static void queue_iterate_tick(struct queue *q, iter_fn fn, void *context)
+{
+ unsigned i;
+ struct list_head *h;
+
+ for (i = 0; i < NR_QUEUE_LEVELS; i++) {
+ list_for_each_prev(h, q->qs + i) {
+ if (is_sentinel(q, h))
+ break;
+
+ fn(h, context);
+ }
+ }
+}
+
+/*----------------------------------------------------------------*/
+
+/*
+ * Describes a cache entry. Used in both the cache and the pre_cache.
+ */
+struct entry {
+ struct hlist_node hlist;
+ struct list_head list;
+ dm_oblock_t oblock;
+
+ /*
+ * FIXME: pack these better
+ */
+ bool dirty:1;
+ unsigned hit_count;
+};
+
+/*
+ * Rather than storing the cblock in an entry, we allocate all entries in
+ * an array, and infer the cblock from the entry position.
+ *
+ * Free entries are linked together into a list.
+ */
+struct entry_pool {
+ struct entry *entries, *entries_end;
+ struct list_head free;
+ unsigned nr_allocated;
+};
+
+static int epool_init(struct entry_pool *ep, unsigned nr_entries)
+{
+ unsigned i;
+
+ ep->entries = vzalloc(sizeof(struct entry) * nr_entries);
+ if (!ep->entries)
+ return -ENOMEM;
+
+ ep->entries_end = ep->entries + nr_entries;
+
+ INIT_LIST_HEAD(&ep->free);
+ for (i = 0; i < nr_entries; i++)
+ list_add(&ep->entries[i].list, &ep->free);
+
+ ep->nr_allocated = 0;
+
+ return 0;
+}
+
+static void epool_exit(struct entry_pool *ep)
+{
+ vfree(ep->entries);
+}
+
+static struct entry *alloc_entry(struct entry_pool *ep)
+{
+ struct entry *e;
+
+ if (list_empty(&ep->free))
+ return NULL;
+
+ e = list_entry(list_pop(&ep->free), struct entry, list);
+ INIT_LIST_HEAD(&e->list);
+ INIT_HLIST_NODE(&e->hlist);
+ ep->nr_allocated++;
+
+ return e;
+}
+
+/*
+ * This assumes the cblock hasn't already been allocated.
+ */
+static struct entry *alloc_particular_entry(struct entry_pool *ep, dm_cblock_t cblock)
+{
+ struct entry *e = ep->entries + from_cblock(cblock);
+
+ list_del_init(&e->list);
+ INIT_HLIST_NODE(&e->hlist);
+ ep->nr_allocated++;
+
+ return e;
+}
+
+static void free_entry(struct entry_pool *ep, struct entry *e)
+{
+ BUG_ON(!ep->nr_allocated);
+ ep->nr_allocated--;
+ INIT_HLIST_NODE(&e->hlist);
+ list_add(&e->list, &ep->free);
+}
+
+/*
+ * Returns NULL if the entry is free.
+ */
+static struct entry *epool_find(struct entry_pool *ep, dm_cblock_t cblock)
+{
+ struct entry *e = ep->entries + from_cblock(cblock);
+ return !hlist_unhashed(&e->hlist) ? e : NULL;
+}
+
+static bool epool_empty(struct entry_pool *ep)
+{
+ return list_empty(&ep->free);
+}
+
+static bool in_pool(struct entry_pool *ep, struct entry *e)
+{
+ return e >= ep->entries && e < ep->entries_end;
+}
+
+static dm_cblock_t infer_cblock(struct entry_pool *ep, struct entry *e)
+{
+ return to_cblock(e - ep->entries);
+}
+
+/*----------------------------------------------------------------*/
+
+struct mq_policy {
+ struct dm_cache_policy policy;
+
+ /* protects everything */
+ struct mutex lock;
+ dm_cblock_t cache_size;
+ struct io_tracker tracker;
+
+ /*
+ * Entries come from two pools, one of pre-cache entries, and one
+ * for the cache proper.
+ */
+ struct entry_pool pre_cache_pool;
+ struct entry_pool cache_pool;
+
+ /*
+ * We maintain three queues of entries. The cache proper,
+ * consisting of a clean and dirty queue, contains the currently
+ * active mappings. Whereas the pre_cache tracks blocks that
+ * are being hit frequently and potential candidates for promotion
+ * to the cache.
+ */
+ struct queue pre_cache;
+ struct queue cache_clean;
+ struct queue cache_dirty;
+
+ /*
+ * Keeps track of time, incremented by the core. We use this to
+ * avoid attributing multiple hits within the same tick.
+ *
+ * Access to tick_protected should be done with the spin lock held.
+ * It's copied to tick at the start of the map function (within the
+ * mutex).
+ */
+ spinlock_t tick_lock;
+ unsigned tick_protected;
+ unsigned tick;
+
+ /*
+ * A count of the number of times the map function has been called
+ * and found an entry in the pre_cache or cache. Currently used to
+ * calculate the generation.
+ */
+ unsigned hit_count;
+
+ /*
+ * A generation is a longish period that is used to trigger some
+ * book keeping effects. eg, decrementing hit counts on entries.
+ * This is needed to allow the cache to evolve as io patterns
+ * change.
+ */
+ unsigned generation;
+ unsigned generation_period; /* in lookups (will probably change) */
+
+ unsigned discard_promote_adjustment;
+ unsigned read_promote_adjustment;
+ unsigned write_promote_adjustment;
+
+ /*
+ * The hash table allows us to quickly find an entry by origin
+ * block. Both pre_cache and cache entries are in here.
+ */
+ unsigned nr_buckets;
+ dm_block_t hash_bits;
+ struct hlist_head *table;
+};
+
+#define DEFAULT_DISCARD_PROMOTE_ADJUSTMENT 1
+#define DEFAULT_READ_PROMOTE_ADJUSTMENT 4
+#define DEFAULT_WRITE_PROMOTE_ADJUSTMENT 8
+#define DISCOURAGE_DEMOTING_DIRTY_THRESHOLD 128
+
+/*----------------------------------------------------------------*/
+
+/*
+ * Simple hash table implementation. Should replace with the standard hash
+ * table that's making its way upstream.
+ */
+static void hash_insert(struct mq_policy *mq, struct entry *e)
+{
+ unsigned h = hash_64(from_oblock(e->oblock), mq->hash_bits);
+
+ hlist_add_head(&e->hlist, mq->table + h);
+}
+
+static struct entry *hash_lookup(struct mq_policy *mq, dm_oblock_t oblock)
+{
+ unsigned h = hash_64(from_oblock(oblock), mq->hash_bits);
+ struct hlist_head *bucket = mq->table + h;
+ struct entry *e;
+
+ hlist_for_each_entry(e, bucket, hlist)
+ if (e->oblock == oblock) {
+ hlist_del(&e->hlist);
+ hlist_add_head(&e->hlist, bucket);
+ return e;
+ }
+
+ return NULL;
+}
+
+static void hash_remove(struct entry *e)
+{
+ hlist_del(&e->hlist);
+}
+
+/*----------------------------------------------------------------*/
+
+static bool any_free_cblocks(struct mq_policy *mq)
+{
+ return !epool_empty(&mq->cache_pool);
+}
+
+static bool any_clean_cblocks(struct mq_policy *mq)
+{
+ return !queue_empty(&mq->cache_clean);
+}
+
+/*----------------------------------------------------------------*/
+
+/*
+ * Now we get to the meat of the policy. This section deals with deciding
+ * when to to add entries to the pre_cache and cache, and move between
+ * them.
+ */
+
+/*
+ * The queue level is based on the log2 of the hit count.
+ */
+static unsigned queue_level(struct entry *e)
+{
+ return min((unsigned) ilog2(e->hit_count), NR_QUEUE_LEVELS - 1u);
+}
+
+static bool in_cache(struct mq_policy *mq, struct entry *e)
+{
+ return in_pool(&mq->cache_pool, e);
+}
+
+/*
+ * Inserts the entry into the pre_cache or the cache. Ensures the cache
+ * block is marked as allocated if necc. Inserts into the hash table.
+ * Sets the tick which records when the entry was last moved about.
+ */
+static void push(struct mq_policy *mq, struct entry *e)
+{
+ hash_insert(mq, e);
+
+ if (in_cache(mq, e))
+ queue_push(e->dirty ? &mq->cache_dirty : &mq->cache_clean,
+ queue_level(e), &e->list);
+ else
+ queue_push(&mq->pre_cache, queue_level(e), &e->list);
+}
+
+/*
+ * Removes an entry from pre_cache or cache. Removes from the hash table.
+ */
+static void del(struct mq_policy *mq, struct entry *e)
+{
+ if (in_cache(mq, e))
+ queue_remove(e->dirty ? &mq->cache_dirty : &mq->cache_clean, &e->list);
+ else
+ queue_remove(&mq->pre_cache, &e->list);
+
+ hash_remove(e);
+}
+
+/*
+ * Like del, except it removes the first entry in the queue (ie. the least
+ * recently used).
+ */
+static struct entry *pop(struct mq_policy *mq, struct queue *q)
+{
+ struct entry *e;
+ struct list_head *h = queue_pop(q);
+
+ if (!h)
+ return NULL;
+
+ e = container_of(h, struct entry, list);
+ hash_remove(e);
+
+ return e;
+}
+
+static struct entry *pop_old(struct mq_policy *mq, struct queue *q)
+{
+ struct entry *e;
+ struct list_head *h = queue_pop_old(q);
+
+ if (!h)
+ return NULL;
+
+ e = container_of(h, struct entry, list);
+ hash_remove(e);
+
+ return e;
+}
+
+static struct entry *peek(struct queue *q)
+{
+ struct list_head *h = queue_peek(q);
+ return h ? container_of(h, struct entry, list) : NULL;
+}
+
+/*
+ * The promotion threshold is adjusted every generation. As are the counts
+ * of the entries.
+ *
+ * At the moment the threshold is taken by averaging the hit counts of some
+ * of the entries in the cache (the first 20 entries across all levels in
+ * ascending order, giving preference to the clean entries at each level).
+ *
+ * We can be much cleverer than this though. For example, each promotion
+ * could bump up the threshold helping to prevent churn. Much more to do
+ * here.
+ */
+
+#define MAX_TO_AVERAGE 20
+
+static void check_generation(struct mq_policy *mq)
+{
+ unsigned total = 0, nr = 0, count = 0, level;
+ struct list_head *head;
+ struct entry *e;
+
+ if ((mq->hit_count >= mq->generation_period) && (epool_empty(&mq->cache_pool))) {
+ mq->hit_count = 0;
+ mq->generation++;
+
+ for (level = 0; level < NR_QUEUE_LEVELS && count < MAX_TO_AVERAGE; level++) {
+ head = mq->cache_clean.qs + level;
+ list_for_each_entry(e, head, list) {
+ nr++;
+ total += e->hit_count;
+
+ if (++count >= MAX_TO_AVERAGE)
+ break;
+ }
+
+ head = mq->cache_dirty.qs + level;
+ list_for_each_entry(e, head, list) {
+ nr++;
+ total += e->hit_count;
+
+ if (++count >= MAX_TO_AVERAGE)
+ break;
+ }
+ }
+ }
+}
+
+/*
+ * Whenever we use an entry we bump up it's hit counter, and push it to the
+ * back to it's current level.
+ */
+static void requeue(struct mq_policy *mq, struct entry *e)
+{
+ check_generation(mq);
+ del(mq, e);
+ push(mq, e);
+}
+
+/*
+ * Demote the least recently used entry from the cache to the pre_cache.
+ * Returns the new cache entry to use, and the old origin block it was
+ * mapped to.
+ *
+ * We drop the hit count on the demoted entry back to 1 to stop it bouncing
+ * straight back into the cache if it's subsequently hit. There are
+ * various options here, and more experimentation would be good:
+ *
+ * - just forget about the demoted entry completely (ie. don't insert it
+ into the pre_cache).
+ * - divide the hit count rather that setting to some hard coded value.
+ * - set the hit count to a hard coded value other than 1, eg, is it better
+ * if it goes in at level 2?
+ */
+static int demote_cblock(struct mq_policy *mq,
+ struct policy_locker *locker, dm_oblock_t *oblock)
+{
+ struct entry *demoted = peek(&mq->cache_clean);
+
+ if (!demoted)
+ /*
+ * We could get a block from mq->cache_dirty, but that
+ * would add extra latency to the triggering bio as it
+ * waits for the writeback. Better to not promote this
+ * time and hope there's a clean block next time this block
+ * is hit.
+ */
+ return -ENOSPC;
+
+ if (locker->fn(locker, demoted->oblock))
+ /*
+ * We couldn't lock the demoted block.
+ */
+ return -EBUSY;
+
+ del(mq, demoted);
+ *oblock = demoted->oblock;
+ free_entry(&mq->cache_pool, demoted);
+
+ /*
+ * We used to put the demoted block into the pre-cache, but I think
+ * it's simpler to just let it work it's way up from zero again.
+ * Stops blocks flickering in and out of the cache.
+ */
+
+ return 0;
+}
+
+/*
+ * Entries in the pre_cache whose hit count passes the promotion
+ * threshold move to the cache proper. Working out the correct
+ * value for the promotion_threshold is crucial to this policy.
+ */
+static unsigned promote_threshold(struct mq_policy *mq)
+{
+ struct entry *e;
+
+ if (any_free_cblocks(mq))
+ return 0;
+
+ e = peek(&mq->cache_clean);
+ if (e)
+ return e->hit_count;
+
+ e = peek(&mq->cache_dirty);
+ if (e)
+ return e->hit_count + DISCOURAGE_DEMOTING_DIRTY_THRESHOLD;
+
+ /* This should never happen */
+ return 0;
+}
+
+/*
+ * We modify the basic promotion_threshold depending on the specific io.
+ *
+ * If the origin block has been discarded then there's no cost to copy it
+ * to the cache.
+ *
+ * We bias towards reads, since they can be demoted at no cost if they
+ * haven't been dirtied.
+ */
+static unsigned adjusted_promote_threshold(struct mq_policy *mq,
+ bool discarded_oblock, int data_dir)
+{
+ if (data_dir == READ)
+ return promote_threshold(mq) + mq->read_promote_adjustment;
+
+ if (discarded_oblock && (any_free_cblocks(mq) || any_clean_cblocks(mq))) {
+ /*
+ * We don't need to do any copying at all, so give this a
+ * very low threshold.
+ */
+ return mq->discard_promote_adjustment;
+ }
+
+ return promote_threshold(mq) + mq->write_promote_adjustment;
+}
+
+static bool should_promote(struct mq_policy *mq, struct entry *e,
+ bool discarded_oblock, int data_dir)
+{
+ return e->hit_count >=
+ adjusted_promote_threshold(mq, discarded_oblock, data_dir);
+}
+
+static int cache_entry_found(struct mq_policy *mq,
+ struct entry *e,
+ struct policy_result *result)
+{
+ requeue(mq, e);
+
+ if (in_cache(mq, e)) {
+ result->op = POLICY_HIT;
+ result->cblock = infer_cblock(&mq->cache_pool, e);
+ }
+
+ return 0;
+}
+
+/*
+ * Moves an entry from the pre_cache to the cache. The main work is
+ * finding which cache block to use.
+ */
+static int pre_cache_to_cache(struct mq_policy *mq, struct entry *e,
+ struct policy_locker *locker,
+ struct policy_result *result)
+{
+ int r;
+ struct entry *new_e;
+
+ /* Ensure there's a free cblock in the cache */
+ if (epool_empty(&mq->cache_pool)) {
+ result->op = POLICY_REPLACE;
+ r = demote_cblock(mq, locker, &result->old_oblock);
+ if (r) {
+ result->op = POLICY_MISS;
+ return 0;
+ }
+
+ } else
+ result->op = POLICY_NEW;
+
+ new_e = alloc_entry(&mq->cache_pool);
+ BUG_ON(!new_e);
+
+ new_e->oblock = e->oblock;
+ new_e->dirty = false;
+ new_e->hit_count = e->hit_count;
+
+ del(mq, e);
+ free_entry(&mq->pre_cache_pool, e);
+ push(mq, new_e);
+
+ result->cblock = infer_cblock(&mq->cache_pool, new_e);
+
+ return 0;
+}
+
+static int pre_cache_entry_found(struct mq_policy *mq, struct entry *e,
+ bool can_migrate, bool discarded_oblock,
+ int data_dir, struct policy_locker *locker,
+ struct policy_result *result)
+{
+ int r = 0;
+
+ if (!should_promote(mq, e, discarded_oblock, data_dir)) {
+ requeue(mq, e);
+ result->op = POLICY_MISS;
+
+ } else if (!can_migrate)
+ r = -EWOULDBLOCK;
+
+ else {
+ requeue(mq, e);
+ r = pre_cache_to_cache(mq, e, locker, result);
+ }
+
+ return r;
+}
+
+static void insert_in_pre_cache(struct mq_policy *mq,
+ dm_oblock_t oblock)
+{
+ struct entry *e = alloc_entry(&mq->pre_cache_pool);
+
+ if (!e)
+ /*
+ * There's no spare entry structure, so we grab the least
+ * used one from the pre_cache.
+ */
+ e = pop(mq, &mq->pre_cache);
+
+ if (unlikely(!e)) {
+ DMWARN("couldn't pop from pre cache");
+ return;
+ }
+
+ e->dirty = false;
+ e->oblock = oblock;
+ e->hit_count = 1;
+ push(mq, e);
+}
+
+static void insert_in_cache(struct mq_policy *mq, dm_oblock_t oblock,
+ struct policy_locker *locker,
+ struct policy_result *result)
+{
+ int r;
+ struct entry *e;
+
+ if (epool_empty(&mq->cache_pool)) {
+ result->op = POLICY_REPLACE;
+ r = demote_cblock(mq, locker, &result->old_oblock);
+ if (unlikely(r)) {
+ result->op = POLICY_MISS;
+ insert_in_pre_cache(mq, oblock);
+ return;
+ }
+
+ /*
+ * This will always succeed, since we've just demoted.
+ */
+ e = alloc_entry(&mq->cache_pool);
+ BUG_ON(!e);
+
+ } else {
+ e = alloc_entry(&mq->cache_pool);
+ result->op = POLICY_NEW;
+ }
+
+ e->oblock = oblock;
+ e->dirty = false;
+ e->hit_count = 1;
+ push(mq, e);
+
+ result->cblock = infer_cblock(&mq->cache_pool, e);
+}
+
+static int no_entry_found(struct mq_policy *mq, dm_oblock_t oblock,
+ bool can_migrate, bool discarded_oblock,
+ int data_dir, struct policy_locker *locker,
+ struct policy_result *result)
+{
+ if (adjusted_promote_threshold(mq, discarded_oblock, data_dir) <= 1) {
+ if (can_migrate)
+ insert_in_cache(mq, oblock, locker, result);
+ else
+ return -EWOULDBLOCK;
+ } else {
+ insert_in_pre_cache(mq, oblock);
+ result->op = POLICY_MISS;
+ }
+
+ return 0;
+}
+
+/*
+ * Looks the oblock up in the hash table, then decides whether to put in
+ * pre_cache, or cache etc.
+ */
+static int map(struct mq_policy *mq, dm_oblock_t oblock,
+ bool can_migrate, bool discarded_oblock,
+ int data_dir, struct policy_locker *locker,
+ struct policy_result *result)
+{
+ int r = 0;
+ struct entry *e = hash_lookup(mq, oblock);
+
+ if (e && in_cache(mq, e))
+ r = cache_entry_found(mq, e, result);
+
+ else if (mq->tracker.thresholds[PATTERN_SEQUENTIAL] &&
+ iot_pattern(&mq->tracker) == PATTERN_SEQUENTIAL)
+ result->op = POLICY_MISS;
+
+ else if (e)
+ r = pre_cache_entry_found(mq, e, can_migrate, discarded_oblock,
+ data_dir, locker, result);
+
+ else
+ r = no_entry_found(mq, oblock, can_migrate, discarded_oblock,
+ data_dir, locker, result);
+
+ if (r == -EWOULDBLOCK)
+ result->op = POLICY_MISS;
+
+ return r;
+}
+
+/*----------------------------------------------------------------*/
+
+/*
+ * Public interface, via the policy struct. See dm-cache-policy.h for a
+ * description of these.
+ */
+
+static struct mq_policy *to_mq_policy(struct dm_cache_policy *p)
+{
+ return container_of(p, struct mq_policy, policy);
+}
+
+static void mq_destroy(struct dm_cache_policy *p)
+{
+ struct mq_policy *mq = to_mq_policy(p);
+
+ vfree(mq->table);
+ epool_exit(&mq->cache_pool);
+ epool_exit(&mq->pre_cache_pool);
+ kfree(mq);
+}
+
+static void update_pre_cache_hits(struct list_head *h, void *context)
+{
+ struct entry *e = container_of(h, struct entry, list);
+ e->hit_count++;
+}
+
+static void update_cache_hits(struct list_head *h, void *context)
+{
+ struct mq_policy *mq = context;
+ struct entry *e = container_of(h, struct entry, list);
+ e->hit_count++;
+ mq->hit_count++;
+}
+
+static void copy_tick(struct mq_policy *mq)
+{
+ unsigned long flags, tick;
+
+ spin_lock_irqsave(&mq->tick_lock, flags);
+ tick = mq->tick_protected;
+ if (tick != mq->tick) {
+ queue_iterate_tick(&mq->pre_cache, update_pre_cache_hits, mq);
+ queue_iterate_tick(&mq->cache_dirty, update_cache_hits, mq);
+ queue_iterate_tick(&mq->cache_clean, update_cache_hits, mq);
+ mq->tick = tick;
+ }
+
+ queue_tick(&mq->pre_cache);
+ queue_tick(&mq->cache_dirty);
+ queue_tick(&mq->cache_clean);
+ queue_update_writeback_sentinels(&mq->cache_dirty);
+ spin_unlock_irqrestore(&mq->tick_lock, flags);
+}
+
+static int mq_map(struct dm_cache_policy *p, dm_oblock_t oblock,
+ bool can_block, bool can_migrate, bool discarded_oblock,
+ struct bio *bio, struct policy_locker *locker,
+ struct policy_result *result)
+{
+ int r;
+ struct mq_policy *mq = to_mq_policy(p);
+
+ result->op = POLICY_MISS;
+
+ if (can_block)
+ mutex_lock(&mq->lock);
+ else if (!mutex_trylock(&mq->lock))
+ return -EWOULDBLOCK;
+
+ copy_tick(mq);
+
+ iot_examine_bio(&mq->tracker, bio);
+ r = map(mq, oblock, can_migrate, discarded_oblock,
+ bio_data_dir(bio), locker, result);
+
+ mutex_unlock(&mq->lock);
+
+ return r;
+}
+
+static int mq_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock)
+{
+ int r;
+ struct mq_policy *mq = to_mq_policy(p);
+ struct entry *e;
+
+ if (!mutex_trylock(&mq->lock))
+ return -EWOULDBLOCK;
+
+ e = hash_lookup(mq, oblock);
+ if (e && in_cache(mq, e)) {
+ *cblock = infer_cblock(&mq->cache_pool, e);
+ r = 0;
+ } else
+ r = -ENOENT;
+
+ mutex_unlock(&mq->lock);
+
+ return r;
+}
+
+static void __mq_set_clear_dirty(struct mq_policy *mq, dm_oblock_t oblock, bool set)
+{
+ struct entry *e;
+
+ e = hash_lookup(mq, oblock);
+ BUG_ON(!e || !in_cache(mq, e));
+
+ del(mq, e);
+ e->dirty = set;
+ push(mq, e);
+}
+
+static void mq_set_dirty(struct dm_cache_policy *p, dm_oblock_t oblock)
+{
+ struct mq_policy *mq = to_mq_policy(p);
+
+ mutex_lock(&mq->lock);
+ __mq_set_clear_dirty(mq, oblock, true);
+ mutex_unlock(&mq->lock);
+}
+
+static void mq_clear_dirty(struct dm_cache_policy *p, dm_oblock_t oblock)
+{
+ struct mq_policy *mq = to_mq_policy(p);
+
+ mutex_lock(&mq->lock);
+ __mq_set_clear_dirty(mq, oblock, false);
+ mutex_unlock(&mq->lock);
+}
+
+static int mq_load_mapping(struct dm_cache_policy *p,
+ dm_oblock_t oblock, dm_cblock_t cblock,
+ uint32_t hint, bool hint_valid)
+{
+ struct mq_policy *mq = to_mq_policy(p);
+ struct entry *e;
+
+ e = alloc_particular_entry(&mq->cache_pool, cblock);
+ e->oblock = oblock;
+ e->dirty = false; /* this gets corrected in a minute */
+ e->hit_count = hint_valid ? hint : 1;
+ push(mq, e);
+
+ return 0;
+}
+
+static int mq_save_hints(struct mq_policy *mq, struct queue *q,
+ policy_walk_fn fn, void *context)
+{
+ int r;
+ unsigned level;
+ struct list_head *h;
+ struct entry *e;
+
+ for (level = 0; level < NR_QUEUE_LEVELS; level++)
+ list_for_each(h, q->qs + level) {
+ if (is_sentinel(q, h))
+ continue;
+
+ e = container_of(h, struct entry, list);
+ r = fn(context, infer_cblock(&mq->cache_pool, e),
+ e->oblock, e->hit_count);
+ if (r)
+ return r;
+ }
+
+ return 0;
+}
+
+static int mq_walk_mappings(struct dm_cache_policy *p, policy_walk_fn fn,
+ void *context)
+{
+ struct mq_policy *mq = to_mq_policy(p);
+ int r = 0;
+
+ mutex_lock(&mq->lock);
+
+ r = mq_save_hints(mq, &mq->cache_clean, fn, context);
+ if (!r)
+ r = mq_save_hints(mq, &mq->cache_dirty, fn, context);
+
+ mutex_unlock(&mq->lock);
+
+ return r;
+}
+
+static void __remove_mapping(struct mq_policy *mq, dm_oblock_t oblock)
+{
+ struct entry *e;
+
+ e = hash_lookup(mq, oblock);
+ BUG_ON(!e || !in_cache(mq, e));
+
+ del(mq, e);
+ free_entry(&mq->cache_pool, e);
+}
+
+static void mq_remove_mapping(struct dm_cache_policy *p, dm_oblock_t oblock)
+{
+ struct mq_policy *mq = to_mq_policy(p);
+
+ mutex_lock(&mq->lock);
+ __remove_mapping(mq, oblock);
+ mutex_unlock(&mq->lock);
+}
+
+static int __remove_cblock(struct mq_policy *mq, dm_cblock_t cblock)
+{
+ struct entry *e = epool_find(&mq->cache_pool, cblock);
+
+ if (!e)
+ return -ENODATA;
+
+ del(mq, e);
+ free_entry(&mq->cache_pool, e);
+
+ return 0;
+}
+
+static int mq_remove_cblock(struct dm_cache_policy *p, dm_cblock_t cblock)
+{
+ int r;
+ struct mq_policy *mq = to_mq_policy(p);
+
+ mutex_lock(&mq->lock);
+ r = __remove_cblock(mq, cblock);
+ mutex_unlock(&mq->lock);
+
+ return r;
+}
+
+#define CLEAN_TARGET_PERCENTAGE 25
+
+static bool clean_target_met(struct mq_policy *mq)
+{
+ /*
+ * Cache entries may not be populated. So we're cannot rely on the
+ * size of the clean queue.
+ */
+ unsigned nr_clean = from_cblock(mq->cache_size) - queue_size(&mq->cache_dirty);
+ unsigned target = from_cblock(mq->cache_size) * CLEAN_TARGET_PERCENTAGE / 100;
+
+ return nr_clean >= target;
+}
+
+static int __mq_writeback_work(struct mq_policy *mq, dm_oblock_t *oblock,
+ dm_cblock_t *cblock)
+{
+ struct entry *e = pop_old(mq, &mq->cache_dirty);
+
+ if (!e && !clean_target_met(mq))
+ e = pop(mq, &mq->cache_dirty);
+
+ if (!e)
+ return -ENODATA;
+
+ *oblock = e->oblock;
+ *cblock = infer_cblock(&mq->cache_pool, e);
+ e->dirty = false;
+ push(mq, e);
+
+ return 0;
+}
+
+static int mq_writeback_work(struct dm_cache_policy *p, dm_oblock_t *oblock,
+ dm_cblock_t *cblock)
+{
+ int r;
+ struct mq_policy *mq = to_mq_policy(p);
+
+ mutex_lock(&mq->lock);
+ r = __mq_writeback_work(mq, oblock, cblock);
+ mutex_unlock(&mq->lock);
+
+ return r;
+}
+
+static void __force_mapping(struct mq_policy *mq,
+ dm_oblock_t current_oblock, dm_oblock_t new_oblock)
+{
+ struct entry *e = hash_lookup(mq, current_oblock);
+
+ if (e && in_cache(mq, e)) {
+ del(mq, e);
+ e->oblock = new_oblock;
+ e->dirty = true;
+ push(mq, e);
+ }
+}
+
+static void mq_force_mapping(struct dm_cache_policy *p,
+ dm_oblock_t current_oblock, dm_oblock_t new_oblock)
+{
+ struct mq_policy *mq = to_mq_policy(p);
+
+ mutex_lock(&mq->lock);
+ __force_mapping(mq, current_oblock, new_oblock);
+ mutex_unlock(&mq->lock);
+}
+
+static dm_cblock_t mq_residency(struct dm_cache_policy *p)
+{
+ dm_cblock_t r;
+ struct mq_policy *mq = to_mq_policy(p);
+
+ mutex_lock(&mq->lock);
+ r = to_cblock(mq->cache_pool.nr_allocated);
+ mutex_unlock(&mq->lock);
+
+ return r;
+}
+
+static void mq_tick(struct dm_cache_policy *p)
+{
+ struct mq_policy *mq = to_mq_policy(p);
+ unsigned long flags;
+
+ spin_lock_irqsave(&mq->tick_lock, flags);
+ mq->tick_protected++;
+ spin_unlock_irqrestore(&mq->tick_lock, flags);
+}
+
+static int mq_set_config_value(struct dm_cache_policy *p,
+ const char *key, const char *value)
+{
+ struct mq_policy *mq = to_mq_policy(p);
+ unsigned long tmp;
+
+ if (kstrtoul(value, 10, &tmp))
+ return -EINVAL;
+
+ if (!strcasecmp(key, "random_threshold")) {
+ mq->tracker.thresholds[PATTERN_RANDOM] = tmp;
+
+ } else if (!strcasecmp(key, "sequential_threshold")) {
+ mq->tracker.thresholds[PATTERN_SEQUENTIAL] = tmp;
+
+ } else if (!strcasecmp(key, "discard_promote_adjustment"))
+ mq->discard_promote_adjustment = tmp;
+
+ else if (!strcasecmp(key, "read_promote_adjustment"))
+ mq->read_promote_adjustment = tmp;
+
+ else if (!strcasecmp(key, "write_promote_adjustment"))
+ mq->write_promote_adjustment = tmp;
+
+ else
+ return -EINVAL;
+
+ return 0;
+}
+
+static int mq_emit_config_values(struct dm_cache_policy *p, char *result, unsigned maxlen)
+{
+ ssize_t sz = 0;
+ struct mq_policy *mq = to_mq_policy(p);
+
+ DMEMIT("10 random_threshold %u "
+ "sequential_threshold %u "
+ "discard_promote_adjustment %u "
+ "read_promote_adjustment %u "
+ "write_promote_adjustment %u",
+ mq->tracker.thresholds[PATTERN_RANDOM],
+ mq->tracker.thresholds[PATTERN_SEQUENTIAL],
+ mq->discard_promote_adjustment,
+ mq->read_promote_adjustment,
+ mq->write_promote_adjustment);
+
+ return 0;
+}
+
+/* Init the policy plugin interface function pointers. */
+static void init_policy_functions(struct mq_policy *mq)
+{
+ mq->policy.destroy = mq_destroy;
+ mq->policy.map = mq_map;
+ mq->policy.lookup = mq_lookup;
+ mq->policy.set_dirty = mq_set_dirty;
+ mq->policy.clear_dirty = mq_clear_dirty;
+ mq->policy.load_mapping = mq_load_mapping;
+ mq->policy.walk_mappings = mq_walk_mappings;
+ mq->policy.remove_mapping = mq_remove_mapping;
+ mq->policy.remove_cblock = mq_remove_cblock;
+ mq->policy.writeback_work = mq_writeback_work;
+ mq->policy.force_mapping = mq_force_mapping;
+ mq->policy.residency = mq_residency;
+ mq->policy.tick = mq_tick;
+ mq->policy.emit_config_values = mq_emit_config_values;
+ mq->policy.set_config_value = mq_set_config_value;
+}
+
+static struct dm_cache_policy *mq_create(dm_cblock_t cache_size,
+ sector_t origin_size,
+ sector_t cache_block_size)
+{
+ struct mq_policy *mq = kzalloc(sizeof(*mq), GFP_KERNEL);
+
+ if (!mq)
+ return NULL;
+
+ init_policy_functions(mq);
+ iot_init(&mq->tracker, SEQUENTIAL_THRESHOLD_DEFAULT, RANDOM_THRESHOLD_DEFAULT);
+ mq->cache_size = cache_size;
+
+ if (epool_init(&mq->pre_cache_pool, from_cblock(cache_size))) {
+ DMERR("couldn't initialize pool of pre-cache entries");
+ goto bad_pre_cache_init;
+ }
+
+ if (epool_init(&mq->cache_pool, from_cblock(cache_size))) {
+ DMERR("couldn't initialize pool of cache entries");
+ goto bad_cache_init;
+ }
+
+ mq->tick_protected = 0;
+ mq->tick = 0;
+ mq->hit_count = 0;
+ mq->generation = 0;
+ mq->discard_promote_adjustment = DEFAULT_DISCARD_PROMOTE_ADJUSTMENT;
+ mq->read_promote_adjustment = DEFAULT_READ_PROMOTE_ADJUSTMENT;
+ mq->write_promote_adjustment = DEFAULT_WRITE_PROMOTE_ADJUSTMENT;
+ mutex_init(&mq->lock);
+ spin_lock_init(&mq->tick_lock);
+
+ queue_init(&mq->pre_cache);
+ queue_init(&mq->cache_clean);
+ queue_init(&mq->cache_dirty);
+
+ mq->generation_period = max((unsigned) from_cblock(cache_size), 1024U);
+
+ mq->nr_buckets = next_power(from_cblock(cache_size) / 2, 16);
+ mq->hash_bits = ffs(mq->nr_buckets) - 1;
+ mq->table = vzalloc(sizeof(*mq->table) * mq->nr_buckets);
+ if (!mq->table)
+ goto bad_alloc_table;
+
+ return &mq->policy;
+
+bad_alloc_table:
+ epool_exit(&mq->cache_pool);
+bad_cache_init:
+ epool_exit(&mq->pre_cache_pool);
+bad_pre_cache_init:
+ kfree(mq);
+
+ return NULL;
+}
+
+/*----------------------------------------------------------------*/
+
+static struct dm_cache_policy_type mq_policy_type = {
+ .name = "mq",
+ .version = {1, 3, 0},
+ .hint_size = 4,
+ .owner = THIS_MODULE,
+ .create = mq_create
+};
+
+static struct dm_cache_policy_type default_policy_type = {
+ .name = "default",
+ .version = {1, 3, 0},
+ .hint_size = 4,
+ .owner = THIS_MODULE,
+ .create = mq_create,
+ .real = &mq_policy_type
+};
+
+static int __init mq_init(void)
+{
+ int r;
+
+ mq_entry_cache = kmem_cache_create("dm_mq_policy_cache_entry",
+ sizeof(struct entry),
+ __alignof__(struct entry),
+ 0, NULL);
+ if (!mq_entry_cache)
+ goto bad;
+
+ r = dm_cache_policy_register(&mq_policy_type);
+ if (r) {
+ DMERR("register failed %d", r);
+ goto bad_register_mq;
+ }
+
+ r = dm_cache_policy_register(&default_policy_type);
+ if (!r) {
+ DMINFO("version %u.%u.%u loaded",
+ mq_policy_type.version[0],
+ mq_policy_type.version[1],
+ mq_policy_type.version[2]);
+ return 0;
+ }
+
+ DMERR("register failed (as default) %d", r);
+
+ dm_cache_policy_unregister(&mq_policy_type);
+bad_register_mq:
+ kmem_cache_destroy(mq_entry_cache);
+bad:
+ return -ENOMEM;
+}
+
+static void __exit mq_exit(void)
+{
+ dm_cache_policy_unregister(&mq_policy_type);
+ dm_cache_policy_unregister(&default_policy_type);
+
+ kmem_cache_destroy(mq_entry_cache);
+}
+
+module_init(mq_init);
+module_exit(mq_exit);
+
+MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("mq cache policy");
+
+MODULE_ALIAS("dm-cache-default");
diff --git a/drivers/md/dm-cache-policy.c b/drivers/md/dm-cache-policy.c
new file mode 100644
index 000000000..c1a3cee99
--- /dev/null
+++ b/drivers/md/dm-cache-policy.c
@@ -0,0 +1,173 @@
+/*
+ * Copyright (C) 2012 Red Hat. All rights reserved.
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm-cache-policy-internal.h"
+#include "dm.h"
+
+#include <linux/module.h>
+#include <linux/slab.h>
+
+/*----------------------------------------------------------------*/
+
+#define DM_MSG_PREFIX "cache-policy"
+
+static DEFINE_SPINLOCK(register_lock);
+static LIST_HEAD(register_list);
+
+static struct dm_cache_policy_type *__find_policy(const char *name)
+{
+ struct dm_cache_policy_type *t;
+
+ list_for_each_entry(t, &register_list, list)
+ if (!strcmp(t->name, name))
+ return t;
+
+ return NULL;
+}
+
+static struct dm_cache_policy_type *__get_policy_once(const char *name)
+{
+ struct dm_cache_policy_type *t = __find_policy(name);
+
+ if (t && !try_module_get(t->owner)) {
+ DMWARN("couldn't get module %s", name);
+ t = ERR_PTR(-EINVAL);
+ }
+
+ return t;
+}
+
+static struct dm_cache_policy_type *get_policy_once(const char *name)
+{
+ struct dm_cache_policy_type *t;
+
+ spin_lock(&register_lock);
+ t = __get_policy_once(name);
+ spin_unlock(&register_lock);
+
+ return t;
+}
+
+static struct dm_cache_policy_type *get_policy(const char *name)
+{
+ struct dm_cache_policy_type *t;
+
+ t = get_policy_once(name);
+ if (IS_ERR(t))
+ return NULL;
+
+ if (t)
+ return t;
+
+ request_module("dm-cache-%s", name);
+
+ t = get_policy_once(name);
+ if (IS_ERR(t))
+ return NULL;
+
+ return t;
+}
+
+static void put_policy(struct dm_cache_policy_type *t)
+{
+ module_put(t->owner);
+}
+
+int dm_cache_policy_register(struct dm_cache_policy_type *type)
+{
+ int r;
+
+ /* One size fits all for now */
+ if (type->hint_size != 0 && type->hint_size != 4) {
+ DMWARN("hint size must be 0 or 4 but %llu supplied.", (unsigned long long) type->hint_size);
+ return -EINVAL;
+ }
+
+ spin_lock(&register_lock);
+ if (__find_policy(type->name)) {
+ DMWARN("attempt to register policy under duplicate name %s", type->name);
+ r = -EINVAL;
+ } else {
+ list_add(&type->list, &register_list);
+ r = 0;
+ }
+ spin_unlock(&register_lock);
+
+ return r;
+}
+EXPORT_SYMBOL_GPL(dm_cache_policy_register);
+
+void dm_cache_policy_unregister(struct dm_cache_policy_type *type)
+{
+ spin_lock(&register_lock);
+ list_del_init(&type->list);
+ spin_unlock(&register_lock);
+}
+EXPORT_SYMBOL_GPL(dm_cache_policy_unregister);
+
+struct dm_cache_policy *dm_cache_policy_create(const char *name,
+ dm_cblock_t cache_size,
+ sector_t origin_size,
+ sector_t cache_block_size)
+{
+ struct dm_cache_policy *p = NULL;
+ struct dm_cache_policy_type *type;
+
+ type = get_policy(name);
+ if (!type) {
+ DMWARN("unknown policy type");
+ return ERR_PTR(-EINVAL);
+ }
+
+ p = type->create(cache_size, origin_size, cache_block_size);
+ if (!p) {
+ put_policy(type);
+ return ERR_PTR(-ENOMEM);
+ }
+ p->private = type;
+
+ return p;
+}
+EXPORT_SYMBOL_GPL(dm_cache_policy_create);
+
+void dm_cache_policy_destroy(struct dm_cache_policy *p)
+{
+ struct dm_cache_policy_type *t = p->private;
+
+ p->destroy(p);
+ put_policy(t);
+}
+EXPORT_SYMBOL_GPL(dm_cache_policy_destroy);
+
+const char *dm_cache_policy_get_name(struct dm_cache_policy *p)
+{
+ struct dm_cache_policy_type *t = p->private;
+
+ /* if t->real is set then an alias was used (e.g. "default") */
+ if (t->real)
+ return t->real->name;
+
+ return t->name;
+}
+EXPORT_SYMBOL_GPL(dm_cache_policy_get_name);
+
+const unsigned *dm_cache_policy_get_version(struct dm_cache_policy *p)
+{
+ struct dm_cache_policy_type *t = p->private;
+
+ return t->version;
+}
+EXPORT_SYMBOL_GPL(dm_cache_policy_get_version);
+
+size_t dm_cache_policy_get_hint_size(struct dm_cache_policy *p)
+{
+ struct dm_cache_policy_type *t = p->private;
+
+ return t->hint_size;
+}
+EXPORT_SYMBOL_GPL(dm_cache_policy_get_hint_size);
+
+/*----------------------------------------------------------------*/
diff --git a/drivers/md/dm-cache-policy.h b/drivers/md/dm-cache-policy.h
new file mode 100644
index 000000000..5524e21e4
--- /dev/null
+++ b/drivers/md/dm-cache-policy.h
@@ -0,0 +1,262 @@
+/*
+ * Copyright (C) 2012 Red Hat. All rights reserved.
+ *
+ * This file is released under the GPL.
+ */
+
+#ifndef DM_CACHE_POLICY_H
+#define DM_CACHE_POLICY_H
+
+#include "dm-cache-block-types.h"
+
+#include <linux/device-mapper.h>
+
+/*----------------------------------------------------------------*/
+
+/* FIXME: make it clear which methods are optional. Get debug policy to
+ * double check this at start.
+ */
+
+/*
+ * The cache policy makes the important decisions about which blocks get to
+ * live on the faster cache device.
+ *
+ * When the core target has to remap a bio it calls the 'map' method of the
+ * policy. This returns an instruction telling the core target what to do.
+ *
+ * POLICY_HIT:
+ * That block is in the cache. Remap to the cache and carry on.
+ *
+ * POLICY_MISS:
+ * This block is on the origin device. Remap and carry on.
+ *
+ * POLICY_NEW:
+ * This block is currently on the origin device, but the policy wants to
+ * move it. The core should:
+ *
+ * - hold any further io to this origin block
+ * - copy the origin to the given cache block
+ * - release all the held blocks
+ * - remap the original block to the cache
+ *
+ * POLICY_REPLACE:
+ * This block is currently on the origin device. The policy wants to
+ * move it to the cache, with the added complication that the destination
+ * cache block needs a writeback first. The core should:
+ *
+ * - hold any further io to this origin block
+ * - hold any further io to the origin block that's being written back
+ * - writeback
+ * - copy new block to cache
+ * - release held blocks
+ * - remap bio to cache and reissue.
+ *
+ * Should the core run into trouble while processing a POLICY_NEW or
+ * POLICY_REPLACE instruction it will roll back the policies mapping using
+ * remove_mapping() or force_mapping(). These methods must not fail. This
+ * approach avoids having transactional semantics in the policy (ie, the
+ * core informing the policy when a migration is complete), and hence makes
+ * it easier to write new policies.
+ *
+ * In general policy methods should never block, except in the case of the
+ * map function when can_migrate is set. So be careful to implement using
+ * bounded, preallocated memory.
+ */
+enum policy_operation {
+ POLICY_HIT,
+ POLICY_MISS,
+ POLICY_NEW,
+ POLICY_REPLACE
+};
+
+/*
+ * When issuing a POLICY_REPLACE the policy needs to make a callback to
+ * lock the block being demoted. This doesn't need to occur during a
+ * writeback operation since the block remains in the cache.
+ */
+struct policy_locker;
+typedef int (*policy_lock_fn)(struct policy_locker *l, dm_oblock_t oblock);
+
+struct policy_locker {
+ policy_lock_fn fn;
+};
+
+/*
+ * This is the instruction passed back to the core target.
+ */
+struct policy_result {
+ enum policy_operation op;
+ dm_oblock_t old_oblock; /* POLICY_REPLACE */
+ dm_cblock_t cblock; /* POLICY_HIT, POLICY_NEW, POLICY_REPLACE */
+};
+
+typedef int (*policy_walk_fn)(void *context, dm_cblock_t cblock,
+ dm_oblock_t oblock, uint32_t hint);
+
+/*
+ * The cache policy object. Just a bunch of methods. It is envisaged that
+ * this structure will be embedded in a bigger, policy specific structure
+ * (ie. use container_of()).
+ */
+struct dm_cache_policy {
+
+ /*
+ * FIXME: make it clear which methods are optional, and which may
+ * block.
+ */
+
+ /*
+ * Destroys this object.
+ */
+ void (*destroy)(struct dm_cache_policy *p);
+
+ /*
+ * See large comment above.
+ *
+ * oblock - the origin block we're interested in.
+ *
+ * can_block - indicates whether the current thread is allowed to
+ * block. -EWOULDBLOCK returned if it can't and would.
+ *
+ * can_migrate - gives permission for POLICY_NEW or POLICY_REPLACE
+ * instructions. If denied and the policy would have
+ * returned one of these instructions it should
+ * return -EWOULDBLOCK.
+ *
+ * discarded_oblock - indicates whether the whole origin block is
+ * in a discarded state (FIXME: better to tell the
+ * policy about this sooner, so it can recycle that
+ * cache block if it wants.)
+ * bio - the bio that triggered this call.
+ * result - gets filled in with the instruction.
+ *
+ * May only return 0, or -EWOULDBLOCK (if !can_migrate)
+ */
+ int (*map)(struct dm_cache_policy *p, dm_oblock_t oblock,
+ bool can_block, bool can_migrate, bool discarded_oblock,
+ struct bio *bio, struct policy_locker *locker,
+ struct policy_result *result);
+
+ /*
+ * Sometimes we want to see if a block is in the cache, without
+ * triggering any update of stats. (ie. it's not a real hit).
+ *
+ * Must not block.
+ *
+ * Returns 0 if in cache, -ENOENT if not, < 0 for other errors
+ * (-EWOULDBLOCK would be typical).
+ */
+ int (*lookup)(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock);
+
+ void (*set_dirty)(struct dm_cache_policy *p, dm_oblock_t oblock);
+ void (*clear_dirty)(struct dm_cache_policy *p, dm_oblock_t oblock);
+
+ /*
+ * Called when a cache target is first created. Used to load a
+ * mapping from the metadata device into the policy.
+ */
+ int (*load_mapping)(struct dm_cache_policy *p, dm_oblock_t oblock,
+ dm_cblock_t cblock, uint32_t hint, bool hint_valid);
+
+ int (*walk_mappings)(struct dm_cache_policy *p, policy_walk_fn fn,
+ void *context);
+
+ /*
+ * Override functions used on the error paths of the core target.
+ * They must succeed.
+ */
+ void (*remove_mapping)(struct dm_cache_policy *p, dm_oblock_t oblock);
+ void (*force_mapping)(struct dm_cache_policy *p, dm_oblock_t current_oblock,
+ dm_oblock_t new_oblock);
+
+ /*
+ * This is called via the invalidate_cblocks message. It is
+ * possible the particular cblock has already been removed due to a
+ * write io in passthrough mode. In which case this should return
+ * -ENODATA.
+ */
+ int (*remove_cblock)(struct dm_cache_policy *p, dm_cblock_t cblock);
+
+ /*
+ * Provide a dirty block to be written back by the core target.
+ *
+ * Returns:
+ *
+ * 0 and @cblock,@oblock: block to write back provided
+ *
+ * -ENODATA: no dirty blocks available
+ */
+ int (*writeback_work)(struct dm_cache_policy *p, dm_oblock_t *oblock, dm_cblock_t *cblock);
+
+ /*
+ * How full is the cache?
+ */
+ dm_cblock_t (*residency)(struct dm_cache_policy *p);
+
+ /*
+ * Because of where we sit in the block layer, we can be asked to
+ * map a lot of little bios that are all in the same block (no
+ * queue merging has occurred). To stop the policy being fooled by
+ * these the core target sends regular tick() calls to the policy.
+ * The policy should only count an entry as hit once per tick.
+ */
+ void (*tick)(struct dm_cache_policy *p);
+
+ /*
+ * Configuration.
+ */
+ int (*emit_config_values)(struct dm_cache_policy *p,
+ char *result, unsigned maxlen);
+ int (*set_config_value)(struct dm_cache_policy *p,
+ const char *key, const char *value);
+
+ /*
+ * Book keeping ptr for the policy register, not for general use.
+ */
+ void *private;
+};
+
+/*----------------------------------------------------------------*/
+
+/*
+ * We maintain a little register of the different policy types.
+ */
+#define CACHE_POLICY_NAME_SIZE 16
+#define CACHE_POLICY_VERSION_SIZE 3
+
+struct dm_cache_policy_type {
+ /* For use by the register code only. */
+ struct list_head list;
+
+ /*
+ * Policy writers should fill in these fields. The name field is
+ * what gets passed on the target line to select your policy.
+ */
+ char name[CACHE_POLICY_NAME_SIZE];
+ unsigned version[CACHE_POLICY_VERSION_SIZE];
+
+ /*
+ * For use by an alias dm_cache_policy_type to point to the
+ * real dm_cache_policy_type.
+ */
+ struct dm_cache_policy_type *real;
+
+ /*
+ * Policies may store a hint for each each cache block.
+ * Currently the size of this hint must be 0 or 4 bytes but we
+ * expect to relax this in future.
+ */
+ size_t hint_size;
+
+ struct module *owner;
+ struct dm_cache_policy *(*create)(dm_cblock_t cache_size,
+ sector_t origin_size,
+ sector_t block_size);
+};
+
+int dm_cache_policy_register(struct dm_cache_policy_type *type);
+void dm_cache_policy_unregister(struct dm_cache_policy_type *type);
+
+/*----------------------------------------------------------------*/
+
+#endif /* DM_CACHE_POLICY_H */
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
new file mode 100644
index 000000000..e049becaa
--- /dev/null
+++ b/drivers/md/dm-cache-target.c
@@ -0,0 +1,3407 @@
+/*
+ * Copyright (C) 2012 Red Hat. All rights reserved.
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm.h"
+#include "dm-bio-prison.h"
+#include "dm-bio-record.h"
+#include "dm-cache-metadata.h"
+
+#include <linux/dm-io.h>
+#include <linux/dm-kcopyd.h>
+#include <linux/jiffies.h>
+#include <linux/init.h>
+#include <linux/mempool.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+
+#define DM_MSG_PREFIX "cache"
+
+DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(cache_copy_throttle,
+ "A percentage of time allocated for copying to and/or from cache");
+
+/*----------------------------------------------------------------*/
+
+/*
+ * Glossary:
+ *
+ * oblock: index of an origin block
+ * cblock: index of a cache block
+ * promotion: movement of a block from origin to cache
+ * demotion: movement of a block from cache to origin
+ * migration: movement of a block between the origin and cache device,
+ * either direction
+ */
+
+/*----------------------------------------------------------------*/
+
+static size_t bitset_size_in_bytes(unsigned nr_entries)
+{
+ return sizeof(unsigned long) * dm_div_up(nr_entries, BITS_PER_LONG);
+}
+
+static unsigned long *alloc_bitset(unsigned nr_entries)
+{
+ size_t s = bitset_size_in_bytes(nr_entries);
+ return vzalloc(s);
+}
+
+static void clear_bitset(void *bitset, unsigned nr_entries)
+{
+ size_t s = bitset_size_in_bytes(nr_entries);
+ memset(bitset, 0, s);
+}
+
+static void free_bitset(unsigned long *bits)
+{
+ vfree(bits);
+}
+
+/*----------------------------------------------------------------*/
+
+/*
+ * There are a couple of places where we let a bio run, but want to do some
+ * work before calling its endio function. We do this by temporarily
+ * changing the endio fn.
+ */
+struct dm_hook_info {
+ bio_end_io_t *bi_end_io;
+ void *bi_private;
+};
+
+static void dm_hook_bio(struct dm_hook_info *h, struct bio *bio,
+ bio_end_io_t *bi_end_io, void *bi_private)
+{
+ h->bi_end_io = bio->bi_end_io;
+ h->bi_private = bio->bi_private;
+
+ bio->bi_end_io = bi_end_io;
+ bio->bi_private = bi_private;
+}
+
+static void dm_unhook_bio(struct dm_hook_info *h, struct bio *bio)
+{
+ bio->bi_end_io = h->bi_end_io;
+ bio->bi_private = h->bi_private;
+
+ /*
+ * Must bump bi_remaining to allow bio to complete with
+ * restored bi_end_io.
+ */
+ atomic_inc(&bio->bi_remaining);
+}
+
+/*----------------------------------------------------------------*/
+
+#define MIGRATION_POOL_SIZE 128
+#define COMMIT_PERIOD HZ
+#define MIGRATION_COUNT_WINDOW 10
+
+/*
+ * The block size of the device holding cache data must be
+ * between 32KB and 1GB.
+ */
+#define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (32 * 1024 >> SECTOR_SHIFT)
+#define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)
+
+/*
+ * FIXME: the cache is read/write for the time being.
+ */
+enum cache_metadata_mode {
+ CM_WRITE, /* metadata may be changed */
+ CM_READ_ONLY, /* metadata may not be changed */
+};
+
+enum cache_io_mode {
+ /*
+ * Data is written to cached blocks only. These blocks are marked
+ * dirty. If you lose the cache device you will lose data.
+ * Potential performance increase for both reads and writes.
+ */
+ CM_IO_WRITEBACK,
+
+ /*
+ * Data is written to both cache and origin. Blocks are never
+ * dirty. Potential performance benfit for reads only.
+ */
+ CM_IO_WRITETHROUGH,
+
+ /*
+ * A degraded mode useful for various cache coherency situations
+ * (eg, rolling back snapshots). Reads and writes always go to the
+ * origin. If a write goes to a cached oblock, then the cache
+ * block is invalidated.
+ */
+ CM_IO_PASSTHROUGH
+};
+
+struct cache_features {
+ enum cache_metadata_mode mode;
+ enum cache_io_mode io_mode;
+};
+
+struct cache_stats {
+ atomic_t read_hit;
+ atomic_t read_miss;
+ atomic_t write_hit;
+ atomic_t write_miss;
+ atomic_t demotion;
+ atomic_t promotion;
+ atomic_t copies_avoided;
+ atomic_t cache_cell_clash;
+ atomic_t commit_count;
+ atomic_t discard_count;
+};
+
+/*
+ * Defines a range of cblocks, begin to (end - 1) are in the range. end is
+ * the one-past-the-end value.
+ */
+struct cblock_range {
+ dm_cblock_t begin;
+ dm_cblock_t end;
+};
+
+struct invalidation_request {
+ struct list_head list;
+ struct cblock_range *cblocks;
+
+ atomic_t complete;
+ int err;
+
+ wait_queue_head_t result_wait;
+};
+
+struct cache {
+ struct dm_target *ti;
+ struct dm_target_callbacks callbacks;
+
+ struct dm_cache_metadata *cmd;
+
+ /*
+ * Metadata is written to this device.
+ */
+ struct dm_dev *metadata_dev;
+
+ /*
+ * The slower of the two data devices. Typically a spindle.
+ */
+ struct dm_dev *origin_dev;
+
+ /*
+ * The faster of the two data devices. Typically an SSD.
+ */
+ struct dm_dev *cache_dev;
+
+ /*
+ * Size of the origin device in _complete_ blocks and native sectors.
+ */
+ dm_oblock_t origin_blocks;
+ sector_t origin_sectors;
+
+ /*
+ * Size of the cache device in blocks.
+ */
+ dm_cblock_t cache_size;
+
+ /*
+ * Fields for converting from sectors to blocks.
+ */
+ uint32_t sectors_per_block;
+ int sectors_per_block_shift;
+
+ spinlock_t lock;
+ struct bio_list deferred_bios;
+ struct bio_list deferred_flush_bios;
+ struct bio_list deferred_writethrough_bios;
+ struct list_head quiesced_migrations;
+ struct list_head completed_migrations;
+ struct list_head need_commit_migrations;
+ sector_t migration_threshold;
+ wait_queue_head_t migration_wait;
+ atomic_t nr_allocated_migrations;
+
+ /*
+ * The number of in flight migrations that are performing
+ * background io. eg, promotion, writeback.
+ */
+ atomic_t nr_io_migrations;
+
+ wait_queue_head_t quiescing_wait;
+ atomic_t quiescing;
+ atomic_t quiescing_ack;
+
+ /*
+ * cache_size entries, dirty if set
+ */
+ atomic_t nr_dirty;
+ unsigned long *dirty_bitset;
+
+ /*
+ * origin_blocks entries, discarded if set.
+ */
+ dm_dblock_t discard_nr_blocks;
+ unsigned long *discard_bitset;
+ uint32_t discard_block_size; /* a power of 2 times sectors per block */
+
+ /*
+ * Rather than reconstructing the table line for the status we just
+ * save it and regurgitate.
+ */
+ unsigned nr_ctr_args;
+ const char **ctr_args;
+
+ struct dm_kcopyd_client *copier;
+ struct workqueue_struct *wq;
+ struct work_struct worker;
+
+ struct delayed_work waker;
+ unsigned long last_commit_jiffies;
+
+ struct dm_bio_prison *prison;
+ struct dm_deferred_set *all_io_ds;
+
+ mempool_t *migration_pool;
+
+ struct dm_cache_policy *policy;
+ unsigned policy_nr_args;
+
+ bool need_tick_bio:1;
+ bool sized:1;
+ bool invalidate:1;
+ bool commit_requested:1;
+ bool loaded_mappings:1;
+ bool loaded_discards:1;
+
+ /*
+ * Cache features such as write-through.
+ */
+ struct cache_features features;
+
+ struct cache_stats stats;
+
+ /*
+ * Invalidation fields.
+ */
+ spinlock_t invalidation_lock;
+ struct list_head invalidation_requests;
+};
+
+struct per_bio_data {
+ bool tick:1;
+ unsigned req_nr:2;
+ struct dm_deferred_entry *all_io_entry;
+ struct dm_hook_info hook_info;
+
+ /*
+ * writethrough fields. These MUST remain at the end of this
+ * structure and the 'cache' member must be the first as it
+ * is used to determine the offset of the writethrough fields.
+ */
+ struct cache *cache;
+ dm_cblock_t cblock;
+ struct dm_bio_details bio_details;
+};
+
+struct dm_cache_migration {
+ struct list_head list;
+ struct cache *cache;
+
+ unsigned long start_jiffies;
+ dm_oblock_t old_oblock;
+ dm_oblock_t new_oblock;
+ dm_cblock_t cblock;
+
+ bool err:1;
+ bool discard:1;
+ bool writeback:1;
+ bool demote:1;
+ bool promote:1;
+ bool requeue_holder:1;
+ bool invalidate:1;
+
+ struct dm_bio_prison_cell *old_ocell;
+ struct dm_bio_prison_cell *new_ocell;
+};
+
+/*
+ * Processing a bio in the worker thread may require these memory
+ * allocations. We prealloc to avoid deadlocks (the same worker thread
+ * frees them back to the mempool).
+ */
+struct prealloc {
+ struct dm_cache_migration *mg;
+ struct dm_bio_prison_cell *cell1;
+ struct dm_bio_prison_cell *cell2;
+};
+
+static void wake_worker(struct cache *cache)
+{
+ queue_work(cache->wq, &cache->worker);
+}
+
+/*----------------------------------------------------------------*/
+
+static struct dm_bio_prison_cell *alloc_prison_cell(struct cache *cache)
+{
+ /* FIXME: change to use a local slab. */
+ return dm_bio_prison_alloc_cell(cache->prison, GFP_NOWAIT);
+}
+
+static void free_prison_cell(struct cache *cache, struct dm_bio_prison_cell *cell)
+{
+ dm_bio_prison_free_cell(cache->prison, cell);
+}
+
+static struct dm_cache_migration *alloc_migration(struct cache *cache)
+{
+ struct dm_cache_migration *mg;
+
+ mg = mempool_alloc(cache->migration_pool, GFP_NOWAIT);
+ if (mg) {
+ mg->cache = cache;
+ atomic_inc(&mg->cache->nr_allocated_migrations);
+ }
+
+ return mg;
+}
+
+static void free_migration(struct dm_cache_migration *mg)
+{
+ if (atomic_dec_and_test(&mg->cache->nr_allocated_migrations))
+ wake_up(&mg->cache->migration_wait);
+
+ mempool_free(mg, mg->cache->migration_pool);
+}
+
+static int prealloc_data_structs(struct cache *cache, struct prealloc *p)
+{
+ if (!p->mg) {
+ p->mg = alloc_migration(cache);
+ if (!p->mg)
+ return -ENOMEM;
+ }
+
+ if (!p->cell1) {
+ p->cell1 = alloc_prison_cell(cache);
+ if (!p->cell1)
+ return -ENOMEM;
+ }
+
+ if (!p->cell2) {
+ p->cell2 = alloc_prison_cell(cache);
+ if (!p->cell2)
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static void prealloc_free_structs(struct cache *cache, struct prealloc *p)
+{
+ if (p->cell2)
+ free_prison_cell(cache, p->cell2);
+
+ if (p->cell1)
+ free_prison_cell(cache, p->cell1);
+
+ if (p->mg)
+ free_migration(p->mg);
+}
+
+static struct dm_cache_migration *prealloc_get_migration(struct prealloc *p)
+{
+ struct dm_cache_migration *mg = p->mg;
+
+ BUG_ON(!mg);
+ p->mg = NULL;
+
+ return mg;
+}
+
+/*
+ * You must have a cell within the prealloc struct to return. If not this
+ * function will BUG() rather than returning NULL.
+ */
+static struct dm_bio_prison_cell *prealloc_get_cell(struct prealloc *p)
+{
+ struct dm_bio_prison_cell *r = NULL;
+
+ if (p->cell1) {
+ r = p->cell1;
+ p->cell1 = NULL;
+
+ } else if (p->cell2) {
+ r = p->cell2;
+ p->cell2 = NULL;
+ } else
+ BUG();
+
+ return r;
+}
+
+/*
+ * You can't have more than two cells in a prealloc struct. BUG() will be
+ * called if you try and overfill.
+ */
+static void prealloc_put_cell(struct prealloc *p, struct dm_bio_prison_cell *cell)
+{
+ if (!p->cell2)
+ p->cell2 = cell;
+
+ else if (!p->cell1)
+ p->cell1 = cell;
+
+ else
+ BUG();
+}
+
+/*----------------------------------------------------------------*/
+
+static void build_key(dm_oblock_t begin, dm_oblock_t end, struct dm_cell_key *key)
+{
+ key->virtual = 0;
+ key->dev = 0;
+ key->block_begin = from_oblock(begin);
+ key->block_end = from_oblock(end);
+}
+
+/*
+ * The caller hands in a preallocated cell, and a free function for it.
+ * The cell will be freed if there's an error, or if it wasn't used because
+ * a cell with that key already exists.
+ */
+typedef void (*cell_free_fn)(void *context, struct dm_bio_prison_cell *cell);
+
+static int bio_detain_range(struct cache *cache, dm_oblock_t oblock_begin, dm_oblock_t oblock_end,
+ struct bio *bio, struct dm_bio_prison_cell *cell_prealloc,
+ cell_free_fn free_fn, void *free_context,
+ struct dm_bio_prison_cell **cell_result)
+{
+ int r;
+ struct dm_cell_key key;
+
+ build_key(oblock_begin, oblock_end, &key);
+ r = dm_bio_detain(cache->prison, &key, bio, cell_prealloc, cell_result);
+ if (r)
+ free_fn(free_context, cell_prealloc);
+
+ return r;
+}
+
+static int bio_detain(struct cache *cache, dm_oblock_t oblock,
+ struct bio *bio, struct dm_bio_prison_cell *cell_prealloc,
+ cell_free_fn free_fn, void *free_context,
+ struct dm_bio_prison_cell **cell_result)
+{
+ dm_oblock_t end = to_oblock(from_oblock(oblock) + 1ULL);
+ return bio_detain_range(cache, oblock, end, bio,
+ cell_prealloc, free_fn, free_context, cell_result);
+}
+
+static int get_cell(struct cache *cache,
+ dm_oblock_t oblock,
+ struct prealloc *structs,
+ struct dm_bio_prison_cell **cell_result)
+{
+ int r;
+ struct dm_cell_key key;
+ struct dm_bio_prison_cell *cell_prealloc;
+
+ cell_prealloc = prealloc_get_cell(structs);
+
+ build_key(oblock, to_oblock(from_oblock(oblock) + 1ULL), &key);
+ r = dm_get_cell(cache->prison, &key, cell_prealloc, cell_result);
+ if (r)
+ prealloc_put_cell(structs, cell_prealloc);
+
+ return r;
+}
+
+/*----------------------------------------------------------------*/
+
+static bool is_dirty(struct cache *cache, dm_cblock_t b)
+{
+ return test_bit(from_cblock(b), cache->dirty_bitset);
+}
+
+static void set_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock)
+{
+ if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) {
+ atomic_inc(&cache->nr_dirty);
+ policy_set_dirty(cache->policy, oblock);
+ }
+}
+
+static void clear_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock)
+{
+ if (test_and_clear_bit(from_cblock(cblock), cache->dirty_bitset)) {
+ policy_clear_dirty(cache->policy, oblock);
+ if (atomic_dec_return(&cache->nr_dirty) == 0)
+ dm_table_event(cache->ti->table);
+ }
+}
+
+/*----------------------------------------------------------------*/
+
+static bool block_size_is_power_of_two(struct cache *cache)
+{
+ return cache->sectors_per_block_shift >= 0;
+}
+
+/* gcc on ARM generates spurious references to __udivdi3 and __umoddi3 */
+#if defined(CONFIG_ARM) && __GNUC__ == 4 && __GNUC_MINOR__ <= 6
+__always_inline
+#endif
+static dm_block_t block_div(dm_block_t b, uint32_t n)
+{
+ do_div(b, n);
+
+ return b;
+}
+
+static dm_block_t oblocks_per_dblock(struct cache *cache)
+{
+ dm_block_t oblocks = cache->discard_block_size;
+
+ if (block_size_is_power_of_two(cache))
+ oblocks >>= cache->sectors_per_block_shift;
+ else
+ oblocks = block_div(oblocks, cache->sectors_per_block);
+
+ return oblocks;
+}
+
+static dm_dblock_t oblock_to_dblock(struct cache *cache, dm_oblock_t oblock)
+{
+ return to_dblock(block_div(from_oblock(oblock),
+ oblocks_per_dblock(cache)));
+}
+
+static dm_oblock_t dblock_to_oblock(struct cache *cache, dm_dblock_t dblock)
+{
+ return to_oblock(from_dblock(dblock) * oblocks_per_dblock(cache));
+}
+
+static void set_discard(struct cache *cache, dm_dblock_t b)
+{
+ unsigned long flags;
+
+ BUG_ON(from_dblock(b) >= from_dblock(cache->discard_nr_blocks));
+ atomic_inc(&cache->stats.discard_count);
+
+ spin_lock_irqsave(&cache->lock, flags);
+ set_bit(from_dblock(b), cache->discard_bitset);
+ spin_unlock_irqrestore(&cache->lock, flags);
+}
+
+static void clear_discard(struct cache *cache, dm_dblock_t b)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&cache->lock, flags);
+ clear_bit(from_dblock(b), cache->discard_bitset);
+ spin_unlock_irqrestore(&cache->lock, flags);
+}
+
+static bool is_discarded(struct cache *cache, dm_dblock_t b)
+{
+ int r;
+ unsigned long flags;
+
+ spin_lock_irqsave(&cache->lock, flags);
+ r = test_bit(from_dblock(b), cache->discard_bitset);
+ spin_unlock_irqrestore(&cache->lock, flags);
+
+ return r;
+}
+
+static bool is_discarded_oblock(struct cache *cache, dm_oblock_t b)
+{
+ int r;
+ unsigned long flags;
+
+ spin_lock_irqsave(&cache->lock, flags);
+ r = test_bit(from_dblock(oblock_to_dblock(cache, b)),
+ cache->discard_bitset);
+ spin_unlock_irqrestore(&cache->lock, flags);
+
+ return r;
+}
+
+/*----------------------------------------------------------------*/
+
+static void load_stats(struct cache *cache)
+{
+ struct dm_cache_statistics stats;
+
+ dm_cache_metadata_get_stats(cache->cmd, &stats);
+ atomic_set(&cache->stats.read_hit, stats.read_hits);
+ atomic_set(&cache->stats.read_miss, stats.read_misses);
+ atomic_set(&cache->stats.write_hit, stats.write_hits);
+ atomic_set(&cache->stats.write_miss, stats.write_misses);
+}
+
+static void save_stats(struct cache *cache)
+{
+ struct dm_cache_statistics stats;
+
+ stats.read_hits = atomic_read(&cache->stats.read_hit);
+ stats.read_misses = atomic_read(&cache->stats.read_miss);
+ stats.write_hits = atomic_read(&cache->stats.write_hit);
+ stats.write_misses = atomic_read(&cache->stats.write_miss);
+
+ dm_cache_metadata_set_stats(cache->cmd, &stats);
+}
+
+/*----------------------------------------------------------------
+ * Per bio data
+ *--------------------------------------------------------------*/
+
+/*
+ * If using writeback, leave out struct per_bio_data's writethrough fields.
+ */
+#define PB_DATA_SIZE_WB (offsetof(struct per_bio_data, cache))
+#define PB_DATA_SIZE_WT (sizeof(struct per_bio_data))
+
+static bool writethrough_mode(struct cache_features *f)
+{
+ return f->io_mode == CM_IO_WRITETHROUGH;
+}
+
+static bool writeback_mode(struct cache_features *f)
+{
+ return f->io_mode == CM_IO_WRITEBACK;
+}
+
+static bool passthrough_mode(struct cache_features *f)
+{
+ return f->io_mode == CM_IO_PASSTHROUGH;
+}
+
+static size_t get_per_bio_data_size(struct cache *cache)
+{
+ return writethrough_mode(&cache->features) ? PB_DATA_SIZE_WT : PB_DATA_SIZE_WB;
+}
+
+static struct per_bio_data *get_per_bio_data(struct bio *bio, size_t data_size)
+{
+ struct per_bio_data *pb = dm_per_bio_data(bio, data_size);
+ BUG_ON(!pb);
+ return pb;
+}
+
+static struct per_bio_data *init_per_bio_data(struct bio *bio, size_t data_size)
+{
+ struct per_bio_data *pb = get_per_bio_data(bio, data_size);
+
+ pb->tick = false;
+ pb->req_nr = dm_bio_get_target_bio_nr(bio);
+ pb->all_io_entry = NULL;
+
+ return pb;
+}
+
+/*----------------------------------------------------------------
+ * Remapping
+ *--------------------------------------------------------------*/
+static void remap_to_origin(struct cache *cache, struct bio *bio)
+{
+ bio->bi_bdev = cache->origin_dev->bdev;
+}
+
+static void remap_to_cache(struct cache *cache, struct bio *bio,
+ dm_cblock_t cblock)
+{
+ sector_t bi_sector = bio->bi_iter.bi_sector;
+ sector_t block = from_cblock(cblock);
+
+ bio->bi_bdev = cache->cache_dev->bdev;
+ if (!block_size_is_power_of_two(cache))
+ bio->bi_iter.bi_sector =
+ (block * cache->sectors_per_block) +
+ sector_div(bi_sector, cache->sectors_per_block);
+ else
+ bio->bi_iter.bi_sector =
+ (block << cache->sectors_per_block_shift) |
+ (bi_sector & (cache->sectors_per_block - 1));
+}
+
+static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio)
+{
+ unsigned long flags;
+ size_t pb_data_size = get_per_bio_data_size(cache);
+ struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
+
+ spin_lock_irqsave(&cache->lock, flags);
+ if (cache->need_tick_bio &&
+ !(bio->bi_rw & (REQ_FUA | REQ_FLUSH | REQ_DISCARD))) {
+ pb->tick = true;
+ cache->need_tick_bio = false;
+ }
+ spin_unlock_irqrestore(&cache->lock, flags);
+}
+
+static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio,
+ dm_oblock_t oblock)
+{
+ check_if_tick_bio_needed(cache, bio);
+ remap_to_origin(cache, bio);
+ if (bio_data_dir(bio) == WRITE)
+ clear_discard(cache, oblock_to_dblock(cache, oblock));
+}
+
+static void remap_to_cache_dirty(struct cache *cache, struct bio *bio,
+ dm_oblock_t oblock, dm_cblock_t cblock)
+{
+ check_if_tick_bio_needed(cache, bio);
+ remap_to_cache(cache, bio, cblock);
+ if (bio_data_dir(bio) == WRITE) {
+ set_dirty(cache, oblock, cblock);
+ clear_discard(cache, oblock_to_dblock(cache, oblock));
+ }
+}
+
+static dm_oblock_t get_bio_block(struct cache *cache, struct bio *bio)
+{
+ sector_t block_nr = bio->bi_iter.bi_sector;
+
+ if (!block_size_is_power_of_two(cache))
+ (void) sector_div(block_nr, cache->sectors_per_block);
+ else
+ block_nr >>= cache->sectors_per_block_shift;
+
+ return to_oblock(block_nr);
+}
+
+static int bio_triggers_commit(struct cache *cache, struct bio *bio)
+{
+ return bio->bi_rw & (REQ_FLUSH | REQ_FUA);
+}
+
+/*
+ * You must increment the deferred set whilst the prison cell is held. To
+ * encourage this, we ask for 'cell' to be passed in.
+ */
+static void inc_ds(struct cache *cache, struct bio *bio,
+ struct dm_bio_prison_cell *cell)
+{
+ size_t pb_data_size = get_per_bio_data_size(cache);
+ struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
+
+ BUG_ON(!cell);
+ BUG_ON(pb->all_io_entry);
+
+ pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
+}
+
+static void issue(struct cache *cache, struct bio *bio)
+{
+ unsigned long flags;
+
+ if (!bio_triggers_commit(cache, bio)) {
+ generic_make_request(bio);
+ return;
+ }
+
+ /*
+ * Batch together any bios that trigger commits and then issue a
+ * single commit for them in do_worker().
+ */
+ spin_lock_irqsave(&cache->lock, flags);
+ cache->commit_requested = true;
+ bio_list_add(&cache->deferred_flush_bios, bio);
+ spin_unlock_irqrestore(&cache->lock, flags);
+}
+
+static void inc_and_issue(struct cache *cache, struct bio *bio, struct dm_bio_prison_cell *cell)
+{
+ inc_ds(cache, bio, cell);
+ issue(cache, bio);
+}
+
+static void defer_writethrough_bio(struct cache *cache, struct bio *bio)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&cache->lock, flags);
+ bio_list_add(&cache->deferred_writethrough_bios, bio);
+ spin_unlock_irqrestore(&cache->lock, flags);
+
+ wake_worker(cache);
+}
+
+static void writethrough_endio(struct bio *bio, int err)
+{
+ struct per_bio_data *pb = get_per_bio_data(bio, PB_DATA_SIZE_WT);
+
+ dm_unhook_bio(&pb->hook_info, bio);
+
+ if (err) {
+ bio_endio(bio, err);
+ return;
+ }
+
+ dm_bio_restore(&pb->bio_details, bio);
+ remap_to_cache(pb->cache, bio, pb->cblock);
+
+ /*
+ * We can't issue this bio directly, since we're in interrupt
+ * context. So it gets put on a bio list for processing by the
+ * worker thread.
+ */
+ defer_writethrough_bio(pb->cache, bio);
+}
+
+/*
+ * When running in writethrough mode we need to send writes to clean blocks
+ * to both the cache and origin devices. In future we'd like to clone the
+ * bio and send them in parallel, but for now we're doing them in
+ * series as this is easier.
+ */
+static void remap_to_origin_then_cache(struct cache *cache, struct bio *bio,
+ dm_oblock_t oblock, dm_cblock_t cblock)
+{
+ struct per_bio_data *pb = get_per_bio_data(bio, PB_DATA_SIZE_WT);
+
+ pb->cache = cache;
+ pb->cblock = cblock;
+ dm_hook_bio(&pb->hook_info, bio, writethrough_endio, NULL);
+ dm_bio_record(&pb->bio_details, bio);
+
+ remap_to_origin_clear_discard(pb->cache, bio, oblock);
+}
+
+/*----------------------------------------------------------------
+ * Migration processing
+ *
+ * Migration covers moving data from the origin device to the cache, or
+ * vice versa.
+ *--------------------------------------------------------------*/
+static void inc_io_migrations(struct cache *cache)
+{
+ atomic_inc(&cache->nr_io_migrations);
+}
+
+static void dec_io_migrations(struct cache *cache)
+{
+ atomic_dec(&cache->nr_io_migrations);
+}
+
+static void __cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell,
+ bool holder)
+{
+ (holder ? dm_cell_release : dm_cell_release_no_holder)
+ (cache->prison, cell, &cache->deferred_bios);
+ free_prison_cell(cache, cell);
+}
+
+static void cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell,
+ bool holder)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&cache->lock, flags);
+ __cell_defer(cache, cell, holder);
+ spin_unlock_irqrestore(&cache->lock, flags);
+
+ wake_worker(cache);
+}
+
+static void free_io_migration(struct dm_cache_migration *mg)
+{
+ dec_io_migrations(mg->cache);
+ free_migration(mg);
+}
+
+static void migration_failure(struct dm_cache_migration *mg)
+{
+ struct cache *cache = mg->cache;
+
+ if (mg->writeback) {
+ DMWARN_LIMIT("writeback failed; couldn't copy block");
+ set_dirty(cache, mg->old_oblock, mg->cblock);
+ cell_defer(cache, mg->old_ocell, false);
+
+ } else if (mg->demote) {
+ DMWARN_LIMIT("demotion failed; couldn't copy block");
+ policy_force_mapping(cache->policy, mg->new_oblock, mg->old_oblock);
+
+ cell_defer(cache, mg->old_ocell, mg->promote ? false : true);
+ if (mg->promote)
+ cell_defer(cache, mg->new_ocell, true);
+ } else {
+ DMWARN_LIMIT("promotion failed; couldn't copy block");
+ policy_remove_mapping(cache->policy, mg->new_oblock);
+ cell_defer(cache, mg->new_ocell, true);
+ }
+
+ free_io_migration(mg);
+}
+
+static void migration_success_pre_commit(struct dm_cache_migration *mg)
+{
+ unsigned long flags;
+ struct cache *cache = mg->cache;
+
+ if (mg->writeback) {
+ clear_dirty(cache, mg->old_oblock, mg->cblock);
+ cell_defer(cache, mg->old_ocell, false);
+ free_io_migration(mg);
+ return;
+
+ } else if (mg->demote) {
+ if (dm_cache_remove_mapping(cache->cmd, mg->cblock)) {
+ DMWARN_LIMIT("demotion failed; couldn't update on disk metadata");
+ policy_force_mapping(cache->policy, mg->new_oblock,
+ mg->old_oblock);
+ if (mg->promote)
+ cell_defer(cache, mg->new_ocell, true);
+ free_io_migration(mg);
+ return;
+ }
+ } else {
+ if (dm_cache_insert_mapping(cache->cmd, mg->cblock, mg->new_oblock)) {
+ DMWARN_LIMIT("promotion failed; couldn't update on disk metadata");
+ policy_remove_mapping(cache->policy, mg->new_oblock);
+ free_io_migration(mg);
+ return;
+ }
+ }
+
+ spin_lock_irqsave(&cache->lock, flags);
+ list_add_tail(&mg->list, &cache->need_commit_migrations);
+ cache->commit_requested = true;
+ spin_unlock_irqrestore(&cache->lock, flags);
+}
+
+static void migration_success_post_commit(struct dm_cache_migration *mg)
+{
+ unsigned long flags;
+ struct cache *cache = mg->cache;
+
+ if (mg->writeback) {
+ DMWARN("writeback unexpectedly triggered commit");
+ return;
+
+ } else if (mg->demote) {
+ cell_defer(cache, mg->old_ocell, mg->promote ? false : true);
+
+ if (mg->promote) {
+ mg->demote = false;
+
+ spin_lock_irqsave(&cache->lock, flags);
+ list_add_tail(&mg->list, &cache->quiesced_migrations);
+ spin_unlock_irqrestore(&cache->lock, flags);
+
+ } else {
+ if (mg->invalidate)
+ policy_remove_mapping(cache->policy, mg->old_oblock);
+ free_io_migration(mg);
+ }
+
+ } else {
+ if (mg->requeue_holder) {
+ clear_dirty(cache, mg->new_oblock, mg->cblock);
+ cell_defer(cache, mg->new_ocell, true);
+ } else {
+ /*
+ * The block was promoted via an overwrite, so it's dirty.
+ */
+ set_dirty(cache, mg->new_oblock, mg->cblock);
+ bio_endio(mg->new_ocell->holder, 0);
+ cell_defer(cache, mg->new_ocell, false);
+ }
+ free_io_migration(mg);
+ }
+}
+
+static void copy_complete(int read_err, unsigned long write_err, void *context)
+{
+ unsigned long flags;
+ struct dm_cache_migration *mg = (struct dm_cache_migration *) context;
+ struct cache *cache = mg->cache;
+
+ if (read_err || write_err)
+ mg->err = true;
+
+ spin_lock_irqsave(&cache->lock, flags);
+ list_add_tail(&mg->list, &cache->completed_migrations);
+ spin_unlock_irqrestore(&cache->lock, flags);
+
+ wake_worker(cache);
+}
+
+static void issue_copy(struct dm_cache_migration *mg)
+{
+ int r;
+ struct dm_io_region o_region, c_region;
+ struct cache *cache = mg->cache;
+ sector_t cblock = from_cblock(mg->cblock);
+
+ o_region.bdev = cache->origin_dev->bdev;
+ o_region.count = cache->sectors_per_block;
+
+ c_region.bdev = cache->cache_dev->bdev;
+ c_region.sector = cblock * cache->sectors_per_block;
+ c_region.count = cache->sectors_per_block;
+
+ if (mg->writeback || mg->demote) {
+ /* demote */
+ o_region.sector = from_oblock(mg->old_oblock) * cache->sectors_per_block;
+ r = dm_kcopyd_copy(cache->copier, &c_region, 1, &o_region, 0, copy_complete, mg);
+ } else {
+ /* promote */
+ o_region.sector = from_oblock(mg->new_oblock) * cache->sectors_per_block;
+ r = dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, mg);
+ }
+
+ if (r < 0) {
+ DMERR_LIMIT("issuing migration failed");
+ migration_failure(mg);
+ }
+}
+
+static void overwrite_endio(struct bio *bio, int err)
+{
+ struct dm_cache_migration *mg = bio->bi_private;
+ struct cache *cache = mg->cache;
+ size_t pb_data_size = get_per_bio_data_size(cache);
+ struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
+ unsigned long flags;
+
+ dm_unhook_bio(&pb->hook_info, bio);
+
+ if (err)
+ mg->err = true;
+
+ mg->requeue_holder = false;
+
+ spin_lock_irqsave(&cache->lock, flags);
+ list_add_tail(&mg->list, &cache->completed_migrations);
+ spin_unlock_irqrestore(&cache->lock, flags);
+
+ wake_worker(cache);
+}
+
+static void issue_overwrite(struct dm_cache_migration *mg, struct bio *bio)
+{
+ size_t pb_data_size = get_per_bio_data_size(mg->cache);
+ struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
+
+ dm_hook_bio(&pb->hook_info, bio, overwrite_endio, mg);
+ remap_to_cache_dirty(mg->cache, bio, mg->new_oblock, mg->cblock);
+
+ /*
+ * No need to inc_ds() here, since the cell will be held for the
+ * duration of the io.
+ */
+ generic_make_request(bio);
+}
+
+static bool bio_writes_complete_block(struct cache *cache, struct bio *bio)
+{
+ return (bio_data_dir(bio) == WRITE) &&
+ (bio->bi_iter.bi_size == (cache->sectors_per_block << SECTOR_SHIFT));
+}
+
+static void avoid_copy(struct dm_cache_migration *mg)
+{
+ atomic_inc(&mg->cache->stats.copies_avoided);
+ migration_success_pre_commit(mg);
+}
+
+static void calc_discard_block_range(struct cache *cache, struct bio *bio,
+ dm_dblock_t *b, dm_dblock_t *e)
+{
+ sector_t sb = bio->bi_iter.bi_sector;
+ sector_t se = bio_end_sector(bio);
+
+ *b = to_dblock(dm_sector_div_up(sb, cache->discard_block_size));
+
+ if (se - sb < cache->discard_block_size)
+ *e = *b;
+ else
+ *e = to_dblock(block_div(se, cache->discard_block_size));
+}
+
+static void issue_discard(struct dm_cache_migration *mg)
+{
+ dm_dblock_t b, e;
+ struct bio *bio = mg->new_ocell->holder;
+
+ calc_discard_block_range(mg->cache, bio, &b, &e);
+ while (b != e) {
+ set_discard(mg->cache, b);
+ b = to_dblock(from_dblock(b) + 1);
+ }
+
+ bio_endio(bio, 0);
+ cell_defer(mg->cache, mg->new_ocell, false);
+ free_migration(mg);
+}
+
+static void issue_copy_or_discard(struct dm_cache_migration *mg)
+{
+ bool avoid;
+ struct cache *cache = mg->cache;
+
+ if (mg->discard) {
+ issue_discard(mg);
+ return;
+ }
+
+ if (mg->writeback || mg->demote)
+ avoid = !is_dirty(cache, mg->cblock) ||
+ is_discarded_oblock(cache, mg->old_oblock);
+ else {
+ struct bio *bio = mg->new_ocell->holder;
+
+ avoid = is_discarded_oblock(cache, mg->new_oblock);
+
+ if (writeback_mode(&cache->features) &&
+ !avoid && bio_writes_complete_block(cache, bio)) {
+ issue_overwrite(mg, bio);
+ return;
+ }
+ }
+
+ avoid ? avoid_copy(mg) : issue_copy(mg);
+}
+
+static void complete_migration(struct dm_cache_migration *mg)
+{
+ if (mg->err)
+ migration_failure(mg);
+ else
+ migration_success_pre_commit(mg);
+}
+
+static void process_migrations(struct cache *cache, struct list_head *head,
+ void (*fn)(struct dm_cache_migration *))
+{
+ unsigned long flags;
+ struct list_head list;
+ struct dm_cache_migration *mg, *tmp;
+
+ INIT_LIST_HEAD(&list);
+ spin_lock_irqsave(&cache->lock, flags);
+ list_splice_init(head, &list);
+ spin_unlock_irqrestore(&cache->lock, flags);
+
+ list_for_each_entry_safe(mg, tmp, &list, list)
+ fn(mg);
+}
+
+static void __queue_quiesced_migration(struct dm_cache_migration *mg)
+{
+ list_add_tail(&mg->list, &mg->cache->quiesced_migrations);
+}
+
+static void queue_quiesced_migration(struct dm_cache_migration *mg)
+{
+ unsigned long flags;
+ struct cache *cache = mg->cache;
+
+ spin_lock_irqsave(&cache->lock, flags);
+ __queue_quiesced_migration(mg);
+ spin_unlock_irqrestore(&cache->lock, flags);
+
+ wake_worker(cache);
+}
+
+static void queue_quiesced_migrations(struct cache *cache, struct list_head *work)
+{
+ unsigned long flags;
+ struct dm_cache_migration *mg, *tmp;
+
+ spin_lock_irqsave(&cache->lock, flags);
+ list_for_each_entry_safe(mg, tmp, work, list)
+ __queue_quiesced_migration(mg);
+ spin_unlock_irqrestore(&cache->lock, flags);
+
+ wake_worker(cache);
+}
+
+static void check_for_quiesced_migrations(struct cache *cache,
+ struct per_bio_data *pb)
+{
+ struct list_head work;
+
+ if (!pb->all_io_entry)
+ return;
+
+ INIT_LIST_HEAD(&work);
+ dm_deferred_entry_dec(pb->all_io_entry, &work);
+
+ if (!list_empty(&work))
+ queue_quiesced_migrations(cache, &work);
+}
+
+static void quiesce_migration(struct dm_cache_migration *mg)
+{
+ if (!dm_deferred_set_add_work(mg->cache->all_io_ds, &mg->list))
+ queue_quiesced_migration(mg);
+}
+
+static void promote(struct cache *cache, struct prealloc *structs,
+ dm_oblock_t oblock, dm_cblock_t cblock,
+ struct dm_bio_prison_cell *cell)
+{
+ struct dm_cache_migration *mg = prealloc_get_migration(structs);
+
+ mg->err = false;
+ mg->discard = false;
+ mg->writeback = false;
+ mg->demote = false;
+ mg->promote = true;
+ mg->requeue_holder = true;
+ mg->invalidate = false;
+ mg->cache = cache;
+ mg->new_oblock = oblock;
+ mg->cblock = cblock;
+ mg->old_ocell = NULL;
+ mg->new_ocell = cell;
+ mg->start_jiffies = jiffies;
+
+ inc_io_migrations(cache);
+ quiesce_migration(mg);
+}
+
+static void writeback(struct cache *cache, struct prealloc *structs,
+ dm_oblock_t oblock, dm_cblock_t cblock,
+ struct dm_bio_prison_cell *cell)
+{
+ struct dm_cache_migration *mg = prealloc_get_migration(structs);
+
+ mg->err = false;
+ mg->discard = false;
+ mg->writeback = true;
+ mg->demote = false;
+ mg->promote = false;
+ mg->requeue_holder = true;
+ mg->invalidate = false;
+ mg->cache = cache;
+ mg->old_oblock = oblock;
+ mg->cblock = cblock;
+ mg->old_ocell = cell;
+ mg->new_ocell = NULL;
+ mg->start_jiffies = jiffies;
+
+ inc_io_migrations(cache);
+ quiesce_migration(mg);
+}
+
+static void demote_then_promote(struct cache *cache, struct prealloc *structs,
+ dm_oblock_t old_oblock, dm_oblock_t new_oblock,
+ dm_cblock_t cblock,
+ struct dm_bio_prison_cell *old_ocell,
+ struct dm_bio_prison_cell *new_ocell)
+{
+ struct dm_cache_migration *mg = prealloc_get_migration(structs);
+
+ mg->err = false;
+ mg->discard = false;
+ mg->writeback = false;
+ mg->demote = true;
+ mg->promote = true;
+ mg->requeue_holder = true;
+ mg->invalidate = false;
+ mg->cache = cache;
+ mg->old_oblock = old_oblock;
+ mg->new_oblock = new_oblock;
+ mg->cblock = cblock;
+ mg->old_ocell = old_ocell;
+ mg->new_ocell = new_ocell;
+ mg->start_jiffies = jiffies;
+
+ inc_io_migrations(cache);
+ quiesce_migration(mg);
+}
+
+/*
+ * Invalidate a cache entry. No writeback occurs; any changes in the cache
+ * block are thrown away.
+ */
+static void invalidate(struct cache *cache, struct prealloc *structs,
+ dm_oblock_t oblock, dm_cblock_t cblock,
+ struct dm_bio_prison_cell *cell)
+{
+ struct dm_cache_migration *mg = prealloc_get_migration(structs);
+
+ mg->err = false;
+ mg->discard = false;
+ mg->writeback = false;
+ mg->demote = true;
+ mg->promote = false;
+ mg->requeue_holder = true;
+ mg->invalidate = true;
+ mg->cache = cache;
+ mg->old_oblock = oblock;
+ mg->cblock = cblock;
+ mg->old_ocell = cell;
+ mg->new_ocell = NULL;
+ mg->start_jiffies = jiffies;
+
+ inc_io_migrations(cache);
+ quiesce_migration(mg);
+}
+
+static void discard(struct cache *cache, struct prealloc *structs,
+ struct dm_bio_prison_cell *cell)
+{
+ struct dm_cache_migration *mg = prealloc_get_migration(structs);
+
+ mg->err = false;
+ mg->discard = true;
+ mg->writeback = false;
+ mg->demote = false;
+ mg->promote = false;
+ mg->requeue_holder = false;
+ mg->invalidate = false;
+ mg->cache = cache;
+ mg->old_ocell = NULL;
+ mg->new_ocell = cell;
+ mg->start_jiffies = jiffies;
+
+ quiesce_migration(mg);
+}
+
+/*----------------------------------------------------------------
+ * bio processing
+ *--------------------------------------------------------------*/
+static void defer_bio(struct cache *cache, struct bio *bio)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&cache->lock, flags);
+ bio_list_add(&cache->deferred_bios, bio);
+ spin_unlock_irqrestore(&cache->lock, flags);
+
+ wake_worker(cache);
+}
+
+static void process_flush_bio(struct cache *cache, struct bio *bio)
+{
+ size_t pb_data_size = get_per_bio_data_size(cache);
+ struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
+
+ BUG_ON(bio->bi_iter.bi_size);
+ if (!pb->req_nr)
+ remap_to_origin(cache, bio);
+ else
+ remap_to_cache(cache, bio, 0);
+
+ /*
+ * REQ_FLUSH is not directed at any particular block so we don't
+ * need to inc_ds(). REQ_FUA's are split into a write + REQ_FLUSH
+ * by dm-core.
+ */
+ issue(cache, bio);
+}
+
+static void process_discard_bio(struct cache *cache, struct prealloc *structs,
+ struct bio *bio)
+{
+ int r;
+ dm_dblock_t b, e;
+ struct dm_bio_prison_cell *cell_prealloc, *new_ocell;
+
+ calc_discard_block_range(cache, bio, &b, &e);
+ if (b == e) {
+ bio_endio(bio, 0);
+ return;
+ }
+
+ cell_prealloc = prealloc_get_cell(structs);
+ r = bio_detain_range(cache, dblock_to_oblock(cache, b), dblock_to_oblock(cache, e), bio, cell_prealloc,
+ (cell_free_fn) prealloc_put_cell,
+ structs, &new_ocell);
+ if (r > 0)
+ return;
+
+ discard(cache, structs, new_ocell);
+}
+
+static bool spare_migration_bandwidth(struct cache *cache)
+{
+ sector_t current_volume = (atomic_read(&cache->nr_io_migrations) + 1) *
+ cache->sectors_per_block;
+ return current_volume < cache->migration_threshold;
+}
+
+static void inc_hit_counter(struct cache *cache, struct bio *bio)
+{
+ atomic_inc(bio_data_dir(bio) == READ ?
+ &cache->stats.read_hit : &cache->stats.write_hit);
+}
+
+static void inc_miss_counter(struct cache *cache, struct bio *bio)
+{
+ atomic_inc(bio_data_dir(bio) == READ ?
+ &cache->stats.read_miss : &cache->stats.write_miss);
+}
+
+/*----------------------------------------------------------------*/
+
+struct old_oblock_lock {
+ struct policy_locker locker;
+ struct cache *cache;
+ struct prealloc *structs;
+ struct dm_bio_prison_cell *cell;
+};
+
+static int null_locker(struct policy_locker *locker, dm_oblock_t b)
+{
+ /* This should never be called */
+ BUG();
+ return 0;
+}
+
+static int cell_locker(struct policy_locker *locker, dm_oblock_t b)
+{
+ struct old_oblock_lock *l = container_of(locker, struct old_oblock_lock, locker);
+ struct dm_bio_prison_cell *cell_prealloc = prealloc_get_cell(l->structs);
+
+ return bio_detain(l->cache, b, NULL, cell_prealloc,
+ (cell_free_fn) prealloc_put_cell,
+ l->structs, &l->cell);
+}
+
+static void process_bio(struct cache *cache, struct prealloc *structs,
+ struct bio *bio)
+{
+ int r;
+ bool release_cell = true;
+ dm_oblock_t block = get_bio_block(cache, bio);
+ struct dm_bio_prison_cell *cell_prealloc, *new_ocell;
+ struct policy_result lookup_result;
+ bool passthrough = passthrough_mode(&cache->features);
+ bool discarded_block, can_migrate;
+ struct old_oblock_lock ool;
+
+ /*
+ * Check to see if that block is currently migrating.
+ */
+ cell_prealloc = prealloc_get_cell(structs);
+ r = bio_detain(cache, block, bio, cell_prealloc,
+ (cell_free_fn) prealloc_put_cell,
+ structs, &new_ocell);
+ if (r > 0)
+ return;
+
+ discarded_block = is_discarded_oblock(cache, block);
+ can_migrate = !passthrough && (discarded_block || spare_migration_bandwidth(cache));
+
+ ool.locker.fn = cell_locker;
+ ool.cache = cache;
+ ool.structs = structs;
+ ool.cell = NULL;
+ r = policy_map(cache->policy, block, true, can_migrate, discarded_block,
+ bio, &ool.locker, &lookup_result);
+
+ if (r == -EWOULDBLOCK)
+ /* migration has been denied */
+ lookup_result.op = POLICY_MISS;
+
+ switch (lookup_result.op) {
+ case POLICY_HIT:
+ if (passthrough) {
+ inc_miss_counter(cache, bio);
+
+ /*
+ * Passthrough always maps to the origin,
+ * invalidating any cache blocks that are written
+ * to.
+ */
+
+ if (bio_data_dir(bio) == WRITE) {
+ atomic_inc(&cache->stats.demotion);
+ invalidate(cache, structs, block, lookup_result.cblock, new_ocell);
+ release_cell = false;
+
+ } else {
+ /* FIXME: factor out issue_origin() */
+ remap_to_origin_clear_discard(cache, bio, block);
+ inc_and_issue(cache, bio, new_ocell);
+ }
+ } else {
+ inc_hit_counter(cache, bio);
+
+ if (bio_data_dir(bio) == WRITE &&
+ writethrough_mode(&cache->features) &&
+ !is_dirty(cache, lookup_result.cblock)) {
+ remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock);
+ inc_and_issue(cache, bio, new_ocell);
+
+ } else {
+ remap_to_cache_dirty(cache, bio, block, lookup_result.cblock);
+ inc_and_issue(cache, bio, new_ocell);
+ }
+ }
+
+ break;
+
+ case POLICY_MISS:
+ inc_miss_counter(cache, bio);
+ remap_to_origin_clear_discard(cache, bio, block);
+ inc_and_issue(cache, bio, new_ocell);
+ break;
+
+ case POLICY_NEW:
+ atomic_inc(&cache->stats.promotion);
+ promote(cache, structs, block, lookup_result.cblock, new_ocell);
+ release_cell = false;
+ break;
+
+ case POLICY_REPLACE:
+ atomic_inc(&cache->stats.demotion);
+ atomic_inc(&cache->stats.promotion);
+ demote_then_promote(cache, structs, lookup_result.old_oblock,
+ block, lookup_result.cblock,
+ ool.cell, new_ocell);
+ release_cell = false;
+ break;
+
+ default:
+ DMERR_LIMIT("%s: erroring bio, unknown policy op: %u", __func__,
+ (unsigned) lookup_result.op);
+ bio_io_error(bio);
+ }
+
+ if (release_cell)
+ cell_defer(cache, new_ocell, false);
+}
+
+static int need_commit_due_to_time(struct cache *cache)
+{
+ return !time_in_range(jiffies, cache->last_commit_jiffies,
+ cache->last_commit_jiffies + COMMIT_PERIOD);
+}
+
+static int commit_if_needed(struct cache *cache)
+{
+ int r = 0;
+
+ if ((cache->commit_requested || need_commit_due_to_time(cache)) &&
+ dm_cache_changed_this_transaction(cache->cmd)) {
+ atomic_inc(&cache->stats.commit_count);
+ cache->commit_requested = false;
+ r = dm_cache_commit(cache->cmd, false);
+ cache->last_commit_jiffies = jiffies;
+ }
+
+ return r;
+}
+
+static void process_deferred_bios(struct cache *cache)
+{
+ unsigned long flags;
+ struct bio_list bios;
+ struct bio *bio;
+ struct prealloc structs;
+
+ memset(&structs, 0, sizeof(structs));
+ bio_list_init(&bios);
+
+ spin_lock_irqsave(&cache->lock, flags);
+ bio_list_merge(&bios, &cache->deferred_bios);
+ bio_list_init(&cache->deferred_bios);
+ spin_unlock_irqrestore(&cache->lock, flags);
+
+ while (!bio_list_empty(&bios)) {
+ /*
+ * If we've got no free migration structs, and processing
+ * this bio might require one, we pause until there are some
+ * prepared mappings to process.
+ */
+ if (prealloc_data_structs(cache, &structs)) {
+ spin_lock_irqsave(&cache->lock, flags);
+ bio_list_merge(&cache->deferred_bios, &bios);
+ spin_unlock_irqrestore(&cache->lock, flags);
+ break;
+ }
+
+ bio = bio_list_pop(&bios);
+
+ if (bio->bi_rw & REQ_FLUSH)
+ process_flush_bio(cache, bio);
+ else if (bio->bi_rw & REQ_DISCARD)
+ process_discard_bio(cache, &structs, bio);
+ else
+ process_bio(cache, &structs, bio);
+ }
+
+ prealloc_free_structs(cache, &structs);
+}
+
+static void process_deferred_flush_bios(struct cache *cache, bool submit_bios)
+{
+ unsigned long flags;
+ struct bio_list bios;
+ struct bio *bio;
+
+ bio_list_init(&bios);
+
+ spin_lock_irqsave(&cache->lock, flags);
+ bio_list_merge(&bios, &cache->deferred_flush_bios);
+ bio_list_init(&cache->deferred_flush_bios);
+ spin_unlock_irqrestore(&cache->lock, flags);
+
+ /*
+ * These bios have already been through inc_ds()
+ */
+ while ((bio = bio_list_pop(&bios)))
+ submit_bios ? generic_make_request(bio) : bio_io_error(bio);
+}
+
+static void process_deferred_writethrough_bios(struct cache *cache)
+{
+ unsigned long flags;
+ struct bio_list bios;
+ struct bio *bio;
+
+ bio_list_init(&bios);
+
+ spin_lock_irqsave(&cache->lock, flags);
+ bio_list_merge(&bios, &cache->deferred_writethrough_bios);
+ bio_list_init(&cache->deferred_writethrough_bios);
+ spin_unlock_irqrestore(&cache->lock, flags);
+
+ /*
+ * These bios have already been through inc_ds()
+ */
+ while ((bio = bio_list_pop(&bios)))
+ generic_make_request(bio);
+}
+
+static void writeback_some_dirty_blocks(struct cache *cache)
+{
+ int r = 0;
+ dm_oblock_t oblock;
+ dm_cblock_t cblock;
+ struct prealloc structs;
+ struct dm_bio_prison_cell *old_ocell;
+
+ memset(&structs, 0, sizeof(structs));
+
+ while (spare_migration_bandwidth(cache)) {
+ if (prealloc_data_structs(cache, &structs))
+ break;
+
+ r = policy_writeback_work(cache->policy, &oblock, &cblock);
+ if (r)
+ break;
+
+ r = get_cell(cache, oblock, &structs, &old_ocell);
+ if (r) {
+ policy_set_dirty(cache->policy, oblock);
+ break;
+ }
+
+ writeback(cache, &structs, oblock, cblock, old_ocell);
+ }
+
+ prealloc_free_structs(cache, &structs);
+}
+
+/*----------------------------------------------------------------
+ * Invalidations.
+ * Dropping something from the cache *without* writing back.
+ *--------------------------------------------------------------*/
+
+static void process_invalidation_request(struct cache *cache, struct invalidation_request *req)
+{
+ int r = 0;
+ uint64_t begin = from_cblock(req->cblocks->begin);
+ uint64_t end = from_cblock(req->cblocks->end);
+
+ while (begin != end) {
+ r = policy_remove_cblock(cache->policy, to_cblock(begin));
+ if (!r) {
+ r = dm_cache_remove_mapping(cache->cmd, to_cblock(begin));
+ if (r)
+ break;
+
+ } else if (r == -ENODATA) {
+ /* harmless, already unmapped */
+ r = 0;
+
+ } else {
+ DMERR("policy_remove_cblock failed");
+ break;
+ }
+
+ begin++;
+ }
+
+ cache->commit_requested = true;
+
+ req->err = r;
+ atomic_set(&req->complete, 1);
+
+ wake_up(&req->result_wait);
+}
+
+static void process_invalidation_requests(struct cache *cache)
+{
+ struct list_head list;
+ struct invalidation_request *req, *tmp;
+
+ INIT_LIST_HEAD(&list);
+ spin_lock(&cache->invalidation_lock);
+ list_splice_init(&cache->invalidation_requests, &list);
+ spin_unlock(&cache->invalidation_lock);
+
+ list_for_each_entry_safe (req, tmp, &list, list)
+ process_invalidation_request(cache, req);
+}
+
+/*----------------------------------------------------------------
+ * Main worker loop
+ *--------------------------------------------------------------*/
+static bool is_quiescing(struct cache *cache)
+{
+ return atomic_read(&cache->quiescing);
+}
+
+static void ack_quiescing(struct cache *cache)
+{
+ if (is_quiescing(cache)) {
+ atomic_inc(&cache->quiescing_ack);
+ wake_up(&cache->quiescing_wait);
+ }
+}
+
+static void wait_for_quiescing_ack(struct cache *cache)
+{
+ wait_event(cache->quiescing_wait, atomic_read(&cache->quiescing_ack));
+}
+
+static void start_quiescing(struct cache *cache)
+{
+ atomic_inc(&cache->quiescing);
+ wait_for_quiescing_ack(cache);
+}
+
+static void stop_quiescing(struct cache *cache)
+{
+ atomic_set(&cache->quiescing, 0);
+ atomic_set(&cache->quiescing_ack, 0);
+}
+
+static void wait_for_migrations(struct cache *cache)
+{
+ wait_event(cache->migration_wait, !atomic_read(&cache->nr_allocated_migrations));
+}
+
+static void stop_worker(struct cache *cache)
+{
+ cancel_delayed_work(&cache->waker);
+ flush_workqueue(cache->wq);
+}
+
+static void requeue_deferred_io(struct cache *cache)
+{
+ struct bio *bio;
+ struct bio_list bios;
+
+ bio_list_init(&bios);
+ bio_list_merge(&bios, &cache->deferred_bios);
+ bio_list_init(&cache->deferred_bios);
+
+ while ((bio = bio_list_pop(&bios)))
+ bio_endio(bio, DM_ENDIO_REQUEUE);
+}
+
+static int more_work(struct cache *cache)
+{
+ if (is_quiescing(cache))
+ return !list_empty(&cache->quiesced_migrations) ||
+ !list_empty(&cache->completed_migrations) ||
+ !list_empty(&cache->need_commit_migrations);
+ else
+ return !bio_list_empty(&cache->deferred_bios) ||
+ !bio_list_empty(&cache->deferred_flush_bios) ||
+ !bio_list_empty(&cache->deferred_writethrough_bios) ||
+ !list_empty(&cache->quiesced_migrations) ||
+ !list_empty(&cache->completed_migrations) ||
+ !list_empty(&cache->need_commit_migrations) ||
+ cache->invalidate;
+}
+
+static void do_worker(struct work_struct *ws)
+{
+ struct cache *cache = container_of(ws, struct cache, worker);
+
+ do {
+ if (!is_quiescing(cache)) {
+ writeback_some_dirty_blocks(cache);
+ process_deferred_writethrough_bios(cache);
+ process_deferred_bios(cache);
+ process_invalidation_requests(cache);
+ }
+
+ process_migrations(cache, &cache->quiesced_migrations, issue_copy_or_discard);
+ process_migrations(cache, &cache->completed_migrations, complete_migration);
+
+ if (commit_if_needed(cache)) {
+ process_deferred_flush_bios(cache, false);
+ process_migrations(cache, &cache->need_commit_migrations, migration_failure);
+
+ /*
+ * FIXME: rollback metadata or just go into a
+ * failure mode and error everything
+ */
+ } else {
+ process_deferred_flush_bios(cache, true);
+ process_migrations(cache, &cache->need_commit_migrations,
+ migration_success_post_commit);
+ }
+
+ ack_quiescing(cache);
+
+ } while (more_work(cache));
+}
+
+/*
+ * We want to commit periodically so that not too much
+ * unwritten metadata builds up.
+ */
+static void do_waker(struct work_struct *ws)
+{
+ struct cache *cache = container_of(to_delayed_work(ws), struct cache, waker);
+ policy_tick(cache->policy);
+ wake_worker(cache);
+ queue_delayed_work(cache->wq, &cache->waker, COMMIT_PERIOD);
+}
+
+/*----------------------------------------------------------------*/
+
+static int is_congested(struct dm_dev *dev, int bdi_bits)
+{
+ struct request_queue *q = bdev_get_queue(dev->bdev);
+ return bdi_congested(&q->backing_dev_info, bdi_bits);
+}
+
+static int cache_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
+{
+ struct cache *cache = container_of(cb, struct cache, callbacks);
+
+ return is_congested(cache->origin_dev, bdi_bits) ||
+ is_congested(cache->cache_dev, bdi_bits);
+}
+
+/*----------------------------------------------------------------
+ * Target methods
+ *--------------------------------------------------------------*/
+
+/*
+ * This function gets called on the error paths of the constructor, so we
+ * have to cope with a partially initialised struct.
+ */
+static void destroy(struct cache *cache)
+{
+ unsigned i;
+
+ if (cache->migration_pool)
+ mempool_destroy(cache->migration_pool);
+
+ if (cache->all_io_ds)
+ dm_deferred_set_destroy(cache->all_io_ds);
+
+ if (cache->prison)
+ dm_bio_prison_destroy(cache->prison);
+
+ if (cache->wq)
+ destroy_workqueue(cache->wq);
+
+ if (cache->dirty_bitset)
+ free_bitset(cache->dirty_bitset);
+
+ if (cache->discard_bitset)
+ free_bitset(cache->discard_bitset);
+
+ if (cache->copier)
+ dm_kcopyd_client_destroy(cache->copier);
+
+ if (cache->cmd)
+ dm_cache_metadata_close(cache->cmd);
+
+ if (cache->metadata_dev)
+ dm_put_device(cache->ti, cache->metadata_dev);
+
+ if (cache->origin_dev)
+ dm_put_device(cache->ti, cache->origin_dev);
+
+ if (cache->cache_dev)
+ dm_put_device(cache->ti, cache->cache_dev);
+
+ if (cache->policy)
+ dm_cache_policy_destroy(cache->policy);
+
+ for (i = 0; i < cache->nr_ctr_args ; i++)
+ kfree(cache->ctr_args[i]);
+ kfree(cache->ctr_args);
+
+ kfree(cache);
+}
+
+static void cache_dtr(struct dm_target *ti)
+{
+ struct cache *cache = ti->private;
+
+ destroy(cache);
+}
+
+static sector_t get_dev_size(struct dm_dev *dev)
+{
+ return i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT;
+}
+
+/*----------------------------------------------------------------*/
+
+/*
+ * Construct a cache device mapping.
+ *
+ * cache <metadata dev> <cache dev> <origin dev> <block size>
+ * <#feature args> [<feature arg>]*
+ * <policy> <#policy args> [<policy arg>]*
+ *
+ * metadata dev : fast device holding the persistent metadata
+ * cache dev : fast device holding cached data blocks
+ * origin dev : slow device holding original data blocks
+ * block size : cache unit size in sectors
+ *
+ * #feature args : number of feature arguments passed
+ * feature args : writethrough. (The default is writeback.)
+ *
+ * policy : the replacement policy to use
+ * #policy args : an even number of policy arguments corresponding
+ * to key/value pairs passed to the policy
+ * policy args : key/value pairs passed to the policy
+ * E.g. 'sequential_threshold 1024'
+ * See cache-policies.txt for details.
+ *
+ * Optional feature arguments are:
+ * writethrough : write through caching that prohibits cache block
+ * content from being different from origin block content.
+ * Without this argument, the default behaviour is to write
+ * back cache block contents later for performance reasons,
+ * so they may differ from the corresponding origin blocks.
+ */
+struct cache_args {
+ struct dm_target *ti;
+
+ struct dm_dev *metadata_dev;
+
+ struct dm_dev *cache_dev;
+ sector_t cache_sectors;
+
+ struct dm_dev *origin_dev;
+ sector_t origin_sectors;
+
+ uint32_t block_size;
+
+ const char *policy_name;
+ int policy_argc;
+ const char **policy_argv;
+
+ struct cache_features features;
+};
+
+static void destroy_cache_args(struct cache_args *ca)
+{
+ if (ca->metadata_dev)
+ dm_put_device(ca->ti, ca->metadata_dev);
+
+ if (ca->cache_dev)
+ dm_put_device(ca->ti, ca->cache_dev);
+
+ if (ca->origin_dev)
+ dm_put_device(ca->ti, ca->origin_dev);
+
+ kfree(ca);
+}
+
+static bool at_least_one_arg(struct dm_arg_set *as, char **error)
+{
+ if (!as->argc) {
+ *error = "Insufficient args";
+ return false;
+ }
+
+ return true;
+}
+
+static int parse_metadata_dev(struct cache_args *ca, struct dm_arg_set *as,
+ char **error)
+{
+ int r;
+ sector_t metadata_dev_size;
+ char b[BDEVNAME_SIZE];
+
+ if (!at_least_one_arg(as, error))
+ return -EINVAL;
+
+ r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
+ &ca->metadata_dev);
+ if (r) {
+ *error = "Error opening metadata device";
+ return r;
+ }
+
+ metadata_dev_size = get_dev_size(ca->metadata_dev);
+ if (metadata_dev_size > DM_CACHE_METADATA_MAX_SECTORS_WARNING)
+ DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.",
+ bdevname(ca->metadata_dev->bdev, b), THIN_METADATA_MAX_SECTORS);
+
+ return 0;
+}
+
+static int parse_cache_dev(struct cache_args *ca, struct dm_arg_set *as,
+ char **error)
+{
+ int r;
+
+ if (!at_least_one_arg(as, error))
+ return -EINVAL;
+
+ r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
+ &ca->cache_dev);
+ if (r) {
+ *error = "Error opening cache device";
+ return r;
+ }
+ ca->cache_sectors = get_dev_size(ca->cache_dev);
+
+ return 0;
+}
+
+static int parse_origin_dev(struct cache_args *ca, struct dm_arg_set *as,
+ char **error)
+{
+ int r;
+
+ if (!at_least_one_arg(as, error))
+ return -EINVAL;
+
+ r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
+ &ca->origin_dev);
+ if (r) {
+ *error = "Error opening origin device";
+ return r;
+ }
+
+ ca->origin_sectors = get_dev_size(ca->origin_dev);
+ if (ca->ti->len > ca->origin_sectors) {
+ *error = "Device size larger than cached device";
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int parse_block_size(struct cache_args *ca, struct dm_arg_set *as,
+ char **error)
+{
+ unsigned long block_size;
+
+ if (!at_least_one_arg(as, error))
+ return -EINVAL;
+
+ if (kstrtoul(dm_shift_arg(as), 10, &block_size) || !block_size ||
+ block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS ||
+ block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS ||
+ block_size & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) {
+ *error = "Invalid data block size";
+ return -EINVAL;
+ }
+
+ if (block_size > ca->cache_sectors) {
+ *error = "Data block size is larger than the cache device";
+ return -EINVAL;
+ }
+
+ ca->block_size = block_size;
+
+ return 0;
+}
+
+static void init_features(struct cache_features *cf)
+{
+ cf->mode = CM_WRITE;
+ cf->io_mode = CM_IO_WRITEBACK;
+}
+
+static int parse_features(struct cache_args *ca, struct dm_arg_set *as,
+ char **error)
+{
+ static struct dm_arg _args[] = {
+ {0, 1, "Invalid number of cache feature arguments"},
+ };
+
+ int r;
+ unsigned argc;
+ const char *arg;
+ struct cache_features *cf = &ca->features;
+
+ init_features(cf);
+
+ r = dm_read_arg_group(_args, as, &argc, error);
+ if (r)
+ return -EINVAL;
+
+ while (argc--) {
+ arg = dm_shift_arg(as);
+
+ if (!strcasecmp(arg, "writeback"))
+ cf->io_mode = CM_IO_WRITEBACK;
+
+ else if (!strcasecmp(arg, "writethrough"))
+ cf->io_mode = CM_IO_WRITETHROUGH;
+
+ else if (!strcasecmp(arg, "passthrough"))
+ cf->io_mode = CM_IO_PASSTHROUGH;
+
+ else {
+ *error = "Unrecognised cache feature requested";
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+static int parse_policy(struct cache_args *ca, struct dm_arg_set *as,
+ char **error)
+{
+ static struct dm_arg _args[] = {
+ {0, 1024, "Invalid number of policy arguments"},
+ };
+
+ int r;
+
+ if (!at_least_one_arg(as, error))
+ return -EINVAL;
+
+ ca->policy_name = dm_shift_arg(as);
+
+ r = dm_read_arg_group(_args, as, &ca->policy_argc, error);
+ if (r)
+ return -EINVAL;
+
+ ca->policy_argv = (const char **)as->argv;
+ dm_consume_args(as, ca->policy_argc);
+
+ return 0;
+}
+
+static int parse_cache_args(struct cache_args *ca, int argc, char **argv,
+ char **error)
+{
+ int r;
+ struct dm_arg_set as;
+
+ as.argc = argc;
+ as.argv = argv;
+
+ r = parse_metadata_dev(ca, &as, error);
+ if (r)
+ return r;
+
+ r = parse_cache_dev(ca, &as, error);
+ if (r)
+ return r;
+
+ r = parse_origin_dev(ca, &as, error);
+ if (r)
+ return r;
+
+ r = parse_block_size(ca, &as, error);
+ if (r)
+ return r;
+
+ r = parse_features(ca, &as, error);
+ if (r)
+ return r;
+
+ r = parse_policy(ca, &as, error);
+ if (r)
+ return r;
+
+ return 0;
+}
+
+/*----------------------------------------------------------------*/
+
+static struct kmem_cache *migration_cache;
+
+#define NOT_CORE_OPTION 1
+
+static int process_config_option(struct cache *cache, const char *key, const char *value)
+{
+ unsigned long tmp;
+
+ if (!strcasecmp(key, "migration_threshold")) {
+ if (kstrtoul(value, 10, &tmp))
+ return -EINVAL;
+
+ cache->migration_threshold = tmp;
+ return 0;
+ }
+
+ return NOT_CORE_OPTION;
+}
+
+static int set_config_value(struct cache *cache, const char *key, const char *value)
+{
+ int r = process_config_option(cache, key, value);
+
+ if (r == NOT_CORE_OPTION)
+ r = policy_set_config_value(cache->policy, key, value);
+
+ if (r)
+ DMWARN("bad config value for %s: %s", key, value);
+
+ return r;
+}
+
+static int set_config_values(struct cache *cache, int argc, const char **argv)
+{
+ int r = 0;
+
+ if (argc & 1) {
+ DMWARN("Odd number of policy arguments given but they should be <key> <value> pairs.");
+ return -EINVAL;
+ }
+
+ while (argc) {
+ r = set_config_value(cache, argv[0], argv[1]);
+ if (r)
+ break;
+
+ argc -= 2;
+ argv += 2;
+ }
+
+ return r;
+}
+
+static int create_cache_policy(struct cache *cache, struct cache_args *ca,
+ char **error)
+{
+ struct dm_cache_policy *p = dm_cache_policy_create(ca->policy_name,
+ cache->cache_size,
+ cache->origin_sectors,
+ cache->sectors_per_block);
+ if (IS_ERR(p)) {
+ *error = "Error creating cache's policy";
+ return PTR_ERR(p);
+ }
+ cache->policy = p;
+
+ return 0;
+}
+
+/*
+ * We want the discard block size to be at least the size of the cache
+ * block size and have no more than 2^14 discard blocks across the origin.
+ */
+#define MAX_DISCARD_BLOCKS (1 << 14)
+
+static bool too_many_discard_blocks(sector_t discard_block_size,
+ sector_t origin_size)
+{
+ (void) sector_div(origin_size, discard_block_size);
+
+ return origin_size > MAX_DISCARD_BLOCKS;
+}
+
+static sector_t calculate_discard_block_size(sector_t cache_block_size,
+ sector_t origin_size)
+{
+ sector_t discard_block_size = cache_block_size;
+
+ if (origin_size)
+ while (too_many_discard_blocks(discard_block_size, origin_size))
+ discard_block_size *= 2;
+
+ return discard_block_size;
+}
+
+static void set_cache_size(struct cache *cache, dm_cblock_t size)
+{
+ dm_block_t nr_blocks = from_cblock(size);
+
+ if (nr_blocks > (1 << 20) && cache->cache_size != size)
+ DMWARN_LIMIT("You have created a cache device with a lot of individual cache blocks (%llu)\n"
+ "All these mappings can consume a lot of kernel memory, and take some time to read/write.\n"
+ "Please consider increasing the cache block size to reduce the overall cache block count.",
+ (unsigned long long) nr_blocks);
+
+ cache->cache_size = size;
+}
+
+#define DEFAULT_MIGRATION_THRESHOLD 2048
+
+static int cache_create(struct cache_args *ca, struct cache **result)
+{
+ int r = 0;
+ char **error = &ca->ti->error;
+ struct cache *cache;
+ struct dm_target *ti = ca->ti;
+ dm_block_t origin_blocks;
+ struct dm_cache_metadata *cmd;
+ bool may_format = ca->features.mode == CM_WRITE;
+
+ cache = kzalloc(sizeof(*cache), GFP_KERNEL);
+ if (!cache)
+ return -ENOMEM;
+
+ cache->ti = ca->ti;
+ ti->private = cache;
+ ti->num_flush_bios = 2;
+ ti->flush_supported = true;
+
+ ti->num_discard_bios = 1;
+ ti->discards_supported = true;
+ ti->discard_zeroes_data_unsupported = true;
+ ti->split_discard_bios = false;
+
+ cache->features = ca->features;
+ ti->per_bio_data_size = get_per_bio_data_size(cache);
+
+ cache->callbacks.congested_fn = cache_is_congested;
+ dm_table_add_target_callbacks(ti->table, &cache->callbacks);
+
+ cache->metadata_dev = ca->metadata_dev;
+ cache->origin_dev = ca->origin_dev;
+ cache->cache_dev = ca->cache_dev;
+
+ ca->metadata_dev = ca->origin_dev = ca->cache_dev = NULL;
+
+ /* FIXME: factor out this whole section */
+ origin_blocks = cache->origin_sectors = ca->origin_sectors;
+ origin_blocks = block_div(origin_blocks, ca->block_size);
+ cache->origin_blocks = to_oblock(origin_blocks);
+
+ cache->sectors_per_block = ca->block_size;
+ if (dm_set_target_max_io_len(ti, cache->sectors_per_block)) {
+ r = -EINVAL;
+ goto bad;
+ }
+
+ if (ca->block_size & (ca->block_size - 1)) {
+ dm_block_t cache_size = ca->cache_sectors;
+
+ cache->sectors_per_block_shift = -1;
+ cache_size = block_div(cache_size, ca->block_size);
+ set_cache_size(cache, to_cblock(cache_size));
+ } else {
+ cache->sectors_per_block_shift = __ffs(ca->block_size);
+ set_cache_size(cache, to_cblock(ca->cache_sectors >> cache->sectors_per_block_shift));
+ }
+
+ r = create_cache_policy(cache, ca, error);
+ if (r)
+ goto bad;
+
+ cache->policy_nr_args = ca->policy_argc;
+ cache->migration_threshold = DEFAULT_MIGRATION_THRESHOLD;
+
+ r = set_config_values(cache, ca->policy_argc, ca->policy_argv);
+ if (r) {
+ *error = "Error setting cache policy's config values";
+ goto bad;
+ }
+
+ cmd = dm_cache_metadata_open(cache->metadata_dev->bdev,
+ ca->block_size, may_format,
+ dm_cache_policy_get_hint_size(cache->policy));
+ if (IS_ERR(cmd)) {
+ *error = "Error creating metadata object";
+ r = PTR_ERR(cmd);
+ goto bad;
+ }
+ cache->cmd = cmd;
+
+ if (passthrough_mode(&cache->features)) {
+ bool all_clean;
+
+ r = dm_cache_metadata_all_clean(cache->cmd, &all_clean);
+ if (r) {
+ *error = "dm_cache_metadata_all_clean() failed";
+ goto bad;
+ }
+
+ if (!all_clean) {
+ *error = "Cannot enter passthrough mode unless all blocks are clean";
+ r = -EINVAL;
+ goto bad;
+ }
+ }
+
+ spin_lock_init(&cache->lock);
+ bio_list_init(&cache->deferred_bios);
+ bio_list_init(&cache->deferred_flush_bios);
+ bio_list_init(&cache->deferred_writethrough_bios);
+ INIT_LIST_HEAD(&cache->quiesced_migrations);
+ INIT_LIST_HEAD(&cache->completed_migrations);
+ INIT_LIST_HEAD(&cache->need_commit_migrations);
+ atomic_set(&cache->nr_allocated_migrations, 0);
+ atomic_set(&cache->nr_io_migrations, 0);
+ init_waitqueue_head(&cache->migration_wait);
+
+ init_waitqueue_head(&cache->quiescing_wait);
+ atomic_set(&cache->quiescing, 0);
+ atomic_set(&cache->quiescing_ack, 0);
+
+ r = -ENOMEM;
+ atomic_set(&cache->nr_dirty, 0);
+ cache->dirty_bitset = alloc_bitset(from_cblock(cache->cache_size));
+ if (!cache->dirty_bitset) {
+ *error = "could not allocate dirty bitset";
+ goto bad;
+ }
+ clear_bitset(cache->dirty_bitset, from_cblock(cache->cache_size));
+
+ cache->discard_block_size =
+ calculate_discard_block_size(cache->sectors_per_block,
+ cache->origin_sectors);
+ cache->discard_nr_blocks = to_dblock(dm_sector_div_up(cache->origin_sectors,
+ cache->discard_block_size));
+ cache->discard_bitset = alloc_bitset(from_dblock(cache->discard_nr_blocks));
+ if (!cache->discard_bitset) {
+ *error = "could not allocate discard bitset";
+ goto bad;
+ }
+ clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks));
+
+ cache->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle);
+ if (IS_ERR(cache->copier)) {
+ *error = "could not create kcopyd client";
+ r = PTR_ERR(cache->copier);
+ goto bad;
+ }
+
+ cache->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM);
+ if (!cache->wq) {
+ *error = "could not create workqueue for metadata object";
+ goto bad;
+ }
+ INIT_WORK(&cache->worker, do_worker);
+ INIT_DELAYED_WORK(&cache->waker, do_waker);
+ cache->last_commit_jiffies = jiffies;
+
+ cache->prison = dm_bio_prison_create();
+ if (!cache->prison) {
+ *error = "could not create bio prison";
+ goto bad;
+ }
+
+ cache->all_io_ds = dm_deferred_set_create();
+ if (!cache->all_io_ds) {
+ *error = "could not create all_io deferred set";
+ goto bad;
+ }
+
+ cache->migration_pool = mempool_create_slab_pool(MIGRATION_POOL_SIZE,
+ migration_cache);
+ if (!cache->migration_pool) {
+ *error = "Error creating cache's migration mempool";
+ goto bad;
+ }
+
+ cache->need_tick_bio = true;
+ cache->sized = false;
+ cache->invalidate = false;
+ cache->commit_requested = false;
+ cache->loaded_mappings = false;
+ cache->loaded_discards = false;
+
+ load_stats(cache);
+
+ atomic_set(&cache->stats.demotion, 0);
+ atomic_set(&cache->stats.promotion, 0);
+ atomic_set(&cache->stats.copies_avoided, 0);
+ atomic_set(&cache->stats.cache_cell_clash, 0);
+ atomic_set(&cache->stats.commit_count, 0);
+ atomic_set(&cache->stats.discard_count, 0);
+
+ spin_lock_init(&cache->invalidation_lock);
+ INIT_LIST_HEAD(&cache->invalidation_requests);
+
+ *result = cache;
+ return 0;
+
+bad:
+ destroy(cache);
+ return r;
+}
+
+static int copy_ctr_args(struct cache *cache, int argc, const char **argv)
+{
+ unsigned i;
+ const char **copy;
+
+ copy = kcalloc(argc, sizeof(*copy), GFP_KERNEL);
+ if (!copy)
+ return -ENOMEM;
+ for (i = 0; i < argc; i++) {
+ copy[i] = kstrdup(argv[i], GFP_KERNEL);
+ if (!copy[i]) {
+ while (i--)
+ kfree(copy[i]);
+ kfree(copy);
+ return -ENOMEM;
+ }
+ }
+
+ cache->nr_ctr_args = argc;
+ cache->ctr_args = copy;
+
+ return 0;
+}
+
+static int cache_ctr(struct dm_target *ti, unsigned argc, char **argv)
+{
+ int r = -EINVAL;
+ struct cache_args *ca;
+ struct cache *cache = NULL;
+
+ ca = kzalloc(sizeof(*ca), GFP_KERNEL);
+ if (!ca) {
+ ti->error = "Error allocating memory for cache";
+ return -ENOMEM;
+ }
+ ca->ti = ti;
+
+ r = parse_cache_args(ca, argc, argv, &ti->error);
+ if (r)
+ goto out;
+
+ r = cache_create(ca, &cache);
+ if (r)
+ goto out;
+
+ r = copy_ctr_args(cache, argc - 3, (const char **)argv + 3);
+ if (r) {
+ destroy(cache);
+ goto out;
+ }
+
+ ti->private = cache;
+
+out:
+ destroy_cache_args(ca);
+ return r;
+}
+
+static int __cache_map(struct cache *cache, struct bio *bio, struct dm_bio_prison_cell **cell)
+{
+ int r;
+ dm_oblock_t block = get_bio_block(cache, bio);
+ size_t pb_data_size = get_per_bio_data_size(cache);
+ bool can_migrate = false;
+ bool discarded_block;
+ struct policy_result lookup_result;
+ struct per_bio_data *pb = init_per_bio_data(bio, pb_data_size);
+ struct old_oblock_lock ool;
+
+ ool.locker.fn = null_locker;
+
+ if (unlikely(from_oblock(block) >= from_oblock(cache->origin_blocks))) {
+ /*
+ * This can only occur if the io goes to a partial block at
+ * the end of the origin device. We don't cache these.
+ * Just remap to the origin and carry on.
+ */
+ remap_to_origin(cache, bio);
+ return DM_MAPIO_REMAPPED;
+ }
+
+ if (bio->bi_rw & (REQ_FLUSH | REQ_FUA | REQ_DISCARD)) {
+ defer_bio(cache, bio);
+ return DM_MAPIO_SUBMITTED;
+ }
+
+ /*
+ * Check to see if that block is currently migrating.
+ */
+ *cell = alloc_prison_cell(cache);
+ if (!*cell) {
+ defer_bio(cache, bio);
+ return DM_MAPIO_SUBMITTED;
+ }
+
+ r = bio_detain(cache, block, bio, *cell,
+ (cell_free_fn) free_prison_cell,
+ cache, cell);
+ if (r) {
+ if (r < 0)
+ defer_bio(cache, bio);
+
+ return DM_MAPIO_SUBMITTED;
+ }
+
+ discarded_block = is_discarded_oblock(cache, block);
+
+ r = policy_map(cache->policy, block, false, can_migrate, discarded_block,
+ bio, &ool.locker, &lookup_result);
+ if (r == -EWOULDBLOCK) {
+ cell_defer(cache, *cell, true);
+ return DM_MAPIO_SUBMITTED;
+
+ } else if (r) {
+ DMERR_LIMIT("Unexpected return from cache replacement policy: %d", r);
+ cell_defer(cache, *cell, false);
+ bio_io_error(bio);
+ return DM_MAPIO_SUBMITTED;
+ }
+
+ r = DM_MAPIO_REMAPPED;
+ switch (lookup_result.op) {
+ case POLICY_HIT:
+ if (passthrough_mode(&cache->features)) {
+ if (bio_data_dir(bio) == WRITE) {
+ /*
+ * We need to invalidate this block, so
+ * defer for the worker thread.
+ */
+ cell_defer(cache, *cell, true);
+ r = DM_MAPIO_SUBMITTED;
+
+ } else {
+ inc_miss_counter(cache, bio);
+ remap_to_origin_clear_discard(cache, bio, block);
+ }
+
+ } else {
+ inc_hit_counter(cache, bio);
+ if (bio_data_dir(bio) == WRITE && writethrough_mode(&cache->features) &&
+ !is_dirty(cache, lookup_result.cblock))
+ remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock);
+ else
+ remap_to_cache_dirty(cache, bio, block, lookup_result.cblock);
+ }
+ break;
+
+ case POLICY_MISS:
+ inc_miss_counter(cache, bio);
+ if (pb->req_nr != 0) {
+ /*
+ * This is a duplicate writethrough io that is no
+ * longer needed because the block has been demoted.
+ */
+ bio_endio(bio, 0);
+ cell_defer(cache, *cell, false);
+ r = DM_MAPIO_SUBMITTED;
+
+ } else
+ remap_to_origin_clear_discard(cache, bio, block);
+
+ break;
+
+ default:
+ DMERR_LIMIT("%s: erroring bio: unknown policy op: %u", __func__,
+ (unsigned) lookup_result.op);
+ cell_defer(cache, *cell, false);
+ bio_io_error(bio);
+ r = DM_MAPIO_SUBMITTED;
+ }
+
+ return r;
+}
+
+static int cache_map(struct dm_target *ti, struct bio *bio)
+{
+ int r;
+ struct dm_bio_prison_cell *cell = NULL;
+ struct cache *cache = ti->private;
+
+ r = __cache_map(cache, bio, &cell);
+ if (r == DM_MAPIO_REMAPPED && cell) {
+ inc_ds(cache, bio, cell);
+ cell_defer(cache, cell, false);
+ }
+
+ return r;
+}
+
+static int cache_end_io(struct dm_target *ti, struct bio *bio, int error)
+{
+ struct cache *cache = ti->private;
+ unsigned long flags;
+ size_t pb_data_size = get_per_bio_data_size(cache);
+ struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
+
+ if (pb->tick) {
+ policy_tick(cache->policy);
+
+ spin_lock_irqsave(&cache->lock, flags);
+ cache->need_tick_bio = true;
+ spin_unlock_irqrestore(&cache->lock, flags);
+ }
+
+ check_for_quiesced_migrations(cache, pb);
+
+ return 0;
+}
+
+static int write_dirty_bitset(struct cache *cache)
+{
+ unsigned i, r;
+
+ for (i = 0; i < from_cblock(cache->cache_size); i++) {
+ r = dm_cache_set_dirty(cache->cmd, to_cblock(i),
+ is_dirty(cache, to_cblock(i)));
+ if (r)
+ return r;
+ }
+
+ return 0;
+}
+
+static int write_discard_bitset(struct cache *cache)
+{
+ unsigned i, r;
+
+ r = dm_cache_discard_bitset_resize(cache->cmd, cache->discard_block_size,
+ cache->discard_nr_blocks);
+ if (r) {
+ DMERR("could not resize on-disk discard bitset");
+ return r;
+ }
+
+ for (i = 0; i < from_dblock(cache->discard_nr_blocks); i++) {
+ r = dm_cache_set_discard(cache->cmd, to_dblock(i),
+ is_discarded(cache, to_dblock(i)));
+ if (r)
+ return r;
+ }
+
+ return 0;
+}
+
+/*
+ * returns true on success
+ */
+static bool sync_metadata(struct cache *cache)
+{
+ int r1, r2, r3, r4;
+
+ r1 = write_dirty_bitset(cache);
+ if (r1)
+ DMERR("could not write dirty bitset");
+
+ r2 = write_discard_bitset(cache);
+ if (r2)
+ DMERR("could not write discard bitset");
+
+ save_stats(cache);
+
+ r3 = dm_cache_write_hints(cache->cmd, cache->policy);
+ if (r3)
+ DMERR("could not write hints");
+
+ /*
+ * If writing the above metadata failed, we still commit, but don't
+ * set the clean shutdown flag. This will effectively force every
+ * dirty bit to be set on reload.
+ */
+ r4 = dm_cache_commit(cache->cmd, !r1 && !r2 && !r3);
+ if (r4)
+ DMERR("could not write cache metadata. Data loss may occur.");
+
+ return !r1 && !r2 && !r3 && !r4;
+}
+
+static void cache_postsuspend(struct dm_target *ti)
+{
+ struct cache *cache = ti->private;
+
+ start_quiescing(cache);
+ wait_for_migrations(cache);
+ stop_worker(cache);
+ requeue_deferred_io(cache);
+ stop_quiescing(cache);
+
+ (void) sync_metadata(cache);
+}
+
+static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock,
+ bool dirty, uint32_t hint, bool hint_valid)
+{
+ int r;
+ struct cache *cache = context;
+
+ r = policy_load_mapping(cache->policy, oblock, cblock, hint, hint_valid);
+ if (r)
+ return r;
+
+ if (dirty)
+ set_dirty(cache, oblock, cblock);
+ else
+ clear_dirty(cache, oblock, cblock);
+
+ return 0;
+}
+
+/*
+ * The discard block size in the on disk metadata is not
+ * neccessarily the same as we're currently using. So we have to
+ * be careful to only set the discarded attribute if we know it
+ * covers a complete block of the new size.
+ */
+struct discard_load_info {
+ struct cache *cache;
+
+ /*
+ * These blocks are sized using the on disk dblock size, rather
+ * than the current one.
+ */
+ dm_block_t block_size;
+ dm_block_t discard_begin, discard_end;
+};
+
+static void discard_load_info_init(struct cache *cache,
+ struct discard_load_info *li)
+{
+ li->cache = cache;
+ li->discard_begin = li->discard_end = 0;
+}
+
+static void set_discard_range(struct discard_load_info *li)
+{
+ sector_t b, e;
+
+ if (li->discard_begin == li->discard_end)
+ return;
+
+ /*
+ * Convert to sectors.
+ */
+ b = li->discard_begin * li->block_size;
+ e = li->discard_end * li->block_size;
+
+ /*
+ * Then convert back to the current dblock size.
+ */
+ b = dm_sector_div_up(b, li->cache->discard_block_size);
+ sector_div(e, li->cache->discard_block_size);
+
+ /*
+ * The origin may have shrunk, so we need to check we're still in
+ * bounds.
+ */
+ if (e > from_dblock(li->cache->discard_nr_blocks))
+ e = from_dblock(li->cache->discard_nr_blocks);
+
+ for (; b < e; b++)
+ set_discard(li->cache, to_dblock(b));
+}
+
+static int load_discard(void *context, sector_t discard_block_size,
+ dm_dblock_t dblock, bool discard)
+{
+ struct discard_load_info *li = context;
+
+ li->block_size = discard_block_size;
+
+ if (discard) {
+ if (from_dblock(dblock) == li->discard_end)
+ /*
+ * We're already in a discard range, just extend it.
+ */
+ li->discard_end = li->discard_end + 1ULL;
+
+ else {
+ /*
+ * Emit the old range and start a new one.
+ */
+ set_discard_range(li);
+ li->discard_begin = from_dblock(dblock);
+ li->discard_end = li->discard_begin + 1ULL;
+ }
+ } else {
+ set_discard_range(li);
+ li->discard_begin = li->discard_end = 0;
+ }
+
+ return 0;
+}
+
+static dm_cblock_t get_cache_dev_size(struct cache *cache)
+{
+ sector_t size = get_dev_size(cache->cache_dev);
+ (void) sector_div(size, cache->sectors_per_block);
+ return to_cblock(size);
+}
+
+static bool can_resize(struct cache *cache, dm_cblock_t new_size)
+{
+ if (from_cblock(new_size) > from_cblock(cache->cache_size))
+ return true;
+
+ /*
+ * We can't drop a dirty block when shrinking the cache.
+ */
+ while (from_cblock(new_size) < from_cblock(cache->cache_size)) {
+ new_size = to_cblock(from_cblock(new_size) + 1);
+ if (is_dirty(cache, new_size)) {
+ DMERR("unable to shrink cache; cache block %llu is dirty",
+ (unsigned long long) from_cblock(new_size));
+ return false;
+ }
+ }
+
+ return true;
+}
+
+static int resize_cache_dev(struct cache *cache, dm_cblock_t new_size)
+{
+ int r;
+
+ r = dm_cache_resize(cache->cmd, new_size);
+ if (r) {
+ DMERR("could not resize cache metadata");
+ return r;
+ }
+
+ set_cache_size(cache, new_size);
+
+ return 0;
+}
+
+static int cache_preresume(struct dm_target *ti)
+{
+ int r = 0;
+ struct cache *cache = ti->private;
+ dm_cblock_t csize = get_cache_dev_size(cache);
+
+ /*
+ * Check to see if the cache has resized.
+ */
+ if (!cache->sized) {
+ r = resize_cache_dev(cache, csize);
+ if (r)
+ return r;
+
+ cache->sized = true;
+
+ } else if (csize != cache->cache_size) {
+ if (!can_resize(cache, csize))
+ return -EINVAL;
+
+ r = resize_cache_dev(cache, csize);
+ if (r)
+ return r;
+ }
+
+ if (!cache->loaded_mappings) {
+ r = dm_cache_load_mappings(cache->cmd, cache->policy,
+ load_mapping, cache);
+ if (r) {
+ DMERR("could not load cache mappings");
+ return r;
+ }
+
+ cache->loaded_mappings = true;
+ }
+
+ if (!cache->loaded_discards) {
+ struct discard_load_info li;
+
+ /*
+ * The discard bitset could have been resized, or the
+ * discard block size changed. To be safe we start by
+ * setting every dblock to not discarded.
+ */
+ clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks));
+
+ discard_load_info_init(cache, &li);
+ r = dm_cache_load_discards(cache->cmd, load_discard, &li);
+ if (r) {
+ DMERR("could not load origin discards");
+ return r;
+ }
+ set_discard_range(&li);
+
+ cache->loaded_discards = true;
+ }
+
+ return r;
+}
+
+static void cache_resume(struct dm_target *ti)
+{
+ struct cache *cache = ti->private;
+
+ cache->need_tick_bio = true;
+ do_waker(&cache->waker.work);
+}
+
+/*
+ * Status format:
+ *
+ * <metadata block size> <#used metadata blocks>/<#total metadata blocks>
+ * <cache block size> <#used cache blocks>/<#total cache blocks>
+ * <#read hits> <#read misses> <#write hits> <#write misses>
+ * <#demotions> <#promotions> <#dirty>
+ * <#features> <features>*
+ * <#core args> <core args>
+ * <policy name> <#policy args> <policy args>*
+ */
+static void cache_status(struct dm_target *ti, status_type_t type,
+ unsigned status_flags, char *result, unsigned maxlen)
+{
+ int r = 0;
+ unsigned i;
+ ssize_t sz = 0;
+ dm_block_t nr_free_blocks_metadata = 0;
+ dm_block_t nr_blocks_metadata = 0;
+ char buf[BDEVNAME_SIZE];
+ struct cache *cache = ti->private;
+ dm_cblock_t residency;
+
+ switch (type) {
+ case STATUSTYPE_INFO:
+ /* Commit to ensure statistics aren't out-of-date */
+ if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti)) {
+ r = dm_cache_commit(cache->cmd, false);
+ if (r)
+ DMERR("could not commit metadata for accurate status");
+ }
+
+ r = dm_cache_get_free_metadata_block_count(cache->cmd,
+ &nr_free_blocks_metadata);
+ if (r) {
+ DMERR("could not get metadata free block count");
+ goto err;
+ }
+
+ r = dm_cache_get_metadata_dev_size(cache->cmd, &nr_blocks_metadata);
+ if (r) {
+ DMERR("could not get metadata device size");
+ goto err;
+ }
+
+ residency = policy_residency(cache->policy);
+
+ DMEMIT("%u %llu/%llu %u %llu/%llu %u %u %u %u %u %u %lu ",
+ (unsigned)DM_CACHE_METADATA_BLOCK_SIZE,
+ (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata),
+ (unsigned long long)nr_blocks_metadata,
+ cache->sectors_per_block,
+ (unsigned long long) from_cblock(residency),
+ (unsigned long long) from_cblock(cache->cache_size),
+ (unsigned) atomic_read(&cache->stats.read_hit),
+ (unsigned) atomic_read(&cache->stats.read_miss),
+ (unsigned) atomic_read(&cache->stats.write_hit),
+ (unsigned) atomic_read(&cache->stats.write_miss),
+ (unsigned) atomic_read(&cache->stats.demotion),
+ (unsigned) atomic_read(&cache->stats.promotion),
+ (unsigned long) atomic_read(&cache->nr_dirty));
+
+ if (writethrough_mode(&cache->features))
+ DMEMIT("1 writethrough ");
+
+ else if (passthrough_mode(&cache->features))
+ DMEMIT("1 passthrough ");
+
+ else if (writeback_mode(&cache->features))
+ DMEMIT("1 writeback ");
+
+ else {
+ DMERR("internal error: unknown io mode: %d", (int) cache->features.io_mode);
+ goto err;
+ }
+
+ DMEMIT("2 migration_threshold %llu ", (unsigned long long) cache->migration_threshold);
+
+ DMEMIT("%s ", dm_cache_policy_get_name(cache->policy));
+ if (sz < maxlen) {
+ r = policy_emit_config_values(cache->policy, result + sz, maxlen - sz);
+ if (r)
+ DMERR("policy_emit_config_values returned %d", r);
+ }
+
+ break;
+
+ case STATUSTYPE_TABLE:
+ format_dev_t(buf, cache->metadata_dev->bdev->bd_dev);
+ DMEMIT("%s ", buf);
+ format_dev_t(buf, cache->cache_dev->bdev->bd_dev);
+ DMEMIT("%s ", buf);
+ format_dev_t(buf, cache->origin_dev->bdev->bd_dev);
+ DMEMIT("%s", buf);
+
+ for (i = 0; i < cache->nr_ctr_args - 1; i++)
+ DMEMIT(" %s", cache->ctr_args[i]);
+ if (cache->nr_ctr_args)
+ DMEMIT(" %s", cache->ctr_args[cache->nr_ctr_args - 1]);
+ }
+
+ return;
+
+err:
+ DMEMIT("Error");
+}
+
+/*
+ * A cache block range can take two forms:
+ *
+ * i) A single cblock, eg. '3456'
+ * ii) A begin and end cblock with dots between, eg. 123-234
+ */
+static int parse_cblock_range(struct cache *cache, const char *str,
+ struct cblock_range *result)
+{
+ char dummy;
+ uint64_t b, e;
+ int r;
+
+ /*
+ * Try and parse form (ii) first.
+ */
+ r = sscanf(str, "%llu-%llu%c", &b, &e, &dummy);
+ if (r < 0)
+ return r;
+
+ if (r == 2) {
+ result->begin = to_cblock(b);
+ result->end = to_cblock(e);
+ return 0;
+ }
+
+ /*
+ * That didn't work, try form (i).
+ */
+ r = sscanf(str, "%llu%c", &b, &dummy);
+ if (r < 0)
+ return r;
+
+ if (r == 1) {
+ result->begin = to_cblock(b);
+ result->end = to_cblock(from_cblock(result->begin) + 1u);
+ return 0;
+ }
+
+ DMERR("invalid cblock range '%s'", str);
+ return -EINVAL;
+}
+
+static int validate_cblock_range(struct cache *cache, struct cblock_range *range)
+{
+ uint64_t b = from_cblock(range->begin);
+ uint64_t e = from_cblock(range->end);
+ uint64_t n = from_cblock(cache->cache_size);
+
+ if (b >= n) {
+ DMERR("begin cblock out of range: %llu >= %llu", b, n);
+ return -EINVAL;
+ }
+
+ if (e > n) {
+ DMERR("end cblock out of range: %llu > %llu", e, n);
+ return -EINVAL;
+ }
+
+ if (b >= e) {
+ DMERR("invalid cblock range: %llu >= %llu", b, e);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int request_invalidation(struct cache *cache, struct cblock_range *range)
+{
+ struct invalidation_request req;
+
+ INIT_LIST_HEAD(&req.list);
+ req.cblocks = range;
+ atomic_set(&req.complete, 0);
+ req.err = 0;
+ init_waitqueue_head(&req.result_wait);
+
+ spin_lock(&cache->invalidation_lock);
+ list_add(&req.list, &cache->invalidation_requests);
+ spin_unlock(&cache->invalidation_lock);
+ wake_worker(cache);
+
+ wait_event(req.result_wait, atomic_read(&req.complete));
+ return req.err;
+}
+
+static int process_invalidate_cblocks_message(struct cache *cache, unsigned count,
+ const char **cblock_ranges)
+{
+ int r = 0;
+ unsigned i;
+ struct cblock_range range;
+
+ if (!passthrough_mode(&cache->features)) {
+ DMERR("cache has to be in passthrough mode for invalidation");
+ return -EPERM;
+ }
+
+ for (i = 0; i < count; i++) {
+ r = parse_cblock_range(cache, cblock_ranges[i], &range);
+ if (r)
+ break;
+
+ r = validate_cblock_range(cache, &range);
+ if (r)
+ break;
+
+ /*
+ * Pass begin and end origin blocks to the worker and wake it.
+ */
+ r = request_invalidation(cache, &range);
+ if (r)
+ break;
+ }
+
+ return r;
+}
+
+/*
+ * Supports
+ * "<key> <value>"
+ * and
+ * "invalidate_cblocks [(<begin>)|(<begin>-<end>)]*
+ *
+ * The key migration_threshold is supported by the cache target core.
+ */
+static int cache_message(struct dm_target *ti, unsigned argc, char **argv)
+{
+ struct cache *cache = ti->private;
+
+ if (!argc)
+ return -EINVAL;
+
+ if (!strcasecmp(argv[0], "invalidate_cblocks"))
+ return process_invalidate_cblocks_message(cache, argc - 1, (const char **) argv + 1);
+
+ if (argc != 2)
+ return -EINVAL;
+
+ return set_config_value(cache, argv[0], argv[1]);
+}
+
+static int cache_iterate_devices(struct dm_target *ti,
+ iterate_devices_callout_fn fn, void *data)
+{
+ int r = 0;
+ struct cache *cache = ti->private;
+
+ r = fn(ti, cache->cache_dev, 0, get_dev_size(cache->cache_dev), data);
+ if (!r)
+ r = fn(ti, cache->origin_dev, 0, ti->len, data);
+
+ return r;
+}
+
+/*
+ * We assume I/O is going to the origin (which is the volume
+ * more likely to have restrictions e.g. by being striped).
+ * (Looking up the exact location of the data would be expensive
+ * and could always be out of date by the time the bio is submitted.)
+ */
+static int cache_bvec_merge(struct dm_target *ti,
+ struct bvec_merge_data *bvm,
+ struct bio_vec *biovec, int max_size)
+{
+ struct cache *cache = ti->private;
+ struct request_queue *q = bdev_get_queue(cache->origin_dev->bdev);
+
+ if (!q->merge_bvec_fn)
+ return max_size;
+
+ bvm->bi_bdev = cache->origin_dev->bdev;
+ return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
+}
+
+static void set_discard_limits(struct cache *cache, struct queue_limits *limits)
+{
+ /*
+ * FIXME: these limits may be incompatible with the cache device
+ */
+ limits->max_discard_sectors = min_t(sector_t, cache->discard_block_size * 1024,
+ cache->origin_sectors);
+ limits->discard_granularity = cache->discard_block_size << SECTOR_SHIFT;
+}
+
+static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits)
+{
+ struct cache *cache = ti->private;
+ uint64_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT;
+
+ /*
+ * If the system-determined stacked limits are compatible with the
+ * cache's blocksize (io_opt is a factor) do not override them.
+ */
+ if (io_opt_sectors < cache->sectors_per_block ||
+ do_div(io_opt_sectors, cache->sectors_per_block)) {
+ blk_limits_io_min(limits, cache->sectors_per_block << SECTOR_SHIFT);
+ blk_limits_io_opt(limits, cache->sectors_per_block << SECTOR_SHIFT);
+ }
+ set_discard_limits(cache, limits);
+}
+
+/*----------------------------------------------------------------*/
+
+static struct target_type cache_target = {
+ .name = "cache",
+ .version = {1, 6, 0},
+ .module = THIS_MODULE,
+ .ctr = cache_ctr,
+ .dtr = cache_dtr,
+ .map = cache_map,
+ .end_io = cache_end_io,
+ .postsuspend = cache_postsuspend,
+ .preresume = cache_preresume,
+ .resume = cache_resume,
+ .status = cache_status,
+ .message = cache_message,
+ .iterate_devices = cache_iterate_devices,
+ .merge = cache_bvec_merge,
+ .io_hints = cache_io_hints,
+};
+
+static int __init dm_cache_init(void)
+{
+ int r;
+
+ r = dm_register_target(&cache_target);
+ if (r) {
+ DMERR("cache target registration failed: %d", r);
+ return r;
+ }
+
+ migration_cache = KMEM_CACHE(dm_cache_migration, 0);
+ if (!migration_cache) {
+ dm_unregister_target(&cache_target);
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static void __exit dm_cache_exit(void)
+{
+ dm_unregister_target(&cache_target);
+ kmem_cache_destroy(migration_cache);
+}
+
+module_init(dm_cache_init);
+module_exit(dm_cache_exit);
+
+MODULE_DESCRIPTION(DM_NAME " cache target");
+MODULE_AUTHOR("Joe Thornber <ejt@redhat.com>");
+MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
new file mode 100644
index 000000000..5503e43e5
--- /dev/null
+++ b/drivers/md/dm-crypt.c
@@ -0,0 +1,2080 @@
+/*
+ * Copyright (C) 2003 Jana Saout <jana@saout.de>
+ * Copyright (C) 2004 Clemens Fruhwirth <clemens@endorphin.org>
+ * Copyright (C) 2006-2009 Red Hat, Inc. All rights reserved.
+ * Copyright (C) 2013 Milan Broz <gmazyland@gmail.com>
+ *
+ * This file is released under the GPL.
+ */
+
+#include <linux/completion.h>
+#include <linux/err.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/mempool.h>
+#include <linux/slab.h>
+#include <linux/crypto.h>
+#include <linux/workqueue.h>
+#include <linux/kthread.h>
+#include <linux/backing-dev.h>
+#include <linux/atomic.h>
+#include <linux/scatterlist.h>
+#include <linux/rbtree.h>
+#include <asm/page.h>
+#include <asm/unaligned.h>
+#include <crypto/hash.h>
+#include <crypto/md5.h>
+#include <crypto/algapi.h>
+
+#include <linux/device-mapper.h>
+
+#define DM_MSG_PREFIX "crypt"
+
+/*
+ * context holding the current state of a multi-part conversion
+ */
+struct convert_context {
+ struct completion restart;
+ struct bio *bio_in;
+ struct bio *bio_out;
+ struct bvec_iter iter_in;
+ struct bvec_iter iter_out;
+ sector_t cc_sector;
+ atomic_t cc_pending;
+ struct ablkcipher_request *req;
+};
+
+/*
+ * per bio private data
+ */
+struct dm_crypt_io {
+ struct crypt_config *cc;
+ struct bio *base_bio;
+ struct work_struct work;
+
+ struct convert_context ctx;
+
+ atomic_t io_pending;
+ int error;
+ sector_t sector;
+
+ struct rb_node rb_node;
+} CRYPTO_MINALIGN_ATTR;
+
+struct dm_crypt_request {
+ struct convert_context *ctx;
+ struct scatterlist sg_in;
+ struct scatterlist sg_out;
+ sector_t iv_sector;
+};
+
+struct crypt_config;
+
+struct crypt_iv_operations {
+ int (*ctr)(struct crypt_config *cc, struct dm_target *ti,
+ const char *opts);
+ void (*dtr)(struct crypt_config *cc);
+ int (*init)(struct crypt_config *cc);
+ int (*wipe)(struct crypt_config *cc);
+ int (*generator)(struct crypt_config *cc, u8 *iv,
+ struct dm_crypt_request *dmreq);
+ int (*post)(struct crypt_config *cc, u8 *iv,
+ struct dm_crypt_request *dmreq);
+};
+
+struct iv_essiv_private {
+ struct crypto_hash *hash_tfm;
+ u8 *salt;
+};
+
+struct iv_benbi_private {
+ int shift;
+};
+
+#define LMK_SEED_SIZE 64 /* hash + 0 */
+struct iv_lmk_private {
+ struct crypto_shash *hash_tfm;
+ u8 *seed;
+};
+
+#define TCW_WHITENING_SIZE 16
+struct iv_tcw_private {
+ struct crypto_shash *crc32_tfm;
+ u8 *iv_seed;
+ u8 *whitening;
+};
+
+/*
+ * Crypt: maps a linear range of a block device
+ * and encrypts / decrypts at the same time.
+ */
+enum flags { DM_CRYPT_SUSPENDED, DM_CRYPT_KEY_VALID,
+ DM_CRYPT_SAME_CPU, DM_CRYPT_NO_OFFLOAD };
+
+/*
+ * The fields in here must be read only after initialization.
+ */
+struct crypt_config {
+ struct dm_dev *dev;
+ sector_t start;
+
+ /*
+ * pool for per bio private data, crypto requests and
+ * encryption requeusts/buffer pages
+ */
+ mempool_t *req_pool;
+ mempool_t *page_pool;
+ struct bio_set *bs;
+ struct mutex bio_alloc_lock;
+
+ struct workqueue_struct *io_queue;
+ struct workqueue_struct *crypt_queue;
+
+ struct task_struct *write_thread;
+ wait_queue_head_t write_thread_wait;
+ struct rb_root write_tree;
+
+ char *cipher;
+ char *cipher_string;
+
+ struct crypt_iv_operations *iv_gen_ops;
+ union {
+ struct iv_essiv_private essiv;
+ struct iv_benbi_private benbi;
+ struct iv_lmk_private lmk;
+ struct iv_tcw_private tcw;
+ } iv_gen_private;
+ sector_t iv_offset;
+ unsigned int iv_size;
+
+ /* ESSIV: struct crypto_cipher *essiv_tfm */
+ void *iv_private;
+ struct crypto_ablkcipher **tfms;
+ unsigned tfms_count;
+
+ /*
+ * Layout of each crypto request:
+ *
+ * struct ablkcipher_request
+ * context
+ * padding
+ * struct dm_crypt_request
+ * padding
+ * IV
+ *
+ * The padding is added so that dm_crypt_request and the IV are
+ * correctly aligned.
+ */
+ unsigned int dmreq_start;
+
+ unsigned int per_bio_data_size;
+
+ unsigned long flags;
+ unsigned int key_size;
+ unsigned int key_parts; /* independent parts in key buffer */
+ unsigned int key_extra_size; /* additional keys length */
+ u8 key[0];
+};
+
+#define MIN_IOS 16
+
+static void clone_init(struct dm_crypt_io *, struct bio *);
+static void kcryptd_queue_crypt(struct dm_crypt_io *io);
+static u8 *iv_of_dmreq(struct crypt_config *cc, struct dm_crypt_request *dmreq);
+
+/*
+ * Use this to access cipher attributes that are the same for each CPU.
+ */
+static struct crypto_ablkcipher *any_tfm(struct crypt_config *cc)
+{
+ return cc->tfms[0];
+}
+
+/*
+ * Different IV generation algorithms:
+ *
+ * plain: the initial vector is the 32-bit little-endian version of the sector
+ * number, padded with zeros if necessary.
+ *
+ * plain64: the initial vector is the 64-bit little-endian version of the sector
+ * number, padded with zeros if necessary.
+ *
+ * essiv: "encrypted sector|salt initial vector", the sector number is
+ * encrypted with the bulk cipher using a salt as key. The salt
+ * should be derived from the bulk cipher's key via hashing.
+ *
+ * benbi: the 64-bit "big-endian 'narrow block'-count", starting at 1
+ * (needed for LRW-32-AES and possible other narrow block modes)
+ *
+ * null: the initial vector is always zero. Provides compatibility with
+ * obsolete loop_fish2 devices. Do not use for new devices.
+ *
+ * lmk: Compatible implementation of the block chaining mode used
+ * by the Loop-AES block device encryption system
+ * designed by Jari Ruusu. See http://loop-aes.sourceforge.net/
+ * It operates on full 512 byte sectors and uses CBC
+ * with an IV derived from the sector number, the data and
+ * optionally extra IV seed.
+ * This means that after decryption the first block
+ * of sector must be tweaked according to decrypted data.
+ * Loop-AES can use three encryption schemes:
+ * version 1: is plain aes-cbc mode
+ * version 2: uses 64 multikey scheme with lmk IV generator
+ * version 3: the same as version 2 with additional IV seed
+ * (it uses 65 keys, last key is used as IV seed)
+ *
+ * tcw: Compatible implementation of the block chaining mode used
+ * by the TrueCrypt device encryption system (prior to version 4.1).
+ * For more info see: https://gitlab.com/cryptsetup/cryptsetup/wikis/TrueCryptOnDiskFormat
+ * It operates on full 512 byte sectors and uses CBC
+ * with an IV derived from initial key and the sector number.
+ * In addition, whitening value is applied on every sector, whitening
+ * is calculated from initial key, sector number and mixed using CRC32.
+ * Note that this encryption scheme is vulnerable to watermarking attacks
+ * and should be used for old compatible containers access only.
+ *
+ * plumb: unimplemented, see:
+ * http://article.gmane.org/gmane.linux.kernel.device-mapper.dm-crypt/454
+ */
+
+static int crypt_iv_plain_gen(struct crypt_config *cc, u8 *iv,
+ struct dm_crypt_request *dmreq)
+{
+ memset(iv, 0, cc->iv_size);
+ *(__le32 *)iv = cpu_to_le32(dmreq->iv_sector & 0xffffffff);
+
+ return 0;
+}
+
+static int crypt_iv_plain64_gen(struct crypt_config *cc, u8 *iv,
+ struct dm_crypt_request *dmreq)
+{
+ memset(iv, 0, cc->iv_size);
+ *(__le64 *)iv = cpu_to_le64(dmreq->iv_sector);
+
+ return 0;
+}
+
+/* Initialise ESSIV - compute salt but no local memory allocations */
+static int crypt_iv_essiv_init(struct crypt_config *cc)
+{
+ struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv;
+ struct hash_desc desc;
+ struct scatterlist sg;
+ struct crypto_cipher *essiv_tfm;
+ int err;
+
+ sg_init_one(&sg, cc->key, cc->key_size);
+ desc.tfm = essiv->hash_tfm;
+ desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
+
+ err = crypto_hash_digest(&desc, &sg, cc->key_size, essiv->salt);
+ if (err)
+ return err;
+
+ essiv_tfm = cc->iv_private;
+
+ err = crypto_cipher_setkey(essiv_tfm, essiv->salt,
+ crypto_hash_digestsize(essiv->hash_tfm));
+ if (err)
+ return err;
+
+ return 0;
+}
+
+/* Wipe salt and reset key derived from volume key */
+static int crypt_iv_essiv_wipe(struct crypt_config *cc)
+{
+ struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv;
+ unsigned salt_size = crypto_hash_digestsize(essiv->hash_tfm);
+ struct crypto_cipher *essiv_tfm;
+ int r, err = 0;
+
+ memset(essiv->salt, 0, salt_size);
+
+ essiv_tfm = cc->iv_private;
+ r = crypto_cipher_setkey(essiv_tfm, essiv->salt, salt_size);
+ if (r)
+ err = r;
+
+ return err;
+}
+
+/* Set up per cpu cipher state */
+static struct crypto_cipher *setup_essiv_cpu(struct crypt_config *cc,
+ struct dm_target *ti,
+ u8 *salt, unsigned saltsize)
+{
+ struct crypto_cipher *essiv_tfm;
+ int err;
+
+ /* Setup the essiv_tfm with the given salt */
+ essiv_tfm = crypto_alloc_cipher(cc->cipher, 0, CRYPTO_ALG_ASYNC);
+ if (IS_ERR(essiv_tfm)) {
+ ti->error = "Error allocating crypto tfm for ESSIV";
+ return essiv_tfm;
+ }
+
+ if (crypto_cipher_blocksize(essiv_tfm) !=
+ crypto_ablkcipher_ivsize(any_tfm(cc))) {
+ ti->error = "Block size of ESSIV cipher does "
+ "not match IV size of block cipher";
+ crypto_free_cipher(essiv_tfm);
+ return ERR_PTR(-EINVAL);
+ }
+
+ err = crypto_cipher_setkey(essiv_tfm, salt, saltsize);
+ if (err) {
+ ti->error = "Failed to set key for ESSIV cipher";
+ crypto_free_cipher(essiv_tfm);
+ return ERR_PTR(err);
+ }
+
+ return essiv_tfm;
+}
+
+static void crypt_iv_essiv_dtr(struct crypt_config *cc)
+{
+ struct crypto_cipher *essiv_tfm;
+ struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv;
+
+ crypto_free_hash(essiv->hash_tfm);
+ essiv->hash_tfm = NULL;
+
+ kzfree(essiv->salt);
+ essiv->salt = NULL;
+
+ essiv_tfm = cc->iv_private;
+
+ if (essiv_tfm)
+ crypto_free_cipher(essiv_tfm);
+
+ cc->iv_private = NULL;
+}
+
+static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
+ const char *opts)
+{
+ struct crypto_cipher *essiv_tfm = NULL;
+ struct crypto_hash *hash_tfm = NULL;
+ u8 *salt = NULL;
+ int err;
+
+ if (!opts) {
+ ti->error = "Digest algorithm missing for ESSIV mode";
+ return -EINVAL;
+ }
+
+ /* Allocate hash algorithm */
+ hash_tfm = crypto_alloc_hash(opts, 0, CRYPTO_ALG_ASYNC);
+ if (IS_ERR(hash_tfm)) {
+ ti->error = "Error initializing ESSIV hash";
+ err = PTR_ERR(hash_tfm);
+ goto bad;
+ }
+
+ salt = kzalloc(crypto_hash_digestsize(hash_tfm), GFP_KERNEL);
+ if (!salt) {
+ ti->error = "Error kmallocing salt storage in ESSIV";
+ err = -ENOMEM;
+ goto bad;
+ }
+
+ cc->iv_gen_private.essiv.salt = salt;
+ cc->iv_gen_private.essiv.hash_tfm = hash_tfm;
+
+ essiv_tfm = setup_essiv_cpu(cc, ti, salt,
+ crypto_hash_digestsize(hash_tfm));
+ if (IS_ERR(essiv_tfm)) {
+ crypt_iv_essiv_dtr(cc);
+ return PTR_ERR(essiv_tfm);
+ }
+ cc->iv_private = essiv_tfm;
+
+ return 0;
+
+bad:
+ if (hash_tfm && !IS_ERR(hash_tfm))
+ crypto_free_hash(hash_tfm);
+ kfree(salt);
+ return err;
+}
+
+static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv,
+ struct dm_crypt_request *dmreq)
+{
+ struct crypto_cipher *essiv_tfm = cc->iv_private;
+
+ memset(iv, 0, cc->iv_size);
+ *(__le64 *)iv = cpu_to_le64(dmreq->iv_sector);
+ crypto_cipher_encrypt_one(essiv_tfm, iv, iv);
+
+ return 0;
+}
+
+static int crypt_iv_benbi_ctr(struct crypt_config *cc, struct dm_target *ti,
+ const char *opts)
+{
+ unsigned bs = crypto_ablkcipher_blocksize(any_tfm(cc));
+ int log = ilog2(bs);
+
+ /* we need to calculate how far we must shift the sector count
+ * to get the cipher block count, we use this shift in _gen */
+
+ if (1 << log != bs) {
+ ti->error = "cypher blocksize is not a power of 2";
+ return -EINVAL;
+ }
+
+ if (log > 9) {
+ ti->error = "cypher blocksize is > 512";
+ return -EINVAL;
+ }
+
+ cc->iv_gen_private.benbi.shift = 9 - log;
+
+ return 0;
+}
+
+static void crypt_iv_benbi_dtr(struct crypt_config *cc)
+{
+}
+
+static int crypt_iv_benbi_gen(struct crypt_config *cc, u8 *iv,
+ struct dm_crypt_request *dmreq)
+{
+ __be64 val;
+
+ memset(iv, 0, cc->iv_size - sizeof(u64)); /* rest is cleared below */
+
+ val = cpu_to_be64(((u64)dmreq->iv_sector << cc->iv_gen_private.benbi.shift) + 1);
+ put_unaligned(val, (__be64 *)(iv + cc->iv_size - sizeof(u64)));
+
+ return 0;
+}
+
+static int crypt_iv_null_gen(struct crypt_config *cc, u8 *iv,
+ struct dm_crypt_request *dmreq)
+{
+ memset(iv, 0, cc->iv_size);
+
+ return 0;
+}
+
+static void crypt_iv_lmk_dtr(struct crypt_config *cc)
+{
+ struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk;
+
+ if (lmk->hash_tfm && !IS_ERR(lmk->hash_tfm))
+ crypto_free_shash(lmk->hash_tfm);
+ lmk->hash_tfm = NULL;
+
+ kzfree(lmk->seed);
+ lmk->seed = NULL;
+}
+
+static int crypt_iv_lmk_ctr(struct crypt_config *cc, struct dm_target *ti,
+ const char *opts)
+{
+ struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk;
+
+ lmk->hash_tfm = crypto_alloc_shash("md5", 0, 0);
+ if (IS_ERR(lmk->hash_tfm)) {
+ ti->error = "Error initializing LMK hash";
+ return PTR_ERR(lmk->hash_tfm);
+ }
+
+ /* No seed in LMK version 2 */
+ if (cc->key_parts == cc->tfms_count) {
+ lmk->seed = NULL;
+ return 0;
+ }
+
+ lmk->seed = kzalloc(LMK_SEED_SIZE, GFP_KERNEL);
+ if (!lmk->seed) {
+ crypt_iv_lmk_dtr(cc);
+ ti->error = "Error kmallocing seed storage in LMK";
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static int crypt_iv_lmk_init(struct crypt_config *cc)
+{
+ struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk;
+ int subkey_size = cc->key_size / cc->key_parts;
+
+ /* LMK seed is on the position of LMK_KEYS + 1 key */
+ if (lmk->seed)
+ memcpy(lmk->seed, cc->key + (cc->tfms_count * subkey_size),
+ crypto_shash_digestsize(lmk->hash_tfm));
+
+ return 0;
+}
+
+static int crypt_iv_lmk_wipe(struct crypt_config *cc)
+{
+ struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk;
+
+ if (lmk->seed)
+ memset(lmk->seed, 0, LMK_SEED_SIZE);
+
+ return 0;
+}
+
+static int crypt_iv_lmk_one(struct crypt_config *cc, u8 *iv,
+ struct dm_crypt_request *dmreq,
+ u8 *data)
+{
+ struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk;
+ SHASH_DESC_ON_STACK(desc, lmk->hash_tfm);
+ struct md5_state md5state;
+ __le32 buf[4];
+ int i, r;
+
+ desc->tfm = lmk->hash_tfm;
+ desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP;
+
+ r = crypto_shash_init(desc);
+ if (r)
+ return r;
+
+ if (lmk->seed) {
+ r = crypto_shash_update(desc, lmk->seed, LMK_SEED_SIZE);
+ if (r)
+ return r;
+ }
+
+ /* Sector is always 512B, block size 16, add data of blocks 1-31 */
+ r = crypto_shash_update(desc, data + 16, 16 * 31);
+ if (r)
+ return r;
+
+ /* Sector is cropped to 56 bits here */
+ buf[0] = cpu_to_le32(dmreq->iv_sector & 0xFFFFFFFF);
+ buf[1] = cpu_to_le32((((u64)dmreq->iv_sector >> 32) & 0x00FFFFFF) | 0x80000000);
+ buf[2] = cpu_to_le32(4024);
+ buf[3] = 0;
+ r = crypto_shash_update(desc, (u8 *)buf, sizeof(buf));
+ if (r)
+ return r;
+
+ /* No MD5 padding here */
+ r = crypto_shash_export(desc, &md5state);
+ if (r)
+ return r;
+
+ for (i = 0; i < MD5_HASH_WORDS; i++)
+ __cpu_to_le32s(&md5state.hash[i]);
+ memcpy(iv, &md5state.hash, cc->iv_size);
+
+ return 0;
+}
+
+static int crypt_iv_lmk_gen(struct crypt_config *cc, u8 *iv,
+ struct dm_crypt_request *dmreq)
+{
+ u8 *src;
+ int r = 0;
+
+ if (bio_data_dir(dmreq->ctx->bio_in) == WRITE) {
+ src = kmap_atomic(sg_page(&dmreq->sg_in));
+ r = crypt_iv_lmk_one(cc, iv, dmreq, src + dmreq->sg_in.offset);
+ kunmap_atomic(src);
+ } else
+ memset(iv, 0, cc->iv_size);
+
+ return r;
+}
+
+static int crypt_iv_lmk_post(struct crypt_config *cc, u8 *iv,
+ struct dm_crypt_request *dmreq)
+{
+ u8 *dst;
+ int r;
+
+ if (bio_data_dir(dmreq->ctx->bio_in) == WRITE)
+ return 0;
+
+ dst = kmap_atomic(sg_page(&dmreq->sg_out));
+ r = crypt_iv_lmk_one(cc, iv, dmreq, dst + dmreq->sg_out.offset);
+
+ /* Tweak the first block of plaintext sector */
+ if (!r)
+ crypto_xor(dst + dmreq->sg_out.offset, iv, cc->iv_size);
+
+ kunmap_atomic(dst);
+ return r;
+}
+
+static void crypt_iv_tcw_dtr(struct crypt_config *cc)
+{
+ struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw;
+
+ kzfree(tcw->iv_seed);
+ tcw->iv_seed = NULL;
+ kzfree(tcw->whitening);
+ tcw->whitening = NULL;
+
+ if (tcw->crc32_tfm && !IS_ERR(tcw->crc32_tfm))
+ crypto_free_shash(tcw->crc32_tfm);
+ tcw->crc32_tfm = NULL;
+}
+
+static int crypt_iv_tcw_ctr(struct crypt_config *cc, struct dm_target *ti,
+ const char *opts)
+{
+ struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw;
+
+ if (cc->key_size <= (cc->iv_size + TCW_WHITENING_SIZE)) {
+ ti->error = "Wrong key size for TCW";
+ return -EINVAL;
+ }
+
+ tcw->crc32_tfm = crypto_alloc_shash("crc32", 0, 0);
+ if (IS_ERR(tcw->crc32_tfm)) {
+ ti->error = "Error initializing CRC32 in TCW";
+ return PTR_ERR(tcw->crc32_tfm);
+ }
+
+ tcw->iv_seed = kzalloc(cc->iv_size, GFP_KERNEL);
+ tcw->whitening = kzalloc(TCW_WHITENING_SIZE, GFP_KERNEL);
+ if (!tcw->iv_seed || !tcw->whitening) {
+ crypt_iv_tcw_dtr(cc);
+ ti->error = "Error allocating seed storage in TCW";
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static int crypt_iv_tcw_init(struct crypt_config *cc)
+{
+ struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw;
+ int key_offset = cc->key_size - cc->iv_size - TCW_WHITENING_SIZE;
+
+ memcpy(tcw->iv_seed, &cc->key[key_offset], cc->iv_size);
+ memcpy(tcw->whitening, &cc->key[key_offset + cc->iv_size],
+ TCW_WHITENING_SIZE);
+
+ return 0;
+}
+
+static int crypt_iv_tcw_wipe(struct crypt_config *cc)
+{
+ struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw;
+
+ memset(tcw->iv_seed, 0, cc->iv_size);
+ memset(tcw->whitening, 0, TCW_WHITENING_SIZE);
+
+ return 0;
+}
+
+static int crypt_iv_tcw_whitening(struct crypt_config *cc,
+ struct dm_crypt_request *dmreq,
+ u8 *data)
+{
+ struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw;
+ u64 sector = cpu_to_le64((u64)dmreq->iv_sector);
+ u8 buf[TCW_WHITENING_SIZE];
+ SHASH_DESC_ON_STACK(desc, tcw->crc32_tfm);
+ int i, r;
+
+ /* xor whitening with sector number */
+ memcpy(buf, tcw->whitening, TCW_WHITENING_SIZE);
+ crypto_xor(buf, (u8 *)&sector, 8);
+ crypto_xor(&buf[8], (u8 *)&sector, 8);
+
+ /* calculate crc32 for every 32bit part and xor it */
+ desc->tfm = tcw->crc32_tfm;
+ desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP;
+ for (i = 0; i < 4; i++) {
+ r = crypto_shash_init(desc);
+ if (r)
+ goto out;
+ r = crypto_shash_update(desc, &buf[i * 4], 4);
+ if (r)
+ goto out;
+ r = crypto_shash_final(desc, &buf[i * 4]);
+ if (r)
+ goto out;
+ }
+ crypto_xor(&buf[0], &buf[12], 4);
+ crypto_xor(&buf[4], &buf[8], 4);
+
+ /* apply whitening (8 bytes) to whole sector */
+ for (i = 0; i < ((1 << SECTOR_SHIFT) / 8); i++)
+ crypto_xor(data + i * 8, buf, 8);
+out:
+ memzero_explicit(buf, sizeof(buf));
+ return r;
+}
+
+static int crypt_iv_tcw_gen(struct crypt_config *cc, u8 *iv,
+ struct dm_crypt_request *dmreq)
+{
+ struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw;
+ u64 sector = cpu_to_le64((u64)dmreq->iv_sector);
+ u8 *src;
+ int r = 0;
+
+ /* Remove whitening from ciphertext */
+ if (bio_data_dir(dmreq->ctx->bio_in) != WRITE) {
+ src = kmap_atomic(sg_page(&dmreq->sg_in));
+ r = crypt_iv_tcw_whitening(cc, dmreq, src + dmreq->sg_in.offset);
+ kunmap_atomic(src);
+ }
+
+ /* Calculate IV */
+ memcpy(iv, tcw->iv_seed, cc->iv_size);
+ crypto_xor(iv, (u8 *)&sector, 8);
+ if (cc->iv_size > 8)
+ crypto_xor(&iv[8], (u8 *)&sector, cc->iv_size - 8);
+
+ return r;
+}
+
+static int crypt_iv_tcw_post(struct crypt_config *cc, u8 *iv,
+ struct dm_crypt_request *dmreq)
+{
+ u8 *dst;
+ int r;
+
+ if (bio_data_dir(dmreq->ctx->bio_in) != WRITE)
+ return 0;
+
+ /* Apply whitening on ciphertext */
+ dst = kmap_atomic(sg_page(&dmreq->sg_out));
+ r = crypt_iv_tcw_whitening(cc, dmreq, dst + dmreq->sg_out.offset);
+ kunmap_atomic(dst);
+
+ return r;
+}
+
+static struct crypt_iv_operations crypt_iv_plain_ops = {
+ .generator = crypt_iv_plain_gen
+};
+
+static struct crypt_iv_operations crypt_iv_plain64_ops = {
+ .generator = crypt_iv_plain64_gen
+};
+
+static struct crypt_iv_operations crypt_iv_essiv_ops = {
+ .ctr = crypt_iv_essiv_ctr,
+ .dtr = crypt_iv_essiv_dtr,
+ .init = crypt_iv_essiv_init,
+ .wipe = crypt_iv_essiv_wipe,
+ .generator = crypt_iv_essiv_gen
+};
+
+static struct crypt_iv_operations crypt_iv_benbi_ops = {
+ .ctr = crypt_iv_benbi_ctr,
+ .dtr = crypt_iv_benbi_dtr,
+ .generator = crypt_iv_benbi_gen
+};
+
+static struct crypt_iv_operations crypt_iv_null_ops = {
+ .generator = crypt_iv_null_gen
+};
+
+static struct crypt_iv_operations crypt_iv_lmk_ops = {
+ .ctr = crypt_iv_lmk_ctr,
+ .dtr = crypt_iv_lmk_dtr,
+ .init = crypt_iv_lmk_init,
+ .wipe = crypt_iv_lmk_wipe,
+ .generator = crypt_iv_lmk_gen,
+ .post = crypt_iv_lmk_post
+};
+
+static struct crypt_iv_operations crypt_iv_tcw_ops = {
+ .ctr = crypt_iv_tcw_ctr,
+ .dtr = crypt_iv_tcw_dtr,
+ .init = crypt_iv_tcw_init,
+ .wipe = crypt_iv_tcw_wipe,
+ .generator = crypt_iv_tcw_gen,
+ .post = crypt_iv_tcw_post
+};
+
+static void crypt_convert_init(struct crypt_config *cc,
+ struct convert_context *ctx,
+ struct bio *bio_out, struct bio *bio_in,
+ sector_t sector)
+{
+ ctx->bio_in = bio_in;
+ ctx->bio_out = bio_out;
+ if (bio_in)
+ ctx->iter_in = bio_in->bi_iter;
+ if (bio_out)
+ ctx->iter_out = bio_out->bi_iter;
+ ctx->cc_sector = sector + cc->iv_offset;
+ init_completion(&ctx->restart);
+}
+
+static struct dm_crypt_request *dmreq_of_req(struct crypt_config *cc,
+ struct ablkcipher_request *req)
+{
+ return (struct dm_crypt_request *)((char *)req + cc->dmreq_start);
+}
+
+static struct ablkcipher_request *req_of_dmreq(struct crypt_config *cc,
+ struct dm_crypt_request *dmreq)
+{
+ return (struct ablkcipher_request *)((char *)dmreq - cc->dmreq_start);
+}
+
+static u8 *iv_of_dmreq(struct crypt_config *cc,
+ struct dm_crypt_request *dmreq)
+{
+ return (u8 *)ALIGN((unsigned long)(dmreq + 1),
+ crypto_ablkcipher_alignmask(any_tfm(cc)) + 1);
+}
+
+static int crypt_convert_block(struct crypt_config *cc,
+ struct convert_context *ctx,
+ struct ablkcipher_request *req)
+{
+ struct bio_vec bv_in = bio_iter_iovec(ctx->bio_in, ctx->iter_in);
+ struct bio_vec bv_out = bio_iter_iovec(ctx->bio_out, ctx->iter_out);
+ struct dm_crypt_request *dmreq;
+ u8 *iv;
+ int r;
+
+ dmreq = dmreq_of_req(cc, req);
+ iv = iv_of_dmreq(cc, dmreq);
+
+ dmreq->iv_sector = ctx->cc_sector;
+ dmreq->ctx = ctx;
+ sg_init_table(&dmreq->sg_in, 1);
+ sg_set_page(&dmreq->sg_in, bv_in.bv_page, 1 << SECTOR_SHIFT,
+ bv_in.bv_offset);
+
+ sg_init_table(&dmreq->sg_out, 1);
+ sg_set_page(&dmreq->sg_out, bv_out.bv_page, 1 << SECTOR_SHIFT,
+ bv_out.bv_offset);
+
+ bio_advance_iter(ctx->bio_in, &ctx->iter_in, 1 << SECTOR_SHIFT);
+ bio_advance_iter(ctx->bio_out, &ctx->iter_out, 1 << SECTOR_SHIFT);
+
+ if (cc->iv_gen_ops) {
+ r = cc->iv_gen_ops->generator(cc, iv, dmreq);
+ if (r < 0)
+ return r;
+ }
+
+ ablkcipher_request_set_crypt(req, &dmreq->sg_in, &dmreq->sg_out,
+ 1 << SECTOR_SHIFT, iv);
+
+ if (bio_data_dir(ctx->bio_in) == WRITE)
+ r = crypto_ablkcipher_encrypt(req);
+ else
+ r = crypto_ablkcipher_decrypt(req);
+
+ if (!r && cc->iv_gen_ops && cc->iv_gen_ops->post)
+ r = cc->iv_gen_ops->post(cc, iv, dmreq);
+
+ return r;
+}
+
+static void kcryptd_async_done(struct crypto_async_request *async_req,
+ int error);
+
+static void crypt_alloc_req(struct crypt_config *cc,
+ struct convert_context *ctx)
+{
+ unsigned key_index = ctx->cc_sector & (cc->tfms_count - 1);
+
+ if (!ctx->req)
+ ctx->req = mempool_alloc(cc->req_pool, GFP_NOIO);
+
+ ablkcipher_request_set_tfm(ctx->req, cc->tfms[key_index]);
+ ablkcipher_request_set_callback(ctx->req,
+ CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
+ kcryptd_async_done, dmreq_of_req(cc, ctx->req));
+}
+
+static void crypt_free_req(struct crypt_config *cc,
+ struct ablkcipher_request *req, struct bio *base_bio)
+{
+ struct dm_crypt_io *io = dm_per_bio_data(base_bio, cc->per_bio_data_size);
+
+ if ((struct ablkcipher_request *)(io + 1) != req)
+ mempool_free(req, cc->req_pool);
+}
+
+/*
+ * Encrypt / decrypt data from one bio to another one (can be the same one)
+ */
+static int crypt_convert(struct crypt_config *cc,
+ struct convert_context *ctx)
+{
+ int r;
+
+ atomic_set(&ctx->cc_pending, 1);
+
+ while (ctx->iter_in.bi_size && ctx->iter_out.bi_size) {
+
+ crypt_alloc_req(cc, ctx);
+
+ atomic_inc(&ctx->cc_pending);
+
+ r = crypt_convert_block(cc, ctx, ctx->req);
+
+ switch (r) {
+ /* async */
+ case -EBUSY:
+ wait_for_completion(&ctx->restart);
+ reinit_completion(&ctx->restart);
+ /* fall through*/
+ case -EINPROGRESS:
+ ctx->req = NULL;
+ ctx->cc_sector++;
+ continue;
+
+ /* sync */
+ case 0:
+ atomic_dec(&ctx->cc_pending);
+ ctx->cc_sector++;
+ cond_resched();
+ continue;
+
+ /* error */
+ default:
+ atomic_dec(&ctx->cc_pending);
+ return r;
+ }
+ }
+
+ return 0;
+}
+
+static void crypt_free_buffer_pages(struct crypt_config *cc, struct bio *clone);
+
+/*
+ * Generate a new unfragmented bio with the given size
+ * This should never violate the device limitations
+ *
+ * This function may be called concurrently. If we allocate from the mempool
+ * concurrently, there is a possibility of deadlock. For example, if we have
+ * mempool of 256 pages, two processes, each wanting 256, pages allocate from
+ * the mempool concurrently, it may deadlock in a situation where both processes
+ * have allocated 128 pages and the mempool is exhausted.
+ *
+ * In order to avoid this scenario we allocate the pages under a mutex.
+ *
+ * In order to not degrade performance with excessive locking, we try
+ * non-blocking allocations without a mutex first but on failure we fallback
+ * to blocking allocations with a mutex.
+ */
+static struct bio *crypt_alloc_buffer(struct dm_crypt_io *io, unsigned size)
+{
+ struct crypt_config *cc = io->cc;
+ struct bio *clone;
+ unsigned int nr_iovecs = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ gfp_t gfp_mask = GFP_NOWAIT | __GFP_HIGHMEM;
+ unsigned i, len, remaining_size;
+ struct page *page;
+ struct bio_vec *bvec;
+
+retry:
+ if (unlikely(gfp_mask & __GFP_WAIT))
+ mutex_lock(&cc->bio_alloc_lock);
+
+ clone = bio_alloc_bioset(GFP_NOIO, nr_iovecs, cc->bs);
+ if (!clone)
+ goto return_clone;
+
+ clone_init(io, clone);
+
+ remaining_size = size;
+
+ for (i = 0; i < nr_iovecs; i++) {
+ page = mempool_alloc(cc->page_pool, gfp_mask);
+ if (!page) {
+ crypt_free_buffer_pages(cc, clone);
+ bio_put(clone);
+ gfp_mask |= __GFP_WAIT;
+ goto retry;
+ }
+
+ len = (remaining_size > PAGE_SIZE) ? PAGE_SIZE : remaining_size;
+
+ bvec = &clone->bi_io_vec[clone->bi_vcnt++];
+ bvec->bv_page = page;
+ bvec->bv_len = len;
+ bvec->bv_offset = 0;
+
+ clone->bi_iter.bi_size += len;
+
+ remaining_size -= len;
+ }
+
+return_clone:
+ if (unlikely(gfp_mask & __GFP_WAIT))
+ mutex_unlock(&cc->bio_alloc_lock);
+
+ return clone;
+}
+
+static void crypt_free_buffer_pages(struct crypt_config *cc, struct bio *clone)
+{
+ unsigned int i;
+ struct bio_vec *bv;
+
+ bio_for_each_segment_all(bv, clone, i) {
+ BUG_ON(!bv->bv_page);
+ mempool_free(bv->bv_page, cc->page_pool);
+ bv->bv_page = NULL;
+ }
+}
+
+static void crypt_io_init(struct dm_crypt_io *io, struct crypt_config *cc,
+ struct bio *bio, sector_t sector)
+{
+ io->cc = cc;
+ io->base_bio = bio;
+ io->sector = sector;
+ io->error = 0;
+ io->ctx.req = NULL;
+ atomic_set(&io->io_pending, 0);
+}
+
+static void crypt_inc_pending(struct dm_crypt_io *io)
+{
+ atomic_inc(&io->io_pending);
+}
+
+/*
+ * One of the bios was finished. Check for completion of
+ * the whole request and correctly clean up the buffer.
+ */
+static void crypt_dec_pending(struct dm_crypt_io *io)
+{
+ struct crypt_config *cc = io->cc;
+ struct bio *base_bio = io->base_bio;
+ int error = io->error;
+
+ if (!atomic_dec_and_test(&io->io_pending))
+ return;
+
+ if (io->ctx.req)
+ crypt_free_req(cc, io->ctx.req, base_bio);
+
+ bio_endio(base_bio, error);
+}
+
+/*
+ * kcryptd/kcryptd_io:
+ *
+ * Needed because it would be very unwise to do decryption in an
+ * interrupt context.
+ *
+ * kcryptd performs the actual encryption or decryption.
+ *
+ * kcryptd_io performs the IO submission.
+ *
+ * They must be separated as otherwise the final stages could be
+ * starved by new requests which can block in the first stages due
+ * to memory allocation.
+ *
+ * The work is done per CPU global for all dm-crypt instances.
+ * They should not depend on each other and do not block.
+ */
+static void crypt_endio(struct bio *clone, int error)
+{
+ struct dm_crypt_io *io = clone->bi_private;
+ struct crypt_config *cc = io->cc;
+ unsigned rw = bio_data_dir(clone);
+
+ if (unlikely(!bio_flagged(clone, BIO_UPTODATE) && !error))
+ error = -EIO;
+
+ /*
+ * free the processed pages
+ */
+ if (rw == WRITE)
+ crypt_free_buffer_pages(cc, clone);
+
+ bio_put(clone);
+
+ if (rw == READ && !error) {
+ kcryptd_queue_crypt(io);
+ return;
+ }
+
+ if (unlikely(error))
+ io->error = error;
+
+ crypt_dec_pending(io);
+}
+
+static void clone_init(struct dm_crypt_io *io, struct bio *clone)
+{
+ struct crypt_config *cc = io->cc;
+
+ clone->bi_private = io;
+ clone->bi_end_io = crypt_endio;
+ clone->bi_bdev = cc->dev->bdev;
+ clone->bi_rw = io->base_bio->bi_rw;
+}
+
+static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp)
+{
+ struct crypt_config *cc = io->cc;
+ struct bio *clone;
+
+ /*
+ * We need the original biovec array in order to decrypt
+ * the whole bio data *afterwards* -- thanks to immutable
+ * biovecs we don't need to worry about the block layer
+ * modifying the biovec array; so leverage bio_clone_fast().
+ */
+ clone = bio_clone_fast(io->base_bio, gfp, cc->bs);
+ if (!clone)
+ return 1;
+
+ crypt_inc_pending(io);
+
+ clone_init(io, clone);
+ clone->bi_iter.bi_sector = cc->start + io->sector;
+
+ generic_make_request(clone);
+ return 0;
+}
+
+static void kcryptd_io_read_work(struct work_struct *work)
+{
+ struct dm_crypt_io *io = container_of(work, struct dm_crypt_io, work);
+
+ crypt_inc_pending(io);
+ if (kcryptd_io_read(io, GFP_NOIO))
+ io->error = -ENOMEM;
+ crypt_dec_pending(io);
+}
+
+static void kcryptd_queue_read(struct dm_crypt_io *io)
+{
+ struct crypt_config *cc = io->cc;
+
+ INIT_WORK(&io->work, kcryptd_io_read_work);
+ queue_work(cc->io_queue, &io->work);
+}
+
+static void kcryptd_io_write(struct dm_crypt_io *io)
+{
+ struct bio *clone = io->ctx.bio_out;
+
+ generic_make_request(clone);
+}
+
+#define crypt_io_from_node(node) rb_entry((node), struct dm_crypt_io, rb_node)
+
+static int dmcrypt_write(void *data)
+{
+ struct crypt_config *cc = data;
+ struct dm_crypt_io *io;
+
+ while (1) {
+ struct rb_root write_tree;
+ struct blk_plug plug;
+
+ DECLARE_WAITQUEUE(wait, current);
+
+ spin_lock_irq(&cc->write_thread_wait.lock);
+continue_locked:
+
+ if (!RB_EMPTY_ROOT(&cc->write_tree))
+ goto pop_from_list;
+
+ __set_current_state(TASK_INTERRUPTIBLE);
+ __add_wait_queue(&cc->write_thread_wait, &wait);
+
+ spin_unlock_irq(&cc->write_thread_wait.lock);
+
+ if (unlikely(kthread_should_stop())) {
+ set_task_state(current, TASK_RUNNING);
+ remove_wait_queue(&cc->write_thread_wait, &wait);
+ break;
+ }
+
+ schedule();
+
+ set_task_state(current, TASK_RUNNING);
+ spin_lock_irq(&cc->write_thread_wait.lock);
+ __remove_wait_queue(&cc->write_thread_wait, &wait);
+ goto continue_locked;
+
+pop_from_list:
+ write_tree = cc->write_tree;
+ cc->write_tree = RB_ROOT;
+ spin_unlock_irq(&cc->write_thread_wait.lock);
+
+ BUG_ON(rb_parent(write_tree.rb_node));
+
+ /*
+ * Note: we cannot walk the tree here with rb_next because
+ * the structures may be freed when kcryptd_io_write is called.
+ */
+ blk_start_plug(&plug);
+ do {
+ io = crypt_io_from_node(rb_first(&write_tree));
+ rb_erase(&io->rb_node, &write_tree);
+ kcryptd_io_write(io);
+ } while (!RB_EMPTY_ROOT(&write_tree));
+ blk_finish_plug(&plug);
+ }
+ return 0;
+}
+
+static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io, int async)
+{
+ struct bio *clone = io->ctx.bio_out;
+ struct crypt_config *cc = io->cc;
+ unsigned long flags;
+ sector_t sector;
+ struct rb_node **rbp, *parent;
+
+ if (unlikely(io->error < 0)) {
+ crypt_free_buffer_pages(cc, clone);
+ bio_put(clone);
+ crypt_dec_pending(io);
+ return;
+ }
+
+ /* crypt_convert should have filled the clone bio */
+ BUG_ON(io->ctx.iter_out.bi_size);
+
+ clone->bi_iter.bi_sector = cc->start + io->sector;
+
+ if (likely(!async) && test_bit(DM_CRYPT_NO_OFFLOAD, &cc->flags)) {
+ generic_make_request(clone);
+ return;
+ }
+
+ spin_lock_irqsave(&cc->write_thread_wait.lock, flags);
+ rbp = &cc->write_tree.rb_node;
+ parent = NULL;
+ sector = io->sector;
+ while (*rbp) {
+ parent = *rbp;
+ if (sector < crypt_io_from_node(parent)->sector)
+ rbp = &(*rbp)->rb_left;
+ else
+ rbp = &(*rbp)->rb_right;
+ }
+ rb_link_node(&io->rb_node, parent, rbp);
+ rb_insert_color(&io->rb_node, &cc->write_tree);
+
+ wake_up_locked(&cc->write_thread_wait);
+ spin_unlock_irqrestore(&cc->write_thread_wait.lock, flags);
+}
+
+static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
+{
+ struct crypt_config *cc = io->cc;
+ struct bio *clone;
+ int crypt_finished;
+ sector_t sector = io->sector;
+ int r;
+
+ /*
+ * Prevent io from disappearing until this function completes.
+ */
+ crypt_inc_pending(io);
+ crypt_convert_init(cc, &io->ctx, NULL, io->base_bio, sector);
+
+ clone = crypt_alloc_buffer(io, io->base_bio->bi_iter.bi_size);
+ if (unlikely(!clone)) {
+ io->error = -EIO;
+ goto dec;
+ }
+
+ io->ctx.bio_out = clone;
+ io->ctx.iter_out = clone->bi_iter;
+
+ sector += bio_sectors(clone);
+
+ crypt_inc_pending(io);
+ r = crypt_convert(cc, &io->ctx);
+ if (r)
+ io->error = -EIO;
+ crypt_finished = atomic_dec_and_test(&io->ctx.cc_pending);
+
+ /* Encryption was already finished, submit io now */
+ if (crypt_finished) {
+ kcryptd_crypt_write_io_submit(io, 0);
+ io->sector = sector;
+ }
+
+dec:
+ crypt_dec_pending(io);
+}
+
+static void kcryptd_crypt_read_done(struct dm_crypt_io *io)
+{
+ crypt_dec_pending(io);
+}
+
+static void kcryptd_crypt_read_convert(struct dm_crypt_io *io)
+{
+ struct crypt_config *cc = io->cc;
+ int r = 0;
+
+ crypt_inc_pending(io);
+
+ crypt_convert_init(cc, &io->ctx, io->base_bio, io->base_bio,
+ io->sector);
+
+ r = crypt_convert(cc, &io->ctx);
+ if (r < 0)
+ io->error = -EIO;
+
+ if (atomic_dec_and_test(&io->ctx.cc_pending))
+ kcryptd_crypt_read_done(io);
+
+ crypt_dec_pending(io);
+}
+
+static void kcryptd_async_done(struct crypto_async_request *async_req,
+ int error)
+{
+ struct dm_crypt_request *dmreq = async_req->data;
+ struct convert_context *ctx = dmreq->ctx;
+ struct dm_crypt_io *io = container_of(ctx, struct dm_crypt_io, ctx);
+ struct crypt_config *cc = io->cc;
+
+ if (error == -EINPROGRESS) {
+ complete(&ctx->restart);
+ return;
+ }
+
+ if (!error && cc->iv_gen_ops && cc->iv_gen_ops->post)
+ error = cc->iv_gen_ops->post(cc, iv_of_dmreq(cc, dmreq), dmreq);
+
+ if (error < 0)
+ io->error = -EIO;
+
+ crypt_free_req(cc, req_of_dmreq(cc, dmreq), io->base_bio);
+
+ if (!atomic_dec_and_test(&ctx->cc_pending))
+ return;
+
+ if (bio_data_dir(io->base_bio) == READ)
+ kcryptd_crypt_read_done(io);
+ else
+ kcryptd_crypt_write_io_submit(io, 1);
+}
+
+static void kcryptd_crypt(struct work_struct *work)
+{
+ struct dm_crypt_io *io = container_of(work, struct dm_crypt_io, work);
+
+ if (bio_data_dir(io->base_bio) == READ)
+ kcryptd_crypt_read_convert(io);
+ else
+ kcryptd_crypt_write_convert(io);
+}
+
+static void kcryptd_queue_crypt(struct dm_crypt_io *io)
+{
+ struct crypt_config *cc = io->cc;
+
+ INIT_WORK(&io->work, kcryptd_crypt);
+ queue_work(cc->crypt_queue, &io->work);
+}
+
+/*
+ * Decode key from its hex representation
+ */
+static int crypt_decode_key(u8 *key, char *hex, unsigned int size)
+{
+ char buffer[3];
+ unsigned int i;
+
+ buffer[2] = '\0';
+
+ for (i = 0; i < size; i++) {
+ buffer[0] = *hex++;
+ buffer[1] = *hex++;
+
+ if (kstrtou8(buffer, 16, &key[i]))
+ return -EINVAL;
+ }
+
+ if (*hex != '\0')
+ return -EINVAL;
+
+ return 0;
+}
+
+static void crypt_free_tfms(struct crypt_config *cc)
+{
+ unsigned i;
+
+ if (!cc->tfms)
+ return;
+
+ for (i = 0; i < cc->tfms_count; i++)
+ if (cc->tfms[i] && !IS_ERR(cc->tfms[i])) {
+ crypto_free_ablkcipher(cc->tfms[i]);
+ cc->tfms[i] = NULL;
+ }
+
+ kfree(cc->tfms);
+ cc->tfms = NULL;
+}
+
+static int crypt_alloc_tfms(struct crypt_config *cc, char *ciphermode)
+{
+ unsigned i;
+ int err;
+
+ cc->tfms = kmalloc(cc->tfms_count * sizeof(struct crypto_ablkcipher *),
+ GFP_KERNEL);
+ if (!cc->tfms)
+ return -ENOMEM;
+
+ for (i = 0; i < cc->tfms_count; i++) {
+ cc->tfms[i] = crypto_alloc_ablkcipher(ciphermode, 0, 0);
+ if (IS_ERR(cc->tfms[i])) {
+ err = PTR_ERR(cc->tfms[i]);
+ crypt_free_tfms(cc);
+ return err;
+ }
+ }
+
+ return 0;
+}
+
+static int crypt_setkey_allcpus(struct crypt_config *cc)
+{
+ unsigned subkey_size;
+ int err = 0, i, r;
+
+ /* Ignore extra keys (which are used for IV etc) */
+ subkey_size = (cc->key_size - cc->key_extra_size) >> ilog2(cc->tfms_count);
+
+ for (i = 0; i < cc->tfms_count; i++) {
+ r = crypto_ablkcipher_setkey(cc->tfms[i],
+ cc->key + (i * subkey_size),
+ subkey_size);
+ if (r)
+ err = r;
+ }
+
+ return err;
+}
+
+static int crypt_set_key(struct crypt_config *cc, char *key)
+{
+ int r = -EINVAL;
+ int key_string_len = strlen(key);
+
+ /* The key size may not be changed. */
+ if (cc->key_size != (key_string_len >> 1))
+ goto out;
+
+ /* Hyphen (which gives a key_size of zero) means there is no key. */
+ if (!cc->key_size && strcmp(key, "-"))
+ goto out;
+
+ if (cc->key_size && crypt_decode_key(cc->key, key, cc->key_size) < 0)
+ goto out;
+
+ set_bit(DM_CRYPT_KEY_VALID, &cc->flags);
+
+ r = crypt_setkey_allcpus(cc);
+
+out:
+ /* Hex key string not needed after here, so wipe it. */
+ memset(key, '0', key_string_len);
+
+ return r;
+}
+
+static int crypt_wipe_key(struct crypt_config *cc)
+{
+ clear_bit(DM_CRYPT_KEY_VALID, &cc->flags);
+ memset(&cc->key, 0, cc->key_size * sizeof(u8));
+
+ return crypt_setkey_allcpus(cc);
+}
+
+static void crypt_dtr(struct dm_target *ti)
+{
+ struct crypt_config *cc = ti->private;
+
+ ti->private = NULL;
+
+ if (!cc)
+ return;
+
+ if (cc->write_thread)
+ kthread_stop(cc->write_thread);
+
+ if (cc->io_queue)
+ destroy_workqueue(cc->io_queue);
+ if (cc->crypt_queue)
+ destroy_workqueue(cc->crypt_queue);
+
+ crypt_free_tfms(cc);
+
+ if (cc->bs)
+ bioset_free(cc->bs);
+
+ if (cc->page_pool)
+ mempool_destroy(cc->page_pool);
+ if (cc->req_pool)
+ mempool_destroy(cc->req_pool);
+
+ if (cc->iv_gen_ops && cc->iv_gen_ops->dtr)
+ cc->iv_gen_ops->dtr(cc);
+
+ if (cc->dev)
+ dm_put_device(ti, cc->dev);
+
+ kzfree(cc->cipher);
+ kzfree(cc->cipher_string);
+
+ /* Must zero key material before freeing */
+ kzfree(cc);
+}
+
+static int crypt_ctr_cipher(struct dm_target *ti,
+ char *cipher_in, char *key)
+{
+ struct crypt_config *cc = ti->private;
+ char *tmp, *cipher, *chainmode, *ivmode, *ivopts, *keycount;
+ char *cipher_api = NULL;
+ int ret = -EINVAL;
+ char dummy;
+
+ /* Convert to crypto api definition? */
+ if (strchr(cipher_in, '(')) {
+ ti->error = "Bad cipher specification";
+ return -EINVAL;
+ }
+
+ cc->cipher_string = kstrdup(cipher_in, GFP_KERNEL);
+ if (!cc->cipher_string)
+ goto bad_mem;
+
+ /*
+ * Legacy dm-crypt cipher specification
+ * cipher[:keycount]-mode-iv:ivopts
+ */
+ tmp = cipher_in;
+ keycount = strsep(&tmp, "-");
+ cipher = strsep(&keycount, ":");
+
+ if (!keycount)
+ cc->tfms_count = 1;
+ else if (sscanf(keycount, "%u%c", &cc->tfms_count, &dummy) != 1 ||
+ !is_power_of_2(cc->tfms_count)) {
+ ti->error = "Bad cipher key count specification";
+ return -EINVAL;
+ }
+ cc->key_parts = cc->tfms_count;
+ cc->key_extra_size = 0;
+
+ cc->cipher = kstrdup(cipher, GFP_KERNEL);
+ if (!cc->cipher)
+ goto bad_mem;
+
+ chainmode = strsep(&tmp, "-");
+ ivopts = strsep(&tmp, "-");
+ ivmode = strsep(&ivopts, ":");
+
+ if (tmp)
+ DMWARN("Ignoring unexpected additional cipher options");
+
+ /*
+ * For compatibility with the original dm-crypt mapping format, if
+ * only the cipher name is supplied, use cbc-plain.
+ */
+ if (!chainmode || (!strcmp(chainmode, "plain") && !ivmode)) {
+ chainmode = "cbc";
+ ivmode = "plain";
+ }
+
+ if (strcmp(chainmode, "ecb") && !ivmode) {
+ ti->error = "IV mechanism required";
+ return -EINVAL;
+ }
+
+ cipher_api = kmalloc(CRYPTO_MAX_ALG_NAME, GFP_KERNEL);
+ if (!cipher_api)
+ goto bad_mem;
+
+ ret = snprintf(cipher_api, CRYPTO_MAX_ALG_NAME,
+ "%s(%s)", chainmode, cipher);
+ if (ret < 0) {
+ kfree(cipher_api);
+ goto bad_mem;
+ }
+
+ /* Allocate cipher */
+ ret = crypt_alloc_tfms(cc, cipher_api);
+ if (ret < 0) {
+ ti->error = "Error allocating crypto tfm";
+ goto bad;
+ }
+
+ /* Initialize IV */
+ cc->iv_size = crypto_ablkcipher_ivsize(any_tfm(cc));
+ if (cc->iv_size)
+ /* at least a 64 bit sector number should fit in our buffer */
+ cc->iv_size = max(cc->iv_size,
+ (unsigned int)(sizeof(u64) / sizeof(u8)));
+ else if (ivmode) {
+ DMWARN("Selected cipher does not support IVs");
+ ivmode = NULL;
+ }
+
+ /* Choose ivmode, see comments at iv code. */
+ if (ivmode == NULL)
+ cc->iv_gen_ops = NULL;
+ else if (strcmp(ivmode, "plain") == 0)
+ cc->iv_gen_ops = &crypt_iv_plain_ops;
+ else if (strcmp(ivmode, "plain64") == 0)
+ cc->iv_gen_ops = &crypt_iv_plain64_ops;
+ else if (strcmp(ivmode, "essiv") == 0)
+ cc->iv_gen_ops = &crypt_iv_essiv_ops;
+ else if (strcmp(ivmode, "benbi") == 0)
+ cc->iv_gen_ops = &crypt_iv_benbi_ops;
+ else if (strcmp(ivmode, "null") == 0)
+ cc->iv_gen_ops = &crypt_iv_null_ops;
+ else if (strcmp(ivmode, "lmk") == 0) {
+ cc->iv_gen_ops = &crypt_iv_lmk_ops;
+ /*
+ * Version 2 and 3 is recognised according
+ * to length of provided multi-key string.
+ * If present (version 3), last key is used as IV seed.
+ * All keys (including IV seed) are always the same size.
+ */
+ if (cc->key_size % cc->key_parts) {
+ cc->key_parts++;
+ cc->key_extra_size = cc->key_size / cc->key_parts;
+ }
+ } else if (strcmp(ivmode, "tcw") == 0) {
+ cc->iv_gen_ops = &crypt_iv_tcw_ops;
+ cc->key_parts += 2; /* IV + whitening */
+ cc->key_extra_size = cc->iv_size + TCW_WHITENING_SIZE;
+ } else {
+ ret = -EINVAL;
+ ti->error = "Invalid IV mode";
+ goto bad;
+ }
+
+ /* Initialize and set key */
+ ret = crypt_set_key(cc, key);
+ if (ret < 0) {
+ ti->error = "Error decoding and setting key";
+ goto bad;
+ }
+
+ /* Allocate IV */
+ if (cc->iv_gen_ops && cc->iv_gen_ops->ctr) {
+ ret = cc->iv_gen_ops->ctr(cc, ti, ivopts);
+ if (ret < 0) {
+ ti->error = "Error creating IV";
+ goto bad;
+ }
+ }
+
+ /* Initialize IV (set keys for ESSIV etc) */
+ if (cc->iv_gen_ops && cc->iv_gen_ops->init) {
+ ret = cc->iv_gen_ops->init(cc);
+ if (ret < 0) {
+ ti->error = "Error initialising IV";
+ goto bad;
+ }
+ }
+
+ ret = 0;
+bad:
+ kfree(cipher_api);
+ return ret;
+
+bad_mem:
+ ti->error = "Cannot allocate cipher strings";
+ return -ENOMEM;
+}
+
+/*
+ * Construct an encryption mapping:
+ * <cipher> <key> <iv_offset> <dev_path> <start>
+ */
+static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
+{
+ struct crypt_config *cc;
+ unsigned int key_size, opt_params;
+ unsigned long long tmpll;
+ int ret;
+ size_t iv_size_padding;
+ struct dm_arg_set as;
+ const char *opt_string;
+ char dummy;
+
+ static struct dm_arg _args[] = {
+ {0, 3, "Invalid number of feature args"},
+ };
+
+ if (argc < 5) {
+ ti->error = "Not enough arguments";
+ return -EINVAL;
+ }
+
+ key_size = strlen(argv[1]) >> 1;
+
+ cc = kzalloc(sizeof(*cc) + key_size * sizeof(u8), GFP_KERNEL);
+ if (!cc) {
+ ti->error = "Cannot allocate encryption context";
+ return -ENOMEM;
+ }
+ cc->key_size = key_size;
+
+ ti->private = cc;
+ ret = crypt_ctr_cipher(ti, argv[0], argv[1]);
+ if (ret < 0)
+ goto bad;
+
+ cc->dmreq_start = sizeof(struct ablkcipher_request);
+ cc->dmreq_start += crypto_ablkcipher_reqsize(any_tfm(cc));
+ cc->dmreq_start = ALIGN(cc->dmreq_start, __alignof__(struct dm_crypt_request));
+
+ if (crypto_ablkcipher_alignmask(any_tfm(cc)) < CRYPTO_MINALIGN) {
+ /* Allocate the padding exactly */
+ iv_size_padding = -(cc->dmreq_start + sizeof(struct dm_crypt_request))
+ & crypto_ablkcipher_alignmask(any_tfm(cc));
+ } else {
+ /*
+ * If the cipher requires greater alignment than kmalloc
+ * alignment, we don't know the exact position of the
+ * initialization vector. We must assume worst case.
+ */
+ iv_size_padding = crypto_ablkcipher_alignmask(any_tfm(cc));
+ }
+
+ ret = -ENOMEM;
+ cc->req_pool = mempool_create_kmalloc_pool(MIN_IOS, cc->dmreq_start +
+ sizeof(struct dm_crypt_request) + iv_size_padding + cc->iv_size);
+ if (!cc->req_pool) {
+ ti->error = "Cannot allocate crypt request mempool";
+ goto bad;
+ }
+
+ cc->per_bio_data_size = ti->per_bio_data_size =
+ ALIGN(sizeof(struct dm_crypt_io) + cc->dmreq_start +
+ sizeof(struct dm_crypt_request) + iv_size_padding + cc->iv_size,
+ ARCH_KMALLOC_MINALIGN);
+
+ cc->page_pool = mempool_create_page_pool(BIO_MAX_PAGES, 0);
+ if (!cc->page_pool) {
+ ti->error = "Cannot allocate page mempool";
+ goto bad;
+ }
+
+ cc->bs = bioset_create(MIN_IOS, 0);
+ if (!cc->bs) {
+ ti->error = "Cannot allocate crypt bioset";
+ goto bad;
+ }
+
+ mutex_init(&cc->bio_alloc_lock);
+
+ ret = -EINVAL;
+ if (sscanf(argv[2], "%llu%c", &tmpll, &dummy) != 1) {
+ ti->error = "Invalid iv_offset sector";
+ goto bad;
+ }
+ cc->iv_offset = tmpll;
+
+ if (dm_get_device(ti, argv[3], dm_table_get_mode(ti->table), &cc->dev)) {
+ ti->error = "Device lookup failed";
+ goto bad;
+ }
+
+ if (sscanf(argv[4], "%llu%c", &tmpll, &dummy) != 1) {
+ ti->error = "Invalid device sector";
+ goto bad;
+ }
+ cc->start = tmpll;
+
+ argv += 5;
+ argc -= 5;
+
+ /* Optional parameters */
+ if (argc) {
+ as.argc = argc;
+ as.argv = argv;
+
+ ret = dm_read_arg_group(_args, &as, &opt_params, &ti->error);
+ if (ret)
+ goto bad;
+
+ ret = -EINVAL;
+ while (opt_params--) {
+ opt_string = dm_shift_arg(&as);
+ if (!opt_string) {
+ ti->error = "Not enough feature arguments";
+ goto bad;
+ }
+
+ if (!strcasecmp(opt_string, "allow_discards"))
+ ti->num_discard_bios = 1;
+
+ else if (!strcasecmp(opt_string, "same_cpu_crypt"))
+ set_bit(DM_CRYPT_SAME_CPU, &cc->flags);
+
+ else if (!strcasecmp(opt_string, "submit_from_crypt_cpus"))
+ set_bit(DM_CRYPT_NO_OFFLOAD, &cc->flags);
+
+ else {
+ ti->error = "Invalid feature arguments";
+ goto bad;
+ }
+ }
+ }
+
+ ret = -ENOMEM;
+ cc->io_queue = alloc_workqueue("kcryptd_io", WQ_MEM_RECLAIM, 1);
+ if (!cc->io_queue) {
+ ti->error = "Couldn't create kcryptd io queue";
+ goto bad;
+ }
+
+ if (test_bit(DM_CRYPT_SAME_CPU, &cc->flags))
+ cc->crypt_queue = alloc_workqueue("kcryptd", WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM, 1);
+ else
+ cc->crypt_queue = alloc_workqueue("kcryptd", WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM | WQ_UNBOUND,
+ num_online_cpus());
+ if (!cc->crypt_queue) {
+ ti->error = "Couldn't create kcryptd queue";
+ goto bad;
+ }
+
+ init_waitqueue_head(&cc->write_thread_wait);
+ cc->write_tree = RB_ROOT;
+
+ cc->write_thread = kthread_create(dmcrypt_write, cc, "dmcrypt_write");
+ if (IS_ERR(cc->write_thread)) {
+ ret = PTR_ERR(cc->write_thread);
+ cc->write_thread = NULL;
+ ti->error = "Couldn't spawn write thread";
+ goto bad;
+ }
+ wake_up_process(cc->write_thread);
+
+ ti->num_flush_bios = 1;
+ ti->discard_zeroes_data_unsupported = true;
+
+ return 0;
+
+bad:
+ crypt_dtr(ti);
+ return ret;
+}
+
+static int crypt_map(struct dm_target *ti, struct bio *bio)
+{
+ struct dm_crypt_io *io;
+ struct crypt_config *cc = ti->private;
+
+ /*
+ * If bio is REQ_FLUSH or REQ_DISCARD, just bypass crypt queues.
+ * - for REQ_FLUSH device-mapper core ensures that no IO is in-flight
+ * - for REQ_DISCARD caller must use flush if IO ordering matters
+ */
+ if (unlikely(bio->bi_rw & (REQ_FLUSH | REQ_DISCARD))) {
+ bio->bi_bdev = cc->dev->bdev;
+ if (bio_sectors(bio))
+ bio->bi_iter.bi_sector = cc->start +
+ dm_target_offset(ti, bio->bi_iter.bi_sector);
+ return DM_MAPIO_REMAPPED;
+ }
+
+ io = dm_per_bio_data(bio, cc->per_bio_data_size);
+ crypt_io_init(io, cc, bio, dm_target_offset(ti, bio->bi_iter.bi_sector));
+ io->ctx.req = (struct ablkcipher_request *)(io + 1);
+
+ if (bio_data_dir(io->base_bio) == READ) {
+ if (kcryptd_io_read(io, GFP_NOWAIT))
+ kcryptd_queue_read(io);
+ } else
+ kcryptd_queue_crypt(io);
+
+ return DM_MAPIO_SUBMITTED;
+}
+
+static void crypt_status(struct dm_target *ti, status_type_t type,
+ unsigned status_flags, char *result, unsigned maxlen)
+{
+ struct crypt_config *cc = ti->private;
+ unsigned i, sz = 0;
+ int num_feature_args = 0;
+
+ switch (type) {
+ case STATUSTYPE_INFO:
+ result[0] = '\0';
+ break;
+
+ case STATUSTYPE_TABLE:
+ DMEMIT("%s ", cc->cipher_string);
+
+ if (cc->key_size > 0)
+ for (i = 0; i < cc->key_size; i++)
+ DMEMIT("%02x", cc->key[i]);
+ else
+ DMEMIT("-");
+
+ DMEMIT(" %llu %s %llu", (unsigned long long)cc->iv_offset,
+ cc->dev->name, (unsigned long long)cc->start);
+
+ num_feature_args += !!ti->num_discard_bios;
+ num_feature_args += test_bit(DM_CRYPT_SAME_CPU, &cc->flags);
+ num_feature_args += test_bit(DM_CRYPT_NO_OFFLOAD, &cc->flags);
+ if (num_feature_args) {
+ DMEMIT(" %d", num_feature_args);
+ if (ti->num_discard_bios)
+ DMEMIT(" allow_discards");
+ if (test_bit(DM_CRYPT_SAME_CPU, &cc->flags))
+ DMEMIT(" same_cpu_crypt");
+ if (test_bit(DM_CRYPT_NO_OFFLOAD, &cc->flags))
+ DMEMIT(" submit_from_crypt_cpus");
+ }
+
+ break;
+ }
+}
+
+static void crypt_postsuspend(struct dm_target *ti)
+{
+ struct crypt_config *cc = ti->private;
+
+ set_bit(DM_CRYPT_SUSPENDED, &cc->flags);
+}
+
+static int crypt_preresume(struct dm_target *ti)
+{
+ struct crypt_config *cc = ti->private;
+
+ if (!test_bit(DM_CRYPT_KEY_VALID, &cc->flags)) {
+ DMERR("aborting resume - crypt key is not set.");
+ return -EAGAIN;
+ }
+
+ return 0;
+}
+
+static void crypt_resume(struct dm_target *ti)
+{
+ struct crypt_config *cc = ti->private;
+
+ clear_bit(DM_CRYPT_SUSPENDED, &cc->flags);
+}
+
+/* Message interface
+ * key set <key>
+ * key wipe
+ */
+static int crypt_message(struct dm_target *ti, unsigned argc, char **argv)
+{
+ struct crypt_config *cc = ti->private;
+ int ret = -EINVAL;
+
+ if (argc < 2)
+ goto error;
+
+ if (!strcasecmp(argv[0], "key")) {
+ if (!test_bit(DM_CRYPT_SUSPENDED, &cc->flags)) {
+ DMWARN("not suspended during key manipulation.");
+ return -EINVAL;
+ }
+ if (argc == 3 && !strcasecmp(argv[1], "set")) {
+ ret = crypt_set_key(cc, argv[2]);
+ if (ret)
+ return ret;
+ if (cc->iv_gen_ops && cc->iv_gen_ops->init)
+ ret = cc->iv_gen_ops->init(cc);
+ return ret;
+ }
+ if (argc == 2 && !strcasecmp(argv[1], "wipe")) {
+ if (cc->iv_gen_ops && cc->iv_gen_ops->wipe) {
+ ret = cc->iv_gen_ops->wipe(cc);
+ if (ret)
+ return ret;
+ }
+ return crypt_wipe_key(cc);
+ }
+ }
+
+error:
+ DMWARN("unrecognised message received.");
+ return -EINVAL;
+}
+
+static int crypt_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
+ struct bio_vec *biovec, int max_size)
+{
+ struct crypt_config *cc = ti->private;
+ struct request_queue *q = bdev_get_queue(cc->dev->bdev);
+
+ if (!q->merge_bvec_fn)
+ return max_size;
+
+ bvm->bi_bdev = cc->dev->bdev;
+ bvm->bi_sector = cc->start + dm_target_offset(ti, bvm->bi_sector);
+
+ return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
+}
+
+static int crypt_iterate_devices(struct dm_target *ti,
+ iterate_devices_callout_fn fn, void *data)
+{
+ struct crypt_config *cc = ti->private;
+
+ return fn(ti, cc->dev, cc->start, ti->len, data);
+}
+
+static struct target_type crypt_target = {
+ .name = "crypt",
+ .version = {1, 14, 0},
+ .module = THIS_MODULE,
+ .ctr = crypt_ctr,
+ .dtr = crypt_dtr,
+ .map = crypt_map,
+ .status = crypt_status,
+ .postsuspend = crypt_postsuspend,
+ .preresume = crypt_preresume,
+ .resume = crypt_resume,
+ .message = crypt_message,
+ .merge = crypt_merge,
+ .iterate_devices = crypt_iterate_devices,
+};
+
+static int __init dm_crypt_init(void)
+{
+ int r;
+
+ r = dm_register_target(&crypt_target);
+ if (r < 0)
+ DMERR("register failed %d", r);
+
+ return r;
+}
+
+static void __exit dm_crypt_exit(void)
+{
+ dm_unregister_target(&crypt_target);
+}
+
+module_init(dm_crypt_init);
+module_exit(dm_crypt_exit);
+
+MODULE_AUTHOR("Jana Saout <jana@saout.de>");
+MODULE_DESCRIPTION(DM_NAME " target for transparent encryption / decryption");
+MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c
new file mode 100644
index 000000000..57b6a1901
--- /dev/null
+++ b/drivers/md/dm-delay.c
@@ -0,0 +1,373 @@
+/*
+ * Copyright (C) 2005-2007 Red Hat GmbH
+ *
+ * A target that delays reads and/or writes and can send
+ * them to different devices.
+ *
+ * This file is released under the GPL.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/blkdev.h>
+#include <linux/bio.h>
+#include <linux/slab.h>
+
+#include <linux/device-mapper.h>
+
+#define DM_MSG_PREFIX "delay"
+
+struct delay_c {
+ struct timer_list delay_timer;
+ struct mutex timer_lock;
+ struct workqueue_struct *kdelayd_wq;
+ struct work_struct flush_expired_bios;
+ struct list_head delayed_bios;
+ atomic_t may_delay;
+
+ struct dm_dev *dev_read;
+ sector_t start_read;
+ unsigned read_delay;
+ unsigned reads;
+
+ struct dm_dev *dev_write;
+ sector_t start_write;
+ unsigned write_delay;
+ unsigned writes;
+};
+
+struct dm_delay_info {
+ struct delay_c *context;
+ struct list_head list;
+ unsigned long expires;
+};
+
+static DEFINE_MUTEX(delayed_bios_lock);
+
+static void handle_delayed_timer(unsigned long data)
+{
+ struct delay_c *dc = (struct delay_c *)data;
+
+ queue_work(dc->kdelayd_wq, &dc->flush_expired_bios);
+}
+
+static void queue_timeout(struct delay_c *dc, unsigned long expires)
+{
+ mutex_lock(&dc->timer_lock);
+
+ if (!timer_pending(&dc->delay_timer) || expires < dc->delay_timer.expires)
+ mod_timer(&dc->delay_timer, expires);
+
+ mutex_unlock(&dc->timer_lock);
+}
+
+static void flush_bios(struct bio *bio)
+{
+ struct bio *n;
+
+ while (bio) {
+ n = bio->bi_next;
+ bio->bi_next = NULL;
+ generic_make_request(bio);
+ bio = n;
+ }
+}
+
+static struct bio *flush_delayed_bios(struct delay_c *dc, int flush_all)
+{
+ struct dm_delay_info *delayed, *next;
+ unsigned long next_expires = 0;
+ int start_timer = 0;
+ struct bio_list flush_bios = { };
+
+ mutex_lock(&delayed_bios_lock);
+ list_for_each_entry_safe(delayed, next, &dc->delayed_bios, list) {
+ if (flush_all || time_after_eq(jiffies, delayed->expires)) {
+ struct bio *bio = dm_bio_from_per_bio_data(delayed,
+ sizeof(struct dm_delay_info));
+ list_del(&delayed->list);
+ bio_list_add(&flush_bios, bio);
+ if ((bio_data_dir(bio) == WRITE))
+ delayed->context->writes--;
+ else
+ delayed->context->reads--;
+ continue;
+ }
+
+ if (!start_timer) {
+ start_timer = 1;
+ next_expires = delayed->expires;
+ } else
+ next_expires = min(next_expires, delayed->expires);
+ }
+
+ mutex_unlock(&delayed_bios_lock);
+
+ if (start_timer)
+ queue_timeout(dc, next_expires);
+
+ return bio_list_get(&flush_bios);
+}
+
+static void flush_expired_bios(struct work_struct *work)
+{
+ struct delay_c *dc;
+
+ dc = container_of(work, struct delay_c, flush_expired_bios);
+ flush_bios(flush_delayed_bios(dc, 0));
+}
+
+/*
+ * Mapping parameters:
+ * <device> <offset> <delay> [<write_device> <write_offset> <write_delay>]
+ *
+ * With separate write parameters, the first set is only used for reads.
+ * Delays are specified in milliseconds.
+ */
+static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv)
+{
+ struct delay_c *dc;
+ unsigned long long tmpll;
+ char dummy;
+
+ if (argc != 3 && argc != 6) {
+ ti->error = "requires exactly 3 or 6 arguments";
+ return -EINVAL;
+ }
+
+ dc = kmalloc(sizeof(*dc), GFP_KERNEL);
+ if (!dc) {
+ ti->error = "Cannot allocate context";
+ return -ENOMEM;
+ }
+
+ dc->reads = dc->writes = 0;
+
+ if (sscanf(argv[1], "%llu%c", &tmpll, &dummy) != 1) {
+ ti->error = "Invalid device sector";
+ goto bad;
+ }
+ dc->start_read = tmpll;
+
+ if (sscanf(argv[2], "%u%c", &dc->read_delay, &dummy) != 1) {
+ ti->error = "Invalid delay";
+ goto bad;
+ }
+
+ if (dm_get_device(ti, argv[0], dm_table_get_mode(ti->table),
+ &dc->dev_read)) {
+ ti->error = "Device lookup failed";
+ goto bad;
+ }
+
+ dc->dev_write = NULL;
+ if (argc == 3)
+ goto out;
+
+ if (sscanf(argv[4], "%llu%c", &tmpll, &dummy) != 1) {
+ ti->error = "Invalid write device sector";
+ goto bad_dev_read;
+ }
+ dc->start_write = tmpll;
+
+ if (sscanf(argv[5], "%u%c", &dc->write_delay, &dummy) != 1) {
+ ti->error = "Invalid write delay";
+ goto bad_dev_read;
+ }
+
+ if (dm_get_device(ti, argv[3], dm_table_get_mode(ti->table),
+ &dc->dev_write)) {
+ ti->error = "Write device lookup failed";
+ goto bad_dev_read;
+ }
+
+out:
+ dc->kdelayd_wq = alloc_workqueue("kdelayd", WQ_MEM_RECLAIM, 0);
+ if (!dc->kdelayd_wq) {
+ DMERR("Couldn't start kdelayd");
+ goto bad_queue;
+ }
+
+ setup_timer(&dc->delay_timer, handle_delayed_timer, (unsigned long)dc);
+
+ INIT_WORK(&dc->flush_expired_bios, flush_expired_bios);
+ INIT_LIST_HEAD(&dc->delayed_bios);
+ mutex_init(&dc->timer_lock);
+ atomic_set(&dc->may_delay, 1);
+
+ ti->num_flush_bios = 1;
+ ti->num_discard_bios = 1;
+ ti->per_bio_data_size = sizeof(struct dm_delay_info);
+ ti->private = dc;
+ return 0;
+
+bad_queue:
+ if (dc->dev_write)
+ dm_put_device(ti, dc->dev_write);
+bad_dev_read:
+ dm_put_device(ti, dc->dev_read);
+bad:
+ kfree(dc);
+ return -EINVAL;
+}
+
+static void delay_dtr(struct dm_target *ti)
+{
+ struct delay_c *dc = ti->private;
+
+ destroy_workqueue(dc->kdelayd_wq);
+
+ dm_put_device(ti, dc->dev_read);
+
+ if (dc->dev_write)
+ dm_put_device(ti, dc->dev_write);
+
+ kfree(dc);
+}
+
+static int delay_bio(struct delay_c *dc, int delay, struct bio *bio)
+{
+ struct dm_delay_info *delayed;
+ unsigned long expires = 0;
+
+ if (!delay || !atomic_read(&dc->may_delay))
+ return 1;
+
+ delayed = dm_per_bio_data(bio, sizeof(struct dm_delay_info));
+
+ delayed->context = dc;
+ delayed->expires = expires = jiffies + msecs_to_jiffies(delay);
+
+ mutex_lock(&delayed_bios_lock);
+
+ if (bio_data_dir(bio) == WRITE)
+ dc->writes++;
+ else
+ dc->reads++;
+
+ list_add_tail(&delayed->list, &dc->delayed_bios);
+
+ mutex_unlock(&delayed_bios_lock);
+
+ queue_timeout(dc, expires);
+
+ return 0;
+}
+
+static void delay_presuspend(struct dm_target *ti)
+{
+ struct delay_c *dc = ti->private;
+
+ atomic_set(&dc->may_delay, 0);
+ del_timer_sync(&dc->delay_timer);
+ flush_bios(flush_delayed_bios(dc, 1));
+}
+
+static void delay_resume(struct dm_target *ti)
+{
+ struct delay_c *dc = ti->private;
+
+ atomic_set(&dc->may_delay, 1);
+}
+
+static int delay_map(struct dm_target *ti, struct bio *bio)
+{
+ struct delay_c *dc = ti->private;
+
+ if ((bio_data_dir(bio) == WRITE) && (dc->dev_write)) {
+ bio->bi_bdev = dc->dev_write->bdev;
+ if (bio_sectors(bio))
+ bio->bi_iter.bi_sector = dc->start_write +
+ dm_target_offset(ti, bio->bi_iter.bi_sector);
+
+ return delay_bio(dc, dc->write_delay, bio);
+ }
+
+ bio->bi_bdev = dc->dev_read->bdev;
+ bio->bi_iter.bi_sector = dc->start_read +
+ dm_target_offset(ti, bio->bi_iter.bi_sector);
+
+ return delay_bio(dc, dc->read_delay, bio);
+}
+
+static void delay_status(struct dm_target *ti, status_type_t type,
+ unsigned status_flags, char *result, unsigned maxlen)
+{
+ struct delay_c *dc = ti->private;
+ int sz = 0;
+
+ switch (type) {
+ case STATUSTYPE_INFO:
+ DMEMIT("%u %u", dc->reads, dc->writes);
+ break;
+
+ case STATUSTYPE_TABLE:
+ DMEMIT("%s %llu %u", dc->dev_read->name,
+ (unsigned long long) dc->start_read,
+ dc->read_delay);
+ if (dc->dev_write)
+ DMEMIT(" %s %llu %u", dc->dev_write->name,
+ (unsigned long long) dc->start_write,
+ dc->write_delay);
+ break;
+ }
+}
+
+static int delay_iterate_devices(struct dm_target *ti,
+ iterate_devices_callout_fn fn, void *data)
+{
+ struct delay_c *dc = ti->private;
+ int ret = 0;
+
+ ret = fn(ti, dc->dev_read, dc->start_read, ti->len, data);
+ if (ret)
+ goto out;
+
+ if (dc->dev_write)
+ ret = fn(ti, dc->dev_write, dc->start_write, ti->len, data);
+
+out:
+ return ret;
+}
+
+static struct target_type delay_target = {
+ .name = "delay",
+ .version = {1, 2, 1},
+ .module = THIS_MODULE,
+ .ctr = delay_ctr,
+ .dtr = delay_dtr,
+ .map = delay_map,
+ .presuspend = delay_presuspend,
+ .resume = delay_resume,
+ .status = delay_status,
+ .iterate_devices = delay_iterate_devices,
+};
+
+static int __init dm_delay_init(void)
+{
+ int r;
+
+ r = dm_register_target(&delay_target);
+ if (r < 0) {
+ DMERR("register failed %d", r);
+ goto bad_register;
+ }
+
+ return 0;
+
+bad_register:
+ return r;
+}
+
+static void __exit dm_delay_exit(void)
+{
+ dm_unregister_target(&delay_target);
+}
+
+/* Module hooks */
+module_init(dm_delay_init);
+module_exit(dm_delay_exit);
+
+MODULE_DESCRIPTION(DM_NAME " delay target");
+MODULE_AUTHOR("Heinz Mauelshagen <mauelshagen@redhat.com>");
+MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm-era-target.c b/drivers/md/dm-era-target.c
new file mode 100644
index 000000000..ad913cd4a
--- /dev/null
+++ b/drivers/md/dm-era-target.c
@@ -0,0 +1,1747 @@
+#include "dm.h"
+#include "persistent-data/dm-transaction-manager.h"
+#include "persistent-data/dm-bitset.h"
+#include "persistent-data/dm-space-map.h"
+
+#include <linux/dm-io.h>
+#include <linux/dm-kcopyd.h>
+#include <linux/init.h>
+#include <linux/mempool.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+
+#define DM_MSG_PREFIX "era"
+
+#define SUPERBLOCK_LOCATION 0
+#define SUPERBLOCK_MAGIC 2126579579
+#define SUPERBLOCK_CSUM_XOR 146538381
+#define MIN_ERA_VERSION 1
+#define MAX_ERA_VERSION 1
+#define INVALID_WRITESET_ROOT SUPERBLOCK_LOCATION
+#define MIN_BLOCK_SIZE 8
+
+/*----------------------------------------------------------------
+ * Writeset
+ *--------------------------------------------------------------*/
+struct writeset_metadata {
+ uint32_t nr_bits;
+ dm_block_t root;
+};
+
+struct writeset {
+ struct writeset_metadata md;
+
+ /*
+ * An in core copy of the bits to save constantly doing look ups on
+ * disk.
+ */
+ unsigned long *bits;
+};
+
+/*
+ * This does not free off the on disk bitset as this will normally be done
+ * after digesting into the era array.
+ */
+static void writeset_free(struct writeset *ws)
+{
+ vfree(ws->bits);
+}
+
+static int setup_on_disk_bitset(struct dm_disk_bitset *info,
+ unsigned nr_bits, dm_block_t *root)
+{
+ int r;
+
+ r = dm_bitset_empty(info, root);
+ if (r)
+ return r;
+
+ return dm_bitset_resize(info, *root, 0, nr_bits, false, root);
+}
+
+static size_t bitset_size(unsigned nr_bits)
+{
+ return sizeof(unsigned long) * dm_div_up(nr_bits, BITS_PER_LONG);
+}
+
+/*
+ * Allocates memory for the in core bitset.
+ */
+static int writeset_alloc(struct writeset *ws, dm_block_t nr_blocks)
+{
+ ws->md.nr_bits = nr_blocks;
+ ws->md.root = INVALID_WRITESET_ROOT;
+ ws->bits = vzalloc(bitset_size(nr_blocks));
+ if (!ws->bits) {
+ DMERR("%s: couldn't allocate in memory bitset", __func__);
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+/*
+ * Wipes the in-core bitset, and creates a new on disk bitset.
+ */
+static int writeset_init(struct dm_disk_bitset *info, struct writeset *ws)
+{
+ int r;
+
+ memset(ws->bits, 0, bitset_size(ws->md.nr_bits));
+
+ r = setup_on_disk_bitset(info, ws->md.nr_bits, &ws->md.root);
+ if (r) {
+ DMERR("%s: setup_on_disk_bitset failed", __func__);
+ return r;
+ }
+
+ return 0;
+}
+
+static bool writeset_marked(struct writeset *ws, dm_block_t block)
+{
+ return test_bit(block, ws->bits);
+}
+
+static int writeset_marked_on_disk(struct dm_disk_bitset *info,
+ struct writeset_metadata *m, dm_block_t block,
+ bool *result)
+{
+ dm_block_t old = m->root;
+
+ /*
+ * The bitset was flushed when it was archived, so we know there'll
+ * be no change to the root.
+ */
+ int r = dm_bitset_test_bit(info, m->root, block, &m->root, result);
+ if (r) {
+ DMERR("%s: dm_bitset_test_bit failed", __func__);
+ return r;
+ }
+
+ BUG_ON(m->root != old);
+
+ return r;
+}
+
+/*
+ * Returns < 0 on error, 0 if the bit wasn't previously set, 1 if it was.
+ */
+static int writeset_test_and_set(struct dm_disk_bitset *info,
+ struct writeset *ws, uint32_t block)
+{
+ int r;
+
+ if (!test_and_set_bit(block, ws->bits)) {
+ r = dm_bitset_set_bit(info, ws->md.root, block, &ws->md.root);
+ if (r) {
+ /* FIXME: fail mode */
+ return r;
+ }
+
+ return 0;
+ }
+
+ return 1;
+}
+
+/*----------------------------------------------------------------
+ * On disk metadata layout
+ *--------------------------------------------------------------*/
+#define SPACE_MAP_ROOT_SIZE 128
+#define UUID_LEN 16
+
+struct writeset_disk {
+ __le32 nr_bits;
+ __le64 root;
+} __packed;
+
+struct superblock_disk {
+ __le32 csum;
+ __le32 flags;
+ __le64 blocknr;
+
+ __u8 uuid[UUID_LEN];
+ __le64 magic;
+ __le32 version;
+
+ __u8 metadata_space_map_root[SPACE_MAP_ROOT_SIZE];
+
+ __le32 data_block_size;
+ __le32 metadata_block_size;
+ __le32 nr_blocks;
+
+ __le32 current_era;
+ struct writeset_disk current_writeset;
+
+ /*
+ * Only these two fields are valid within the metadata snapshot.
+ */
+ __le64 writeset_tree_root;
+ __le64 era_array_root;
+
+ __le64 metadata_snap;
+} __packed;
+
+/*----------------------------------------------------------------
+ * Superblock validation
+ *--------------------------------------------------------------*/
+static void sb_prepare_for_write(struct dm_block_validator *v,
+ struct dm_block *b,
+ size_t sb_block_size)
+{
+ struct superblock_disk *disk = dm_block_data(b);
+
+ disk->blocknr = cpu_to_le64(dm_block_location(b));
+ disk->csum = cpu_to_le32(dm_bm_checksum(&disk->flags,
+ sb_block_size - sizeof(__le32),
+ SUPERBLOCK_CSUM_XOR));
+}
+
+static int check_metadata_version(struct superblock_disk *disk)
+{
+ uint32_t metadata_version = le32_to_cpu(disk->version);
+ if (metadata_version < MIN_ERA_VERSION || metadata_version > MAX_ERA_VERSION) {
+ DMERR("Era metadata version %u found, but only versions between %u and %u supported.",
+ metadata_version, MIN_ERA_VERSION, MAX_ERA_VERSION);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int sb_check(struct dm_block_validator *v,
+ struct dm_block *b,
+ size_t sb_block_size)
+{
+ struct superblock_disk *disk = dm_block_data(b);
+ __le32 csum_le;
+
+ if (dm_block_location(b) != le64_to_cpu(disk->blocknr)) {
+ DMERR("sb_check failed: blocknr %llu: wanted %llu",
+ le64_to_cpu(disk->blocknr),
+ (unsigned long long)dm_block_location(b));
+ return -ENOTBLK;
+ }
+
+ if (le64_to_cpu(disk->magic) != SUPERBLOCK_MAGIC) {
+ DMERR("sb_check failed: magic %llu: wanted %llu",
+ le64_to_cpu(disk->magic),
+ (unsigned long long) SUPERBLOCK_MAGIC);
+ return -EILSEQ;
+ }
+
+ csum_le = cpu_to_le32(dm_bm_checksum(&disk->flags,
+ sb_block_size - sizeof(__le32),
+ SUPERBLOCK_CSUM_XOR));
+ if (csum_le != disk->csum) {
+ DMERR("sb_check failed: csum %u: wanted %u",
+ le32_to_cpu(csum_le), le32_to_cpu(disk->csum));
+ return -EILSEQ;
+ }
+
+ return check_metadata_version(disk);
+}
+
+static struct dm_block_validator sb_validator = {
+ .name = "superblock",
+ .prepare_for_write = sb_prepare_for_write,
+ .check = sb_check
+};
+
+/*----------------------------------------------------------------
+ * Low level metadata handling
+ *--------------------------------------------------------------*/
+#define DM_ERA_METADATA_BLOCK_SIZE 4096
+#define DM_ERA_METADATA_CACHE_SIZE 64
+#define ERA_MAX_CONCURRENT_LOCKS 5
+
+struct era_metadata {
+ struct block_device *bdev;
+ struct dm_block_manager *bm;
+ struct dm_space_map *sm;
+ struct dm_transaction_manager *tm;
+
+ dm_block_t block_size;
+ uint32_t nr_blocks;
+
+ uint32_t current_era;
+
+ /*
+ * We preallocate 2 writesets. When an era rolls over we
+ * switch between them. This means the allocation is done at
+ * preresume time, rather than on the io path.
+ */
+ struct writeset writesets[2];
+ struct writeset *current_writeset;
+
+ dm_block_t writeset_tree_root;
+ dm_block_t era_array_root;
+
+ struct dm_disk_bitset bitset_info;
+ struct dm_btree_info writeset_tree_info;
+ struct dm_array_info era_array_info;
+
+ dm_block_t metadata_snap;
+
+ /*
+ * A flag that is set whenever a writeset has been archived.
+ */
+ bool archived_writesets;
+
+ /*
+ * Reading the space map root can fail, so we read it into this
+ * buffer before the superblock is locked and updated.
+ */
+ __u8 metadata_space_map_root[SPACE_MAP_ROOT_SIZE];
+};
+
+static int superblock_read_lock(struct era_metadata *md,
+ struct dm_block **sblock)
+{
+ return dm_bm_read_lock(md->bm, SUPERBLOCK_LOCATION,
+ &sb_validator, sblock);
+}
+
+static int superblock_lock_zero(struct era_metadata *md,
+ struct dm_block **sblock)
+{
+ return dm_bm_write_lock_zero(md->bm, SUPERBLOCK_LOCATION,
+ &sb_validator, sblock);
+}
+
+static int superblock_lock(struct era_metadata *md,
+ struct dm_block **sblock)
+{
+ return dm_bm_write_lock(md->bm, SUPERBLOCK_LOCATION,
+ &sb_validator, sblock);
+}
+
+/* FIXME: duplication with cache and thin */
+static int superblock_all_zeroes(struct dm_block_manager *bm, bool *result)
+{
+ int r;
+ unsigned i;
+ struct dm_block *b;
+ __le64 *data_le, zero = cpu_to_le64(0);
+ unsigned sb_block_size = dm_bm_block_size(bm) / sizeof(__le64);
+
+ /*
+ * We can't use a validator here - it may be all zeroes.
+ */
+ r = dm_bm_read_lock(bm, SUPERBLOCK_LOCATION, NULL, &b);
+ if (r)
+ return r;
+
+ data_le = dm_block_data(b);
+ *result = true;
+ for (i = 0; i < sb_block_size; i++) {
+ if (data_le[i] != zero) {
+ *result = false;
+ break;
+ }
+ }
+
+ return dm_bm_unlock(b);
+}
+
+/*----------------------------------------------------------------*/
+
+static void ws_pack(const struct writeset_metadata *core, struct writeset_disk *disk)
+{
+ disk->nr_bits = cpu_to_le32(core->nr_bits);
+ disk->root = cpu_to_le64(core->root);
+}
+
+static void ws_unpack(const struct writeset_disk *disk, struct writeset_metadata *core)
+{
+ core->nr_bits = le32_to_cpu(disk->nr_bits);
+ core->root = le64_to_cpu(disk->root);
+}
+
+static void ws_inc(void *context, const void *value)
+{
+ struct era_metadata *md = context;
+ struct writeset_disk ws_d;
+ dm_block_t b;
+
+ memcpy(&ws_d, value, sizeof(ws_d));
+ b = le64_to_cpu(ws_d.root);
+
+ dm_tm_inc(md->tm, b);
+}
+
+static void ws_dec(void *context, const void *value)
+{
+ struct era_metadata *md = context;
+ struct writeset_disk ws_d;
+ dm_block_t b;
+
+ memcpy(&ws_d, value, sizeof(ws_d));
+ b = le64_to_cpu(ws_d.root);
+
+ dm_bitset_del(&md->bitset_info, b);
+}
+
+static int ws_eq(void *context, const void *value1, const void *value2)
+{
+ return !memcmp(value1, value2, sizeof(struct writeset_metadata));
+}
+
+/*----------------------------------------------------------------*/
+
+static void setup_writeset_tree_info(struct era_metadata *md)
+{
+ struct dm_btree_value_type *vt = &md->writeset_tree_info.value_type;
+ md->writeset_tree_info.tm = md->tm;
+ md->writeset_tree_info.levels = 1;
+ vt->context = md;
+ vt->size = sizeof(struct writeset_disk);
+ vt->inc = ws_inc;
+ vt->dec = ws_dec;
+ vt->equal = ws_eq;
+}
+
+static void setup_era_array_info(struct era_metadata *md)
+
+{
+ struct dm_btree_value_type vt;
+ vt.context = NULL;
+ vt.size = sizeof(__le32);
+ vt.inc = NULL;
+ vt.dec = NULL;
+ vt.equal = NULL;
+
+ dm_array_info_init(&md->era_array_info, md->tm, &vt);
+}
+
+static void setup_infos(struct era_metadata *md)
+{
+ dm_disk_bitset_init(md->tm, &md->bitset_info);
+ setup_writeset_tree_info(md);
+ setup_era_array_info(md);
+}
+
+/*----------------------------------------------------------------*/
+
+static int create_fresh_metadata(struct era_metadata *md)
+{
+ int r;
+
+ r = dm_tm_create_with_sm(md->bm, SUPERBLOCK_LOCATION,
+ &md->tm, &md->sm);
+ if (r < 0) {
+ DMERR("dm_tm_create_with_sm failed");
+ return r;
+ }
+
+ setup_infos(md);
+
+ r = dm_btree_empty(&md->writeset_tree_info, &md->writeset_tree_root);
+ if (r) {
+ DMERR("couldn't create new writeset tree");
+ goto bad;
+ }
+
+ r = dm_array_empty(&md->era_array_info, &md->era_array_root);
+ if (r) {
+ DMERR("couldn't create era array");
+ goto bad;
+ }
+
+ return 0;
+
+bad:
+ dm_sm_destroy(md->sm);
+ dm_tm_destroy(md->tm);
+
+ return r;
+}
+
+static int save_sm_root(struct era_metadata *md)
+{
+ int r;
+ size_t metadata_len;
+
+ r = dm_sm_root_size(md->sm, &metadata_len);
+ if (r < 0)
+ return r;
+
+ return dm_sm_copy_root(md->sm, &md->metadata_space_map_root,
+ metadata_len);
+}
+
+static void copy_sm_root(struct era_metadata *md, struct superblock_disk *disk)
+{
+ memcpy(&disk->metadata_space_map_root,
+ &md->metadata_space_map_root,
+ sizeof(md->metadata_space_map_root));
+}
+
+/*
+ * Writes a superblock, including the static fields that don't get updated
+ * with every commit (possible optimisation here). 'md' should be fully
+ * constructed when this is called.
+ */
+static void prepare_superblock(struct era_metadata *md, struct superblock_disk *disk)
+{
+ disk->magic = cpu_to_le64(SUPERBLOCK_MAGIC);
+ disk->flags = cpu_to_le32(0ul);
+
+ /* FIXME: can't keep blanking the uuid (uuid is currently unused though) */
+ memset(disk->uuid, 0, sizeof(disk->uuid));
+ disk->version = cpu_to_le32(MAX_ERA_VERSION);
+
+ copy_sm_root(md, disk);
+
+ disk->data_block_size = cpu_to_le32(md->block_size);
+ disk->metadata_block_size = cpu_to_le32(DM_ERA_METADATA_BLOCK_SIZE >> SECTOR_SHIFT);
+ disk->nr_blocks = cpu_to_le32(md->nr_blocks);
+ disk->current_era = cpu_to_le32(md->current_era);
+
+ ws_pack(&md->current_writeset->md, &disk->current_writeset);
+ disk->writeset_tree_root = cpu_to_le64(md->writeset_tree_root);
+ disk->era_array_root = cpu_to_le64(md->era_array_root);
+ disk->metadata_snap = cpu_to_le64(md->metadata_snap);
+}
+
+static int write_superblock(struct era_metadata *md)
+{
+ int r;
+ struct dm_block *sblock;
+ struct superblock_disk *disk;
+
+ r = save_sm_root(md);
+ if (r) {
+ DMERR("%s: save_sm_root failed", __func__);
+ return r;
+ }
+
+ r = superblock_lock_zero(md, &sblock);
+ if (r)
+ return r;
+
+ disk = dm_block_data(sblock);
+ prepare_superblock(md, disk);
+
+ return dm_tm_commit(md->tm, sblock);
+}
+
+/*
+ * Assumes block_size and the infos are set.
+ */
+static int format_metadata(struct era_metadata *md)
+{
+ int r;
+
+ r = create_fresh_metadata(md);
+ if (r)
+ return r;
+
+ r = write_superblock(md);
+ if (r) {
+ dm_sm_destroy(md->sm);
+ dm_tm_destroy(md->tm);
+ return r;
+ }
+
+ return 0;
+}
+
+static int open_metadata(struct era_metadata *md)
+{
+ int r;
+ struct dm_block *sblock;
+ struct superblock_disk *disk;
+
+ r = superblock_read_lock(md, &sblock);
+ if (r) {
+ DMERR("couldn't read_lock superblock");
+ return r;
+ }
+
+ disk = dm_block_data(sblock);
+ r = dm_tm_open_with_sm(md->bm, SUPERBLOCK_LOCATION,
+ disk->metadata_space_map_root,
+ sizeof(disk->metadata_space_map_root),
+ &md->tm, &md->sm);
+ if (r) {
+ DMERR("dm_tm_open_with_sm failed");
+ goto bad;
+ }
+
+ setup_infos(md);
+
+ md->block_size = le32_to_cpu(disk->data_block_size);
+ md->nr_blocks = le32_to_cpu(disk->nr_blocks);
+ md->current_era = le32_to_cpu(disk->current_era);
+
+ md->writeset_tree_root = le64_to_cpu(disk->writeset_tree_root);
+ md->era_array_root = le64_to_cpu(disk->era_array_root);
+ md->metadata_snap = le64_to_cpu(disk->metadata_snap);
+ md->archived_writesets = true;
+
+ return dm_bm_unlock(sblock);
+
+bad:
+ dm_bm_unlock(sblock);
+ return r;
+}
+
+static int open_or_format_metadata(struct era_metadata *md,
+ bool may_format)
+{
+ int r;
+ bool unformatted = false;
+
+ r = superblock_all_zeroes(md->bm, &unformatted);
+ if (r)
+ return r;
+
+ if (unformatted)
+ return may_format ? format_metadata(md) : -EPERM;
+
+ return open_metadata(md);
+}
+
+static int create_persistent_data_objects(struct era_metadata *md,
+ bool may_format)
+{
+ int r;
+
+ md->bm = dm_block_manager_create(md->bdev, DM_ERA_METADATA_BLOCK_SIZE,
+ DM_ERA_METADATA_CACHE_SIZE,
+ ERA_MAX_CONCURRENT_LOCKS);
+ if (IS_ERR(md->bm)) {
+ DMERR("could not create block manager");
+ return PTR_ERR(md->bm);
+ }
+
+ r = open_or_format_metadata(md, may_format);
+ if (r)
+ dm_block_manager_destroy(md->bm);
+
+ return r;
+}
+
+static void destroy_persistent_data_objects(struct era_metadata *md)
+{
+ dm_sm_destroy(md->sm);
+ dm_tm_destroy(md->tm);
+ dm_block_manager_destroy(md->bm);
+}
+
+/*
+ * This waits until all era_map threads have picked up the new filter.
+ */
+static void swap_writeset(struct era_metadata *md, struct writeset *new_writeset)
+{
+ rcu_assign_pointer(md->current_writeset, new_writeset);
+ synchronize_rcu();
+}
+
+/*----------------------------------------------------------------
+ * Writesets get 'digested' into the main era array.
+ *
+ * We're using a coroutine here so the worker thread can do the digestion,
+ * thus avoiding synchronisation of the metadata. Digesting a whole
+ * writeset in one go would cause too much latency.
+ *--------------------------------------------------------------*/
+struct digest {
+ uint32_t era;
+ unsigned nr_bits, current_bit;
+ struct writeset_metadata writeset;
+ __le32 value;
+ struct dm_disk_bitset info;
+
+ int (*step)(struct era_metadata *, struct digest *);
+};
+
+static int metadata_digest_lookup_writeset(struct era_metadata *md,
+ struct digest *d);
+
+static int metadata_digest_remove_writeset(struct era_metadata *md,
+ struct digest *d)
+{
+ int r;
+ uint64_t key = d->era;
+
+ r = dm_btree_remove(&md->writeset_tree_info, md->writeset_tree_root,
+ &key, &md->writeset_tree_root);
+ if (r) {
+ DMERR("%s: dm_btree_remove failed", __func__);
+ return r;
+ }
+
+ d->step = metadata_digest_lookup_writeset;
+ return 0;
+}
+
+#define INSERTS_PER_STEP 100
+
+static int metadata_digest_transcribe_writeset(struct era_metadata *md,
+ struct digest *d)
+{
+ int r;
+ bool marked;
+ unsigned b, e = min(d->current_bit + INSERTS_PER_STEP, d->nr_bits);
+
+ for (b = d->current_bit; b < e; b++) {
+ r = writeset_marked_on_disk(&d->info, &d->writeset, b, &marked);
+ if (r) {
+ DMERR("%s: writeset_marked_on_disk failed", __func__);
+ return r;
+ }
+
+ if (!marked)
+ continue;
+
+ __dm_bless_for_disk(&d->value);
+ r = dm_array_set_value(&md->era_array_info, md->era_array_root,
+ b, &d->value, &md->era_array_root);
+ if (r) {
+ DMERR("%s: dm_array_set_value failed", __func__);
+ return r;
+ }
+ }
+
+ if (b == d->nr_bits)
+ d->step = metadata_digest_remove_writeset;
+ else
+ d->current_bit = b;
+
+ return 0;
+}
+
+static int metadata_digest_lookup_writeset(struct era_metadata *md,
+ struct digest *d)
+{
+ int r;
+ uint64_t key;
+ struct writeset_disk disk;
+
+ r = dm_btree_find_lowest_key(&md->writeset_tree_info,
+ md->writeset_tree_root, &key);
+ if (r < 0)
+ return r;
+
+ d->era = key;
+
+ r = dm_btree_lookup(&md->writeset_tree_info,
+ md->writeset_tree_root, &key, &disk);
+ if (r) {
+ if (r == -ENODATA) {
+ d->step = NULL;
+ return 0;
+ }
+
+ DMERR("%s: dm_btree_lookup failed", __func__);
+ return r;
+ }
+
+ ws_unpack(&disk, &d->writeset);
+ d->value = cpu_to_le32(key);
+
+ d->nr_bits = min(d->writeset.nr_bits, md->nr_blocks);
+ d->current_bit = 0;
+ d->step = metadata_digest_transcribe_writeset;
+
+ return 0;
+}
+
+static int metadata_digest_start(struct era_metadata *md, struct digest *d)
+{
+ if (d->step)
+ return 0;
+
+ memset(d, 0, sizeof(*d));
+
+ /*
+ * We initialise another bitset info to avoid any caching side
+ * effects with the previous one.
+ */
+ dm_disk_bitset_init(md->tm, &d->info);
+ d->step = metadata_digest_lookup_writeset;
+
+ return 0;
+}
+
+/*----------------------------------------------------------------
+ * High level metadata interface. Target methods should use these, and not
+ * the lower level ones.
+ *--------------------------------------------------------------*/
+static struct era_metadata *metadata_open(struct block_device *bdev,
+ sector_t block_size,
+ bool may_format)
+{
+ int r;
+ struct era_metadata *md = kzalloc(sizeof(*md), GFP_KERNEL);
+
+ if (!md)
+ return NULL;
+
+ md->bdev = bdev;
+ md->block_size = block_size;
+
+ md->writesets[0].md.root = INVALID_WRITESET_ROOT;
+ md->writesets[1].md.root = INVALID_WRITESET_ROOT;
+ md->current_writeset = &md->writesets[0];
+
+ r = create_persistent_data_objects(md, may_format);
+ if (r) {
+ kfree(md);
+ return ERR_PTR(r);
+ }
+
+ return md;
+}
+
+static void metadata_close(struct era_metadata *md)
+{
+ destroy_persistent_data_objects(md);
+ kfree(md);
+}
+
+static bool valid_nr_blocks(dm_block_t n)
+{
+ /*
+ * dm_bitset restricts us to 2^32. test_bit & co. restrict us
+ * further to 2^31 - 1
+ */
+ return n < (1ull << 31);
+}
+
+static int metadata_resize(struct era_metadata *md, void *arg)
+{
+ int r;
+ dm_block_t *new_size = arg;
+ __le32 value;
+
+ if (!valid_nr_blocks(*new_size)) {
+ DMERR("Invalid number of origin blocks %llu",
+ (unsigned long long) *new_size);
+ return -EINVAL;
+ }
+
+ writeset_free(&md->writesets[0]);
+ writeset_free(&md->writesets[1]);
+
+ r = writeset_alloc(&md->writesets[0], *new_size);
+ if (r) {
+ DMERR("%s: writeset_alloc failed for writeset 0", __func__);
+ return r;
+ }
+
+ r = writeset_alloc(&md->writesets[1], *new_size);
+ if (r) {
+ DMERR("%s: writeset_alloc failed for writeset 1", __func__);
+ return r;
+ }
+
+ value = cpu_to_le32(0u);
+ __dm_bless_for_disk(&value);
+ r = dm_array_resize(&md->era_array_info, md->era_array_root,
+ md->nr_blocks, *new_size,
+ &value, &md->era_array_root);
+ if (r) {
+ DMERR("%s: dm_array_resize failed", __func__);
+ return r;
+ }
+
+ md->nr_blocks = *new_size;
+ return 0;
+}
+
+static int metadata_era_archive(struct era_metadata *md)
+{
+ int r;
+ uint64_t keys[1];
+ struct writeset_disk value;
+
+ r = dm_bitset_flush(&md->bitset_info, md->current_writeset->md.root,
+ &md->current_writeset->md.root);
+ if (r) {
+ DMERR("%s: dm_bitset_flush failed", __func__);
+ return r;
+ }
+
+ ws_pack(&md->current_writeset->md, &value);
+ md->current_writeset->md.root = INVALID_WRITESET_ROOT;
+
+ keys[0] = md->current_era;
+ __dm_bless_for_disk(&value);
+ r = dm_btree_insert(&md->writeset_tree_info, md->writeset_tree_root,
+ keys, &value, &md->writeset_tree_root);
+ if (r) {
+ DMERR("%s: couldn't insert writeset into btree", __func__);
+ /* FIXME: fail mode */
+ return r;
+ }
+
+ md->archived_writesets = true;
+
+ return 0;
+}
+
+static struct writeset *next_writeset(struct era_metadata *md)
+{
+ return (md->current_writeset == &md->writesets[0]) ?
+ &md->writesets[1] : &md->writesets[0];
+}
+
+static int metadata_new_era(struct era_metadata *md)
+{
+ int r;
+ struct writeset *new_writeset = next_writeset(md);
+
+ r = writeset_init(&md->bitset_info, new_writeset);
+ if (r) {
+ DMERR("%s: writeset_init failed", __func__);
+ return r;
+ }
+
+ swap_writeset(md, new_writeset);
+ md->current_era++;
+
+ return 0;
+}
+
+static int metadata_era_rollover(struct era_metadata *md)
+{
+ int r;
+
+ if (md->current_writeset->md.root != INVALID_WRITESET_ROOT) {
+ r = metadata_era_archive(md);
+ if (r) {
+ DMERR("%s: metadata_archive_era failed", __func__);
+ /* FIXME: fail mode? */
+ return r;
+ }
+ }
+
+ r = metadata_new_era(md);
+ if (r) {
+ DMERR("%s: new era failed", __func__);
+ /* FIXME: fail mode */
+ return r;
+ }
+
+ return 0;
+}
+
+static bool metadata_current_marked(struct era_metadata *md, dm_block_t block)
+{
+ bool r;
+ struct writeset *ws;
+
+ rcu_read_lock();
+ ws = rcu_dereference(md->current_writeset);
+ r = writeset_marked(ws, block);
+ rcu_read_unlock();
+
+ return r;
+}
+
+static int metadata_commit(struct era_metadata *md)
+{
+ int r;
+ struct dm_block *sblock;
+
+ if (md->current_writeset->md.root != SUPERBLOCK_LOCATION) {
+ r = dm_bitset_flush(&md->bitset_info, md->current_writeset->md.root,
+ &md->current_writeset->md.root);
+ if (r) {
+ DMERR("%s: bitset flush failed", __func__);
+ return r;
+ }
+ }
+
+ r = save_sm_root(md);
+ if (r) {
+ DMERR("%s: save_sm_root failed", __func__);
+ return r;
+ }
+
+ r = dm_tm_pre_commit(md->tm);
+ if (r) {
+ DMERR("%s: pre commit failed", __func__);
+ return r;
+ }
+
+ r = superblock_lock(md, &sblock);
+ if (r) {
+ DMERR("%s: superblock lock failed", __func__);
+ return r;
+ }
+
+ prepare_superblock(md, dm_block_data(sblock));
+
+ return dm_tm_commit(md->tm, sblock);
+}
+
+static int metadata_checkpoint(struct era_metadata *md)
+{
+ /*
+ * For now we just rollover, but later I want to put a check in to
+ * avoid this if the filter is still pretty fresh.
+ */
+ return metadata_era_rollover(md);
+}
+
+/*
+ * Metadata snapshots allow userland to access era data.
+ */
+static int metadata_take_snap(struct era_metadata *md)
+{
+ int r, inc;
+ struct dm_block *clone;
+
+ if (md->metadata_snap != SUPERBLOCK_LOCATION) {
+ DMERR("%s: metadata snapshot already exists", __func__);
+ return -EINVAL;
+ }
+
+ r = metadata_era_rollover(md);
+ if (r) {
+ DMERR("%s: era rollover failed", __func__);
+ return r;
+ }
+
+ r = metadata_commit(md);
+ if (r) {
+ DMERR("%s: pre commit failed", __func__);
+ return r;
+ }
+
+ r = dm_sm_inc_block(md->sm, SUPERBLOCK_LOCATION);
+ if (r) {
+ DMERR("%s: couldn't increment superblock", __func__);
+ return r;
+ }
+
+ r = dm_tm_shadow_block(md->tm, SUPERBLOCK_LOCATION,
+ &sb_validator, &clone, &inc);
+ if (r) {
+ DMERR("%s: couldn't shadow superblock", __func__);
+ dm_sm_dec_block(md->sm, SUPERBLOCK_LOCATION);
+ return r;
+ }
+ BUG_ON(!inc);
+
+ r = dm_sm_inc_block(md->sm, md->writeset_tree_root);
+ if (r) {
+ DMERR("%s: couldn't inc writeset tree root", __func__);
+ dm_tm_unlock(md->tm, clone);
+ return r;
+ }
+
+ r = dm_sm_inc_block(md->sm, md->era_array_root);
+ if (r) {
+ DMERR("%s: couldn't inc era tree root", __func__);
+ dm_sm_dec_block(md->sm, md->writeset_tree_root);
+ dm_tm_unlock(md->tm, clone);
+ return r;
+ }
+
+ md->metadata_snap = dm_block_location(clone);
+
+ r = dm_tm_unlock(md->tm, clone);
+ if (r) {
+ DMERR("%s: couldn't unlock clone", __func__);
+ md->metadata_snap = SUPERBLOCK_LOCATION;
+ return r;
+ }
+
+ return 0;
+}
+
+static int metadata_drop_snap(struct era_metadata *md)
+{
+ int r;
+ dm_block_t location;
+ struct dm_block *clone;
+ struct superblock_disk *disk;
+
+ if (md->metadata_snap == SUPERBLOCK_LOCATION) {
+ DMERR("%s: no snap to drop", __func__);
+ return -EINVAL;
+ }
+
+ r = dm_tm_read_lock(md->tm, md->metadata_snap, &sb_validator, &clone);
+ if (r) {
+ DMERR("%s: couldn't read lock superblock clone", __func__);
+ return r;
+ }
+
+ /*
+ * Whatever happens now we'll commit with no record of the metadata
+ * snap.
+ */
+ md->metadata_snap = SUPERBLOCK_LOCATION;
+
+ disk = dm_block_data(clone);
+ r = dm_btree_del(&md->writeset_tree_info,
+ le64_to_cpu(disk->writeset_tree_root));
+ if (r) {
+ DMERR("%s: error deleting writeset tree clone", __func__);
+ dm_tm_unlock(md->tm, clone);
+ return r;
+ }
+
+ r = dm_array_del(&md->era_array_info, le64_to_cpu(disk->era_array_root));
+ if (r) {
+ DMERR("%s: error deleting era array clone", __func__);
+ dm_tm_unlock(md->tm, clone);
+ return r;
+ }
+
+ location = dm_block_location(clone);
+ dm_tm_unlock(md->tm, clone);
+
+ return dm_sm_dec_block(md->sm, location);
+}
+
+struct metadata_stats {
+ dm_block_t used;
+ dm_block_t total;
+ dm_block_t snap;
+ uint32_t era;
+};
+
+static int metadata_get_stats(struct era_metadata *md, void *ptr)
+{
+ int r;
+ struct metadata_stats *s = ptr;
+ dm_block_t nr_free, nr_total;
+
+ r = dm_sm_get_nr_free(md->sm, &nr_free);
+ if (r) {
+ DMERR("dm_sm_get_nr_free returned %d", r);
+ return r;
+ }
+
+ r = dm_sm_get_nr_blocks(md->sm, &nr_total);
+ if (r) {
+ DMERR("dm_pool_get_metadata_dev_size returned %d", r);
+ return r;
+ }
+
+ s->used = nr_total - nr_free;
+ s->total = nr_total;
+ s->snap = md->metadata_snap;
+ s->era = md->current_era;
+
+ return 0;
+}
+
+/*----------------------------------------------------------------*/
+
+struct era {
+ struct dm_target *ti;
+ struct dm_target_callbacks callbacks;
+
+ struct dm_dev *metadata_dev;
+ struct dm_dev *origin_dev;
+
+ dm_block_t nr_blocks;
+ uint32_t sectors_per_block;
+ int sectors_per_block_shift;
+ struct era_metadata *md;
+
+ struct workqueue_struct *wq;
+ struct work_struct worker;
+
+ spinlock_t deferred_lock;
+ struct bio_list deferred_bios;
+
+ spinlock_t rpc_lock;
+ struct list_head rpc_calls;
+
+ struct digest digest;
+ atomic_t suspended;
+};
+
+struct rpc {
+ struct list_head list;
+
+ int (*fn0)(struct era_metadata *);
+ int (*fn1)(struct era_metadata *, void *);
+ void *arg;
+ int result;
+
+ struct completion complete;
+};
+
+/*----------------------------------------------------------------
+ * Remapping.
+ *---------------------------------------------------------------*/
+static bool block_size_is_power_of_two(struct era *era)
+{
+ return era->sectors_per_block_shift >= 0;
+}
+
+static dm_block_t get_block(struct era *era, struct bio *bio)
+{
+ sector_t block_nr = bio->bi_iter.bi_sector;
+
+ if (!block_size_is_power_of_two(era))
+ (void) sector_div(block_nr, era->sectors_per_block);
+ else
+ block_nr >>= era->sectors_per_block_shift;
+
+ return block_nr;
+}
+
+static void remap_to_origin(struct era *era, struct bio *bio)
+{
+ bio->bi_bdev = era->origin_dev->bdev;
+}
+
+/*----------------------------------------------------------------
+ * Worker thread
+ *--------------------------------------------------------------*/
+static void wake_worker(struct era *era)
+{
+ if (!atomic_read(&era->suspended))
+ queue_work(era->wq, &era->worker);
+}
+
+static void process_old_eras(struct era *era)
+{
+ int r;
+
+ if (!era->digest.step)
+ return;
+
+ r = era->digest.step(era->md, &era->digest);
+ if (r < 0) {
+ DMERR("%s: digest step failed, stopping digestion", __func__);
+ era->digest.step = NULL;
+
+ } else if (era->digest.step)
+ wake_worker(era);
+}
+
+static void process_deferred_bios(struct era *era)
+{
+ int r;
+ struct bio_list deferred_bios, marked_bios;
+ struct bio *bio;
+ bool commit_needed = false;
+ bool failed = false;
+
+ bio_list_init(&deferred_bios);
+ bio_list_init(&marked_bios);
+
+ spin_lock(&era->deferred_lock);
+ bio_list_merge(&deferred_bios, &era->deferred_bios);
+ bio_list_init(&era->deferred_bios);
+ spin_unlock(&era->deferred_lock);
+
+ while ((bio = bio_list_pop(&deferred_bios))) {
+ r = writeset_test_and_set(&era->md->bitset_info,
+ era->md->current_writeset,
+ get_block(era, bio));
+ if (r < 0) {
+ /*
+ * This is bad news, we need to rollback.
+ * FIXME: finish.
+ */
+ failed = true;
+
+ } else if (r == 0)
+ commit_needed = true;
+
+ bio_list_add(&marked_bios, bio);
+ }
+
+ if (commit_needed) {
+ r = metadata_commit(era->md);
+ if (r)
+ failed = true;
+ }
+
+ if (failed)
+ while ((bio = bio_list_pop(&marked_bios)))
+ bio_io_error(bio);
+ else
+ while ((bio = bio_list_pop(&marked_bios)))
+ generic_make_request(bio);
+}
+
+static void process_rpc_calls(struct era *era)
+{
+ int r;
+ bool need_commit = false;
+ struct list_head calls;
+ struct rpc *rpc, *tmp;
+
+ INIT_LIST_HEAD(&calls);
+ spin_lock(&era->rpc_lock);
+ list_splice_init(&era->rpc_calls, &calls);
+ spin_unlock(&era->rpc_lock);
+
+ list_for_each_entry_safe(rpc, tmp, &calls, list) {
+ rpc->result = rpc->fn0 ? rpc->fn0(era->md) : rpc->fn1(era->md, rpc->arg);
+ need_commit = true;
+ }
+
+ if (need_commit) {
+ r = metadata_commit(era->md);
+ if (r)
+ list_for_each_entry_safe(rpc, tmp, &calls, list)
+ rpc->result = r;
+ }
+
+ list_for_each_entry_safe(rpc, tmp, &calls, list)
+ complete(&rpc->complete);
+}
+
+static void kick_off_digest(struct era *era)
+{
+ if (era->md->archived_writesets) {
+ era->md->archived_writesets = false;
+ metadata_digest_start(era->md, &era->digest);
+ }
+}
+
+static void do_work(struct work_struct *ws)
+{
+ struct era *era = container_of(ws, struct era, worker);
+
+ kick_off_digest(era);
+ process_old_eras(era);
+ process_deferred_bios(era);
+ process_rpc_calls(era);
+}
+
+static void defer_bio(struct era *era, struct bio *bio)
+{
+ spin_lock(&era->deferred_lock);
+ bio_list_add(&era->deferred_bios, bio);
+ spin_unlock(&era->deferred_lock);
+
+ wake_worker(era);
+}
+
+/*
+ * Make an rpc call to the worker to change the metadata.
+ */
+static int perform_rpc(struct era *era, struct rpc *rpc)
+{
+ rpc->result = 0;
+ init_completion(&rpc->complete);
+
+ spin_lock(&era->rpc_lock);
+ list_add(&rpc->list, &era->rpc_calls);
+ spin_unlock(&era->rpc_lock);
+
+ wake_worker(era);
+ wait_for_completion(&rpc->complete);
+
+ return rpc->result;
+}
+
+static int in_worker0(struct era *era, int (*fn)(struct era_metadata *))
+{
+ struct rpc rpc;
+ rpc.fn0 = fn;
+ rpc.fn1 = NULL;
+
+ return perform_rpc(era, &rpc);
+}
+
+static int in_worker1(struct era *era,
+ int (*fn)(struct era_metadata *, void *), void *arg)
+{
+ struct rpc rpc;
+ rpc.fn0 = NULL;
+ rpc.fn1 = fn;
+ rpc.arg = arg;
+
+ return perform_rpc(era, &rpc);
+}
+
+static void start_worker(struct era *era)
+{
+ atomic_set(&era->suspended, 0);
+}
+
+static void stop_worker(struct era *era)
+{
+ atomic_set(&era->suspended, 1);
+ flush_workqueue(era->wq);
+}
+
+/*----------------------------------------------------------------
+ * Target methods
+ *--------------------------------------------------------------*/
+static int dev_is_congested(struct dm_dev *dev, int bdi_bits)
+{
+ struct request_queue *q = bdev_get_queue(dev->bdev);
+ return bdi_congested(&q->backing_dev_info, bdi_bits);
+}
+
+static int era_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
+{
+ struct era *era = container_of(cb, struct era, callbacks);
+ return dev_is_congested(era->origin_dev, bdi_bits);
+}
+
+static void era_destroy(struct era *era)
+{
+ if (era->md)
+ metadata_close(era->md);
+
+ if (era->wq)
+ destroy_workqueue(era->wq);
+
+ if (era->origin_dev)
+ dm_put_device(era->ti, era->origin_dev);
+
+ if (era->metadata_dev)
+ dm_put_device(era->ti, era->metadata_dev);
+
+ kfree(era);
+}
+
+static dm_block_t calc_nr_blocks(struct era *era)
+{
+ return dm_sector_div_up(era->ti->len, era->sectors_per_block);
+}
+
+static bool valid_block_size(dm_block_t block_size)
+{
+ bool greater_than_zero = block_size > 0;
+ bool multiple_of_min_block_size = (block_size & (MIN_BLOCK_SIZE - 1)) == 0;
+
+ return greater_than_zero && multiple_of_min_block_size;
+}
+
+/*
+ * <metadata dev> <data dev> <data block size (sectors)>
+ */
+static int era_ctr(struct dm_target *ti, unsigned argc, char **argv)
+{
+ int r;
+ char dummy;
+ struct era *era;
+ struct era_metadata *md;
+
+ if (argc != 3) {
+ ti->error = "Invalid argument count";
+ return -EINVAL;
+ }
+
+ era = kzalloc(sizeof(*era), GFP_KERNEL);
+ if (!era) {
+ ti->error = "Error allocating era structure";
+ return -ENOMEM;
+ }
+
+ era->ti = ti;
+
+ r = dm_get_device(ti, argv[0], FMODE_READ | FMODE_WRITE, &era->metadata_dev);
+ if (r) {
+ ti->error = "Error opening metadata device";
+ era_destroy(era);
+ return -EINVAL;
+ }
+
+ r = dm_get_device(ti, argv[1], FMODE_READ | FMODE_WRITE, &era->origin_dev);
+ if (r) {
+ ti->error = "Error opening data device";
+ era_destroy(era);
+ return -EINVAL;
+ }
+
+ r = sscanf(argv[2], "%u%c", &era->sectors_per_block, &dummy);
+ if (r != 1) {
+ ti->error = "Error parsing block size";
+ era_destroy(era);
+ return -EINVAL;
+ }
+
+ r = dm_set_target_max_io_len(ti, era->sectors_per_block);
+ if (r) {
+ ti->error = "could not set max io len";
+ era_destroy(era);
+ return -EINVAL;
+ }
+
+ if (!valid_block_size(era->sectors_per_block)) {
+ ti->error = "Invalid block size";
+ era_destroy(era);
+ return -EINVAL;
+ }
+ if (era->sectors_per_block & (era->sectors_per_block - 1))
+ era->sectors_per_block_shift = -1;
+ else
+ era->sectors_per_block_shift = __ffs(era->sectors_per_block);
+
+ md = metadata_open(era->metadata_dev->bdev, era->sectors_per_block, true);
+ if (IS_ERR(md)) {
+ ti->error = "Error reading metadata";
+ era_destroy(era);
+ return PTR_ERR(md);
+ }
+ era->md = md;
+
+ era->nr_blocks = calc_nr_blocks(era);
+
+ r = metadata_resize(era->md, &era->nr_blocks);
+ if (r) {
+ ti->error = "couldn't resize metadata";
+ era_destroy(era);
+ return -ENOMEM;
+ }
+
+ era->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM);
+ if (!era->wq) {
+ ti->error = "could not create workqueue for metadata object";
+ era_destroy(era);
+ return -ENOMEM;
+ }
+ INIT_WORK(&era->worker, do_work);
+
+ spin_lock_init(&era->deferred_lock);
+ bio_list_init(&era->deferred_bios);
+
+ spin_lock_init(&era->rpc_lock);
+ INIT_LIST_HEAD(&era->rpc_calls);
+
+ ti->private = era;
+ ti->num_flush_bios = 1;
+ ti->flush_supported = true;
+
+ ti->num_discard_bios = 1;
+ ti->discards_supported = true;
+ era->callbacks.congested_fn = era_is_congested;
+ dm_table_add_target_callbacks(ti->table, &era->callbacks);
+
+ return 0;
+}
+
+static void era_dtr(struct dm_target *ti)
+{
+ era_destroy(ti->private);
+}
+
+static int era_map(struct dm_target *ti, struct bio *bio)
+{
+ struct era *era = ti->private;
+ dm_block_t block = get_block(era, bio);
+
+ /*
+ * All bios get remapped to the origin device. We do this now, but
+ * it may not get issued until later. Depending on whether the
+ * block is marked in this era.
+ */
+ remap_to_origin(era, bio);
+
+ /*
+ * REQ_FLUSH bios carry no data, so we're not interested in them.
+ */
+ if (!(bio->bi_rw & REQ_FLUSH) &&
+ (bio_data_dir(bio) == WRITE) &&
+ !metadata_current_marked(era->md, block)) {
+ defer_bio(era, bio);
+ return DM_MAPIO_SUBMITTED;
+ }
+
+ return DM_MAPIO_REMAPPED;
+}
+
+static void era_postsuspend(struct dm_target *ti)
+{
+ int r;
+ struct era *era = ti->private;
+
+ r = in_worker0(era, metadata_era_archive);
+ if (r) {
+ DMERR("%s: couldn't archive current era", __func__);
+ /* FIXME: fail mode */
+ }
+
+ stop_worker(era);
+}
+
+static int era_preresume(struct dm_target *ti)
+{
+ int r;
+ struct era *era = ti->private;
+ dm_block_t new_size = calc_nr_blocks(era);
+
+ if (era->nr_blocks != new_size) {
+ r = in_worker1(era, metadata_resize, &new_size);
+ if (r)
+ return r;
+
+ era->nr_blocks = new_size;
+ }
+
+ start_worker(era);
+
+ r = in_worker0(era, metadata_new_era);
+ if (r) {
+ DMERR("%s: metadata_era_rollover failed", __func__);
+ return r;
+ }
+
+ return 0;
+}
+
+/*
+ * Status format:
+ *
+ * <metadata block size> <#used metadata blocks>/<#total metadata blocks>
+ * <current era> <held metadata root | '-'>
+ */
+static void era_status(struct dm_target *ti, status_type_t type,
+ unsigned status_flags, char *result, unsigned maxlen)
+{
+ int r;
+ struct era *era = ti->private;
+ ssize_t sz = 0;
+ struct metadata_stats stats;
+ char buf[BDEVNAME_SIZE];
+
+ switch (type) {
+ case STATUSTYPE_INFO:
+ r = in_worker1(era, metadata_get_stats, &stats);
+ if (r)
+ goto err;
+
+ DMEMIT("%u %llu/%llu %u",
+ (unsigned) (DM_ERA_METADATA_BLOCK_SIZE >> SECTOR_SHIFT),
+ (unsigned long long) stats.used,
+ (unsigned long long) stats.total,
+ (unsigned) stats.era);
+
+ if (stats.snap != SUPERBLOCK_LOCATION)
+ DMEMIT(" %llu", stats.snap);
+ else
+ DMEMIT(" -");
+ break;
+
+ case STATUSTYPE_TABLE:
+ format_dev_t(buf, era->metadata_dev->bdev->bd_dev);
+ DMEMIT("%s ", buf);
+ format_dev_t(buf, era->origin_dev->bdev->bd_dev);
+ DMEMIT("%s %u", buf, era->sectors_per_block);
+ break;
+ }
+
+ return;
+
+err:
+ DMEMIT("Error");
+}
+
+static int era_message(struct dm_target *ti, unsigned argc, char **argv)
+{
+ struct era *era = ti->private;
+
+ if (argc != 1) {
+ DMERR("incorrect number of message arguments");
+ return -EINVAL;
+ }
+
+ if (!strcasecmp(argv[0], "checkpoint"))
+ return in_worker0(era, metadata_checkpoint);
+
+ if (!strcasecmp(argv[0], "take_metadata_snap"))
+ return in_worker0(era, metadata_take_snap);
+
+ if (!strcasecmp(argv[0], "drop_metadata_snap"))
+ return in_worker0(era, metadata_drop_snap);
+
+ DMERR("unsupported message '%s'", argv[0]);
+ return -EINVAL;
+}
+
+static sector_t get_dev_size(struct dm_dev *dev)
+{
+ return i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT;
+}
+
+static int era_iterate_devices(struct dm_target *ti,
+ iterate_devices_callout_fn fn, void *data)
+{
+ struct era *era = ti->private;
+ return fn(ti, era->origin_dev, 0, get_dev_size(era->origin_dev), data);
+}
+
+static int era_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
+ struct bio_vec *biovec, int max_size)
+{
+ struct era *era = ti->private;
+ struct request_queue *q = bdev_get_queue(era->origin_dev->bdev);
+
+ if (!q->merge_bvec_fn)
+ return max_size;
+
+ bvm->bi_bdev = era->origin_dev->bdev;
+
+ return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
+}
+
+static void era_io_hints(struct dm_target *ti, struct queue_limits *limits)
+{
+ struct era *era = ti->private;
+ uint64_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT;
+
+ /*
+ * If the system-determined stacked limits are compatible with the
+ * era device's blocksize (io_opt is a factor) do not override them.
+ */
+ if (io_opt_sectors < era->sectors_per_block ||
+ do_div(io_opt_sectors, era->sectors_per_block)) {
+ blk_limits_io_min(limits, 0);
+ blk_limits_io_opt(limits, era->sectors_per_block << SECTOR_SHIFT);
+ }
+}
+
+/*----------------------------------------------------------------*/
+
+static struct target_type era_target = {
+ .name = "era",
+ .version = {1, 0, 0},
+ .module = THIS_MODULE,
+ .ctr = era_ctr,
+ .dtr = era_dtr,
+ .map = era_map,
+ .postsuspend = era_postsuspend,
+ .preresume = era_preresume,
+ .status = era_status,
+ .message = era_message,
+ .iterate_devices = era_iterate_devices,
+ .merge = era_merge,
+ .io_hints = era_io_hints
+};
+
+static int __init dm_era_init(void)
+{
+ int r;
+
+ r = dm_register_target(&era_target);
+ if (r) {
+ DMERR("era target registration failed: %d", r);
+ return r;
+ }
+
+ return 0;
+}
+
+static void __exit dm_era_exit(void)
+{
+ dm_unregister_target(&era_target);
+}
+
+module_init(dm_era_init);
+module_exit(dm_era_exit);
+
+MODULE_DESCRIPTION(DM_NAME " era target");
+MODULE_AUTHOR("Joe Thornber <ejt@redhat.com>");
+MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c
new file mode 100644
index 000000000..ebaa4f803
--- /dev/null
+++ b/drivers/md/dm-exception-store.c
@@ -0,0 +1,290 @@
+/*
+ * Copyright (C) 2001-2002 Sistina Software (UK) Limited.
+ * Copyright (C) 2006-2008 Red Hat GmbH
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm-exception-store.h"
+
+#include <linux/ctype.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/vmalloc.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+
+#define DM_MSG_PREFIX "snapshot exception stores"
+
+static LIST_HEAD(_exception_store_types);
+static DEFINE_SPINLOCK(_lock);
+
+static struct dm_exception_store_type *__find_exception_store_type(const char *name)
+{
+ struct dm_exception_store_type *type;
+
+ list_for_each_entry(type, &_exception_store_types, list)
+ if (!strcmp(name, type->name))
+ return type;
+
+ return NULL;
+}
+
+static struct dm_exception_store_type *_get_exception_store_type(const char *name)
+{
+ struct dm_exception_store_type *type;
+
+ spin_lock(&_lock);
+
+ type = __find_exception_store_type(name);
+
+ if (type && !try_module_get(type->module))
+ type = NULL;
+
+ spin_unlock(&_lock);
+
+ return type;
+}
+
+/*
+ * get_type
+ * @type_name
+ *
+ * Attempt to retrieve the dm_exception_store_type by name. If not already
+ * available, attempt to load the appropriate module.
+ *
+ * Exstore modules are named "dm-exstore-" followed by the 'type_name'.
+ * Modules may contain multiple types.
+ * This function will first try the module "dm-exstore-<type_name>",
+ * then truncate 'type_name' on the last '-' and try again.
+ *
+ * For example, if type_name was "clustered-shared", it would search
+ * 'dm-exstore-clustered-shared' then 'dm-exstore-clustered'.
+ *
+ * 'dm-exception-store-<type_name>' is too long of a name in my
+ * opinion, which is why I've chosen to have the files
+ * containing exception store implementations be 'dm-exstore-<type_name>'.
+ * If you want your module to be autoloaded, you will follow this
+ * naming convention.
+ *
+ * Returns: dm_exception_store_type* on success, NULL on failure
+ */
+static struct dm_exception_store_type *get_type(const char *type_name)
+{
+ char *p, *type_name_dup;
+ struct dm_exception_store_type *type;
+
+ type = _get_exception_store_type(type_name);
+ if (type)
+ return type;
+
+ type_name_dup = kstrdup(type_name, GFP_KERNEL);
+ if (!type_name_dup) {
+ DMERR("No memory left to attempt load for \"%s\"", type_name);
+ return NULL;
+ }
+
+ while (request_module("dm-exstore-%s", type_name_dup) ||
+ !(type = _get_exception_store_type(type_name))) {
+ p = strrchr(type_name_dup, '-');
+ if (!p)
+ break;
+ p[0] = '\0';
+ }
+
+ if (!type)
+ DMWARN("Module for exstore type \"%s\" not found.", type_name);
+
+ kfree(type_name_dup);
+
+ return type;
+}
+
+static void put_type(struct dm_exception_store_type *type)
+{
+ spin_lock(&_lock);
+ module_put(type->module);
+ spin_unlock(&_lock);
+}
+
+int dm_exception_store_type_register(struct dm_exception_store_type *type)
+{
+ int r = 0;
+
+ spin_lock(&_lock);
+ if (!__find_exception_store_type(type->name))
+ list_add(&type->list, &_exception_store_types);
+ else
+ r = -EEXIST;
+ spin_unlock(&_lock);
+
+ return r;
+}
+EXPORT_SYMBOL(dm_exception_store_type_register);
+
+int dm_exception_store_type_unregister(struct dm_exception_store_type *type)
+{
+ spin_lock(&_lock);
+
+ if (!__find_exception_store_type(type->name)) {
+ spin_unlock(&_lock);
+ return -EINVAL;
+ }
+
+ list_del(&type->list);
+
+ spin_unlock(&_lock);
+
+ return 0;
+}
+EXPORT_SYMBOL(dm_exception_store_type_unregister);
+
+static int set_chunk_size(struct dm_exception_store *store,
+ const char *chunk_size_arg, char **error)
+{
+ unsigned chunk_size;
+
+ if (kstrtouint(chunk_size_arg, 10, &chunk_size)) {
+ *error = "Invalid chunk size";
+ return -EINVAL;
+ }
+
+ if (!chunk_size) {
+ store->chunk_size = store->chunk_mask = store->chunk_shift = 0;
+ return 0;
+ }
+
+ return dm_exception_store_set_chunk_size(store, chunk_size, error);
+}
+
+int dm_exception_store_set_chunk_size(struct dm_exception_store *store,
+ unsigned chunk_size,
+ char **error)
+{
+ /* Check chunk_size is a power of 2 */
+ if (!is_power_of_2(chunk_size)) {
+ *error = "Chunk size is not a power of 2";
+ return -EINVAL;
+ }
+
+ /* Validate the chunk size against the device block size */
+ if (chunk_size %
+ (bdev_logical_block_size(dm_snap_cow(store->snap)->bdev) >> 9) ||
+ chunk_size %
+ (bdev_logical_block_size(dm_snap_origin(store->snap)->bdev) >> 9)) {
+ *error = "Chunk size is not a multiple of device blocksize";
+ return -EINVAL;
+ }
+
+ if (chunk_size > INT_MAX >> SECTOR_SHIFT) {
+ *error = "Chunk size is too high";
+ return -EINVAL;
+ }
+
+ store->chunk_size = chunk_size;
+ store->chunk_mask = chunk_size - 1;
+ store->chunk_shift = ffs(chunk_size) - 1;
+
+ return 0;
+}
+
+int dm_exception_store_create(struct dm_target *ti, int argc, char **argv,
+ struct dm_snapshot *snap,
+ unsigned *args_used,
+ struct dm_exception_store **store)
+{
+ int r = 0;
+ struct dm_exception_store_type *type = NULL;
+ struct dm_exception_store *tmp_store;
+ char persistent;
+
+ if (argc < 2) {
+ ti->error = "Insufficient exception store arguments";
+ return -EINVAL;
+ }
+
+ tmp_store = kmalloc(sizeof(*tmp_store), GFP_KERNEL);
+ if (!tmp_store) {
+ ti->error = "Exception store allocation failed";
+ return -ENOMEM;
+ }
+
+ persistent = toupper(*argv[0]);
+ if (persistent == 'P')
+ type = get_type("P");
+ else if (persistent == 'N')
+ type = get_type("N");
+ else {
+ ti->error = "Persistent flag is not P or N";
+ r = -EINVAL;
+ goto bad_type;
+ }
+
+ if (!type) {
+ ti->error = "Exception store type not recognised";
+ r = -EINVAL;
+ goto bad_type;
+ }
+
+ tmp_store->type = type;
+ tmp_store->snap = snap;
+
+ r = set_chunk_size(tmp_store, argv[1], &ti->error);
+ if (r)
+ goto bad;
+
+ r = type->ctr(tmp_store, 0, NULL);
+ if (r) {
+ ti->error = "Exception store type constructor failed";
+ goto bad;
+ }
+
+ *args_used = 2;
+ *store = tmp_store;
+ return 0;
+
+bad:
+ put_type(type);
+bad_type:
+ kfree(tmp_store);
+ return r;
+}
+EXPORT_SYMBOL(dm_exception_store_create);
+
+void dm_exception_store_destroy(struct dm_exception_store *store)
+{
+ store->type->dtr(store);
+ put_type(store->type);
+ kfree(store);
+}
+EXPORT_SYMBOL(dm_exception_store_destroy);
+
+int dm_exception_store_init(void)
+{
+ int r;
+
+ r = dm_transient_snapshot_init();
+ if (r) {
+ DMERR("Unable to register transient exception store type.");
+ goto transient_fail;
+ }
+
+ r = dm_persistent_snapshot_init();
+ if (r) {
+ DMERR("Unable to register persistent exception store type");
+ goto persistent_fail;
+ }
+
+ return 0;
+
+persistent_fail:
+ dm_transient_snapshot_exit();
+transient_fail:
+ return r;
+}
+
+void dm_exception_store_exit(void)
+{
+ dm_persistent_snapshot_exit();
+ dm_transient_snapshot_exit();
+}
diff --git a/drivers/md/dm-exception-store.h b/drivers/md/dm-exception-store.h
new file mode 100644
index 000000000..0b2536247
--- /dev/null
+++ b/drivers/md/dm-exception-store.h
@@ -0,0 +1,227 @@
+/*
+ * Copyright (C) 2001-2002 Sistina Software (UK) Limited.
+ * Copyright (C) 2008 Red Hat, Inc. All rights reserved.
+ *
+ * Device-mapper snapshot exception store.
+ *
+ * This file is released under the GPL.
+ */
+
+#ifndef _LINUX_DM_EXCEPTION_STORE
+#define _LINUX_DM_EXCEPTION_STORE
+
+#include <linux/blkdev.h>
+#include <linux/device-mapper.h>
+
+/*
+ * The snapshot code deals with largish chunks of the disk at a
+ * time. Typically 32k - 512k.
+ */
+typedef sector_t chunk_t;
+
+/*
+ * An exception is used where an old chunk of data has been
+ * replaced by a new one.
+ * If chunk_t is 64 bits in size, the top 8 bits of new_chunk hold the number
+ * of chunks that follow contiguously. Remaining bits hold the number of the
+ * chunk within the device.
+ */
+struct dm_exception {
+ struct list_head hash_list;
+
+ chunk_t old_chunk;
+ chunk_t new_chunk;
+};
+
+/*
+ * Abstraction to handle the meta/layout of exception stores (the
+ * COW device).
+ */
+struct dm_exception_store;
+struct dm_exception_store_type {
+ const char *name;
+ struct module *module;
+
+ int (*ctr) (struct dm_exception_store *store,
+ unsigned argc, char **argv);
+
+ /*
+ * Destroys this object when you've finished with it.
+ */
+ void (*dtr) (struct dm_exception_store *store);
+
+ /*
+ * The target shouldn't read the COW device until this is
+ * called. As exceptions are read from the COW, they are
+ * reported back via the callback.
+ */
+ int (*read_metadata) (struct dm_exception_store *store,
+ int (*callback)(void *callback_context,
+ chunk_t old, chunk_t new),
+ void *callback_context);
+
+ /*
+ * Find somewhere to store the next exception.
+ */
+ int (*prepare_exception) (struct dm_exception_store *store,
+ struct dm_exception *e);
+
+ /*
+ * Update the metadata with this exception.
+ */
+ void (*commit_exception) (struct dm_exception_store *store,
+ struct dm_exception *e,
+ void (*callback) (void *, int success),
+ void *callback_context);
+
+ /*
+ * Returns 0 if the exception store is empty.
+ *
+ * If there are exceptions still to be merged, sets
+ * *last_old_chunk and *last_new_chunk to the most recent
+ * still-to-be-merged chunk and returns the number of
+ * consecutive previous ones.
+ */
+ int (*prepare_merge) (struct dm_exception_store *store,
+ chunk_t *last_old_chunk, chunk_t *last_new_chunk);
+
+ /*
+ * Clear the last n exceptions.
+ * nr_merged must be <= the value returned by prepare_merge.
+ */
+ int (*commit_merge) (struct dm_exception_store *store, int nr_merged);
+
+ /*
+ * The snapshot is invalid, note this in the metadata.
+ */
+ void (*drop_snapshot) (struct dm_exception_store *store);
+
+ unsigned (*status) (struct dm_exception_store *store,
+ status_type_t status, char *result,
+ unsigned maxlen);
+
+ /*
+ * Return how full the snapshot is.
+ */
+ void (*usage) (struct dm_exception_store *store,
+ sector_t *total_sectors, sector_t *sectors_allocated,
+ sector_t *metadata_sectors);
+
+ /* For internal device-mapper use only. */
+ struct list_head list;
+};
+
+struct dm_snapshot;
+
+struct dm_exception_store {
+ struct dm_exception_store_type *type;
+ struct dm_snapshot *snap;
+
+ /* Size of data blocks saved - must be a power of 2 */
+ unsigned chunk_size;
+ unsigned chunk_mask;
+ unsigned chunk_shift;
+
+ void *context;
+};
+
+/*
+ * Obtain the origin or cow device used by a given snapshot.
+ */
+struct dm_dev *dm_snap_origin(struct dm_snapshot *snap);
+struct dm_dev *dm_snap_cow(struct dm_snapshot *snap);
+
+/*
+ * Funtions to manipulate consecutive chunks
+ */
+# if defined(CONFIG_LBDAF) || (BITS_PER_LONG == 64)
+# define DM_CHUNK_CONSECUTIVE_BITS 8
+# define DM_CHUNK_NUMBER_BITS 56
+
+static inline chunk_t dm_chunk_number(chunk_t chunk)
+{
+ return chunk & (chunk_t)((1ULL << DM_CHUNK_NUMBER_BITS) - 1ULL);
+}
+
+static inline unsigned dm_consecutive_chunk_count(struct dm_exception *e)
+{
+ return e->new_chunk >> DM_CHUNK_NUMBER_BITS;
+}
+
+static inline void dm_consecutive_chunk_count_inc(struct dm_exception *e)
+{
+ e->new_chunk += (1ULL << DM_CHUNK_NUMBER_BITS);
+
+ BUG_ON(!dm_consecutive_chunk_count(e));
+}
+
+static inline void dm_consecutive_chunk_count_dec(struct dm_exception *e)
+{
+ BUG_ON(!dm_consecutive_chunk_count(e));
+
+ e->new_chunk -= (1ULL << DM_CHUNK_NUMBER_BITS);
+}
+
+# else
+# define DM_CHUNK_CONSECUTIVE_BITS 0
+
+static inline chunk_t dm_chunk_number(chunk_t chunk)
+{
+ return chunk;
+}
+
+static inline unsigned dm_consecutive_chunk_count(struct dm_exception *e)
+{
+ return 0;
+}
+
+static inline void dm_consecutive_chunk_count_inc(struct dm_exception *e)
+{
+}
+
+static inline void dm_consecutive_chunk_count_dec(struct dm_exception *e)
+{
+}
+
+# endif
+
+/*
+ * Return the number of sectors in the device.
+ */
+static inline sector_t get_dev_size(struct block_device *bdev)
+{
+ return i_size_read(bdev->bd_inode) >> SECTOR_SHIFT;
+}
+
+static inline chunk_t sector_to_chunk(struct dm_exception_store *store,
+ sector_t sector)
+{
+ return sector >> store->chunk_shift;
+}
+
+int dm_exception_store_type_register(struct dm_exception_store_type *type);
+int dm_exception_store_type_unregister(struct dm_exception_store_type *type);
+
+int dm_exception_store_set_chunk_size(struct dm_exception_store *store,
+ unsigned chunk_size,
+ char **error);
+
+int dm_exception_store_create(struct dm_target *ti, int argc, char **argv,
+ struct dm_snapshot *snap,
+ unsigned *args_used,
+ struct dm_exception_store **store);
+void dm_exception_store_destroy(struct dm_exception_store *store);
+
+int dm_exception_store_init(void);
+void dm_exception_store_exit(void);
+
+/*
+ * Two exception store implementations.
+ */
+int dm_persistent_snapshot_init(void);
+void dm_persistent_snapshot_exit(void);
+
+int dm_transient_snapshot_init(void);
+void dm_transient_snapshot_exit(void);
+
+#endif /* _LINUX_DM_EXCEPTION_STORE */
diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c
new file mode 100644
index 000000000..b257e4687
--- /dev/null
+++ b/drivers/md/dm-flakey.c
@@ -0,0 +1,447 @@
+/*
+ * Copyright (C) 2003 Sistina Software (UK) Limited.
+ * Copyright (C) 2004, 2010-2011 Red Hat, Inc. All rights reserved.
+ *
+ * This file is released under the GPL.
+ */
+
+#include <linux/device-mapper.h>
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/blkdev.h>
+#include <linux/bio.h>
+#include <linux/slab.h>
+
+#define DM_MSG_PREFIX "flakey"
+
+#define all_corrupt_bio_flags_match(bio, fc) \
+ (((bio)->bi_rw & (fc)->corrupt_bio_flags) == (fc)->corrupt_bio_flags)
+
+/*
+ * Flakey: Used for testing only, simulates intermittent,
+ * catastrophic device failure.
+ */
+struct flakey_c {
+ struct dm_dev *dev;
+ unsigned long start_time;
+ sector_t start;
+ unsigned up_interval;
+ unsigned down_interval;
+ unsigned long flags;
+ unsigned corrupt_bio_byte;
+ unsigned corrupt_bio_rw;
+ unsigned corrupt_bio_value;
+ unsigned corrupt_bio_flags;
+};
+
+enum feature_flag_bits {
+ DROP_WRITES
+};
+
+struct per_bio_data {
+ bool bio_submitted;
+};
+
+static int parse_features(struct dm_arg_set *as, struct flakey_c *fc,
+ struct dm_target *ti)
+{
+ int r;
+ unsigned argc;
+ const char *arg_name;
+
+ static struct dm_arg _args[] = {
+ {0, 6, "Invalid number of feature args"},
+ {1, UINT_MAX, "Invalid corrupt bio byte"},
+ {0, 255, "Invalid corrupt value to write into bio byte (0-255)"},
+ {0, UINT_MAX, "Invalid corrupt bio flags mask"},
+ };
+
+ /* No feature arguments supplied. */
+ if (!as->argc)
+ return 0;
+
+ r = dm_read_arg_group(_args, as, &argc, &ti->error);
+ if (r)
+ return r;
+
+ while (argc) {
+ arg_name = dm_shift_arg(as);
+ argc--;
+
+ /*
+ * drop_writes
+ */
+ if (!strcasecmp(arg_name, "drop_writes")) {
+ if (test_and_set_bit(DROP_WRITES, &fc->flags)) {
+ ti->error = "Feature drop_writes duplicated";
+ return -EINVAL;
+ }
+
+ continue;
+ }
+
+ /*
+ * corrupt_bio_byte <Nth_byte> <direction> <value> <bio_flags>
+ */
+ if (!strcasecmp(arg_name, "corrupt_bio_byte")) {
+ if (!argc) {
+ ti->error = "Feature corrupt_bio_byte requires parameters";
+ return -EINVAL;
+ }
+
+ r = dm_read_arg(_args + 1, as, &fc->corrupt_bio_byte, &ti->error);
+ if (r)
+ return r;
+ argc--;
+
+ /*
+ * Direction r or w?
+ */
+ arg_name = dm_shift_arg(as);
+ if (!strcasecmp(arg_name, "w"))
+ fc->corrupt_bio_rw = WRITE;
+ else if (!strcasecmp(arg_name, "r"))
+ fc->corrupt_bio_rw = READ;
+ else {
+ ti->error = "Invalid corrupt bio direction (r or w)";
+ return -EINVAL;
+ }
+ argc--;
+
+ /*
+ * Value of byte (0-255) to write in place of correct one.
+ */
+ r = dm_read_arg(_args + 2, as, &fc->corrupt_bio_value, &ti->error);
+ if (r)
+ return r;
+ argc--;
+
+ /*
+ * Only corrupt bios with these flags set.
+ */
+ r = dm_read_arg(_args + 3, as, &fc->corrupt_bio_flags, &ti->error);
+ if (r)
+ return r;
+ argc--;
+
+ continue;
+ }
+
+ ti->error = "Unrecognised flakey feature requested";
+ return -EINVAL;
+ }
+
+ if (test_bit(DROP_WRITES, &fc->flags) && (fc->corrupt_bio_rw == WRITE)) {
+ ti->error = "drop_writes is incompatible with corrupt_bio_byte with the WRITE flag set";
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/*
+ * Construct a flakey mapping:
+ * <dev_path> <offset> <up interval> <down interval> [<#feature args> [<arg>]*]
+ *
+ * Feature args:
+ * [drop_writes]
+ * [corrupt_bio_byte <Nth_byte> <direction> <value> <bio_flags>]
+ *
+ * Nth_byte starts from 1 for the first byte.
+ * Direction is r for READ or w for WRITE.
+ * bio_flags is ignored if 0.
+ */
+static int flakey_ctr(struct dm_target *ti, unsigned int argc, char **argv)
+{
+ static struct dm_arg _args[] = {
+ {0, UINT_MAX, "Invalid up interval"},
+ {0, UINT_MAX, "Invalid down interval"},
+ };
+
+ int r;
+ struct flakey_c *fc;
+ unsigned long long tmpll;
+ struct dm_arg_set as;
+ const char *devname;
+ char dummy;
+
+ as.argc = argc;
+ as.argv = argv;
+
+ if (argc < 4) {
+ ti->error = "Invalid argument count";
+ return -EINVAL;
+ }
+
+ fc = kzalloc(sizeof(*fc), GFP_KERNEL);
+ if (!fc) {
+ ti->error = "Cannot allocate context";
+ return -ENOMEM;
+ }
+ fc->start_time = jiffies;
+
+ devname = dm_shift_arg(&as);
+
+ if (sscanf(dm_shift_arg(&as), "%llu%c", &tmpll, &dummy) != 1) {
+ ti->error = "Invalid device sector";
+ goto bad;
+ }
+ fc->start = tmpll;
+
+ r = dm_read_arg(_args, &as, &fc->up_interval, &ti->error);
+ if (r)
+ goto bad;
+
+ r = dm_read_arg(_args, &as, &fc->down_interval, &ti->error);
+ if (r)
+ goto bad;
+
+ if (!(fc->up_interval + fc->down_interval)) {
+ ti->error = "Total (up + down) interval is zero";
+ goto bad;
+ }
+
+ if (fc->up_interval + fc->down_interval < fc->up_interval) {
+ ti->error = "Interval overflow";
+ goto bad;
+ }
+
+ r = parse_features(&as, fc, ti);
+ if (r)
+ goto bad;
+
+ if (dm_get_device(ti, devname, dm_table_get_mode(ti->table), &fc->dev)) {
+ ti->error = "Device lookup failed";
+ goto bad;
+ }
+
+ ti->num_flush_bios = 1;
+ ti->num_discard_bios = 1;
+ ti->per_bio_data_size = sizeof(struct per_bio_data);
+ ti->private = fc;
+ return 0;
+
+bad:
+ kfree(fc);
+ return -EINVAL;
+}
+
+static void flakey_dtr(struct dm_target *ti)
+{
+ struct flakey_c *fc = ti->private;
+
+ dm_put_device(ti, fc->dev);
+ kfree(fc);
+}
+
+static sector_t flakey_map_sector(struct dm_target *ti, sector_t bi_sector)
+{
+ struct flakey_c *fc = ti->private;
+
+ return fc->start + dm_target_offset(ti, bi_sector);
+}
+
+static void flakey_map_bio(struct dm_target *ti, struct bio *bio)
+{
+ struct flakey_c *fc = ti->private;
+
+ bio->bi_bdev = fc->dev->bdev;
+ if (bio_sectors(bio))
+ bio->bi_iter.bi_sector =
+ flakey_map_sector(ti, bio->bi_iter.bi_sector);
+}
+
+static void corrupt_bio_data(struct bio *bio, struct flakey_c *fc)
+{
+ unsigned bio_bytes = bio_cur_bytes(bio);
+ char *data = bio_data(bio);
+
+ /*
+ * Overwrite the Nth byte of the data returned.
+ */
+ if (data && bio_bytes >= fc->corrupt_bio_byte) {
+ data[fc->corrupt_bio_byte - 1] = fc->corrupt_bio_value;
+
+ DMDEBUG("Corrupting data bio=%p by writing %u to byte %u "
+ "(rw=%c bi_rw=%lu bi_sector=%llu cur_bytes=%u)\n",
+ bio, fc->corrupt_bio_value, fc->corrupt_bio_byte,
+ (bio_data_dir(bio) == WRITE) ? 'w' : 'r', bio->bi_rw,
+ (unsigned long long)bio->bi_iter.bi_sector, bio_bytes);
+ }
+}
+
+static int flakey_map(struct dm_target *ti, struct bio *bio)
+{
+ struct flakey_c *fc = ti->private;
+ unsigned elapsed;
+ struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data));
+ pb->bio_submitted = false;
+
+ /* Are we alive ? */
+ elapsed = (jiffies - fc->start_time) / HZ;
+ if (elapsed % (fc->up_interval + fc->down_interval) >= fc->up_interval) {
+ /*
+ * Flag this bio as submitted while down.
+ */
+ pb->bio_submitted = true;
+
+ /*
+ * Map reads as normal.
+ */
+ if (bio_data_dir(bio) == READ)
+ goto map_bio;
+
+ /*
+ * Drop writes?
+ */
+ if (test_bit(DROP_WRITES, &fc->flags)) {
+ bio_endio(bio, 0);
+ return DM_MAPIO_SUBMITTED;
+ }
+
+ /*
+ * Corrupt matching writes.
+ */
+ if (fc->corrupt_bio_byte && (fc->corrupt_bio_rw == WRITE)) {
+ if (all_corrupt_bio_flags_match(bio, fc))
+ corrupt_bio_data(bio, fc);
+ goto map_bio;
+ }
+
+ /*
+ * By default, error all I/O.
+ */
+ return -EIO;
+ }
+
+map_bio:
+ flakey_map_bio(ti, bio);
+
+ return DM_MAPIO_REMAPPED;
+}
+
+static int flakey_end_io(struct dm_target *ti, struct bio *bio, int error)
+{
+ struct flakey_c *fc = ti->private;
+ struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data));
+
+ /*
+ * Corrupt successful READs while in down state.
+ * If flags were specified, only corrupt those that match.
+ */
+ if (fc->corrupt_bio_byte && !error && pb->bio_submitted &&
+ (bio_data_dir(bio) == READ) && (fc->corrupt_bio_rw == READ) &&
+ all_corrupt_bio_flags_match(bio, fc))
+ corrupt_bio_data(bio, fc);
+
+ return error;
+}
+
+static void flakey_status(struct dm_target *ti, status_type_t type,
+ unsigned status_flags, char *result, unsigned maxlen)
+{
+ unsigned sz = 0;
+ struct flakey_c *fc = ti->private;
+ unsigned drop_writes;
+
+ switch (type) {
+ case STATUSTYPE_INFO:
+ result[0] = '\0';
+ break;
+
+ case STATUSTYPE_TABLE:
+ DMEMIT("%s %llu %u %u ", fc->dev->name,
+ (unsigned long long)fc->start, fc->up_interval,
+ fc->down_interval);
+
+ drop_writes = test_bit(DROP_WRITES, &fc->flags);
+ DMEMIT("%u ", drop_writes + (fc->corrupt_bio_byte > 0) * 5);
+
+ if (drop_writes)
+ DMEMIT("drop_writes ");
+
+ if (fc->corrupt_bio_byte)
+ DMEMIT("corrupt_bio_byte %u %c %u %u ",
+ fc->corrupt_bio_byte,
+ (fc->corrupt_bio_rw == WRITE) ? 'w' : 'r',
+ fc->corrupt_bio_value, fc->corrupt_bio_flags);
+
+ break;
+ }
+}
+
+static int flakey_ioctl(struct dm_target *ti, unsigned int cmd, unsigned long arg)
+{
+ struct flakey_c *fc = ti->private;
+ struct dm_dev *dev = fc->dev;
+ int r = 0;
+
+ /*
+ * Only pass ioctls through if the device sizes match exactly.
+ */
+ if (fc->start ||
+ ti->len != i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT)
+ r = scsi_verify_blk_ioctl(NULL, cmd);
+
+ return r ? : __blkdev_driver_ioctl(dev->bdev, dev->mode, cmd, arg);
+}
+
+static int flakey_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
+ struct bio_vec *biovec, int max_size)
+{
+ struct flakey_c *fc = ti->private;
+ struct request_queue *q = bdev_get_queue(fc->dev->bdev);
+
+ if (!q->merge_bvec_fn)
+ return max_size;
+
+ bvm->bi_bdev = fc->dev->bdev;
+ bvm->bi_sector = flakey_map_sector(ti, bvm->bi_sector);
+
+ return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
+}
+
+static int flakey_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, void *data)
+{
+ struct flakey_c *fc = ti->private;
+
+ return fn(ti, fc->dev, fc->start, ti->len, data);
+}
+
+static struct target_type flakey_target = {
+ .name = "flakey",
+ .version = {1, 3, 1},
+ .module = THIS_MODULE,
+ .ctr = flakey_ctr,
+ .dtr = flakey_dtr,
+ .map = flakey_map,
+ .end_io = flakey_end_io,
+ .status = flakey_status,
+ .ioctl = flakey_ioctl,
+ .merge = flakey_merge,
+ .iterate_devices = flakey_iterate_devices,
+};
+
+static int __init dm_flakey_init(void)
+{
+ int r = dm_register_target(&flakey_target);
+
+ if (r < 0)
+ DMERR("register failed %d", r);
+
+ return r;
+}
+
+static void __exit dm_flakey_exit(void)
+{
+ dm_unregister_target(&flakey_target);
+}
+
+/* Module hooks */
+module_init(dm_flakey_init);
+module_exit(dm_flakey_exit);
+
+MODULE_DESCRIPTION(DM_NAME " flakey target");
+MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
+MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
new file mode 100644
index 000000000..74adcd2c9
--- /dev/null
+++ b/drivers/md/dm-io.c
@@ -0,0 +1,540 @@
+/*
+ * Copyright (C) 2003 Sistina Software
+ * Copyright (C) 2006 Red Hat GmbH
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm.h"
+
+#include <linux/device-mapper.h>
+
+#include <linux/bio.h>
+#include <linux/completion.h>
+#include <linux/mempool.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/dm-io.h>
+
+#define DM_MSG_PREFIX "io"
+
+#define DM_IO_MAX_REGIONS BITS_PER_LONG
+
+struct dm_io_client {
+ mempool_t *pool;
+ struct bio_set *bios;
+};
+
+/*
+ * Aligning 'struct io' reduces the number of bits required to store
+ * its address. Refer to store_io_and_region_in_bio() below.
+ */
+struct io {
+ unsigned long error_bits;
+ atomic_t count;
+ struct dm_io_client *client;
+ io_notify_fn callback;
+ void *context;
+ void *vma_invalidate_address;
+ unsigned long vma_invalidate_size;
+} __attribute__((aligned(DM_IO_MAX_REGIONS)));
+
+static struct kmem_cache *_dm_io_cache;
+
+/*
+ * Create a client with mempool and bioset.
+ */
+struct dm_io_client *dm_io_client_create(void)
+{
+ struct dm_io_client *client;
+ unsigned min_ios = dm_get_reserved_bio_based_ios();
+
+ client = kmalloc(sizeof(*client), GFP_KERNEL);
+ if (!client)
+ return ERR_PTR(-ENOMEM);
+
+ client->pool = mempool_create_slab_pool(min_ios, _dm_io_cache);
+ if (!client->pool)
+ goto bad;
+
+ client->bios = bioset_create(min_ios, 0);
+ if (!client->bios)
+ goto bad;
+
+ return client;
+
+ bad:
+ if (client->pool)
+ mempool_destroy(client->pool);
+ kfree(client);
+ return ERR_PTR(-ENOMEM);
+}
+EXPORT_SYMBOL(dm_io_client_create);
+
+void dm_io_client_destroy(struct dm_io_client *client)
+{
+ mempool_destroy(client->pool);
+ bioset_free(client->bios);
+ kfree(client);
+}
+EXPORT_SYMBOL(dm_io_client_destroy);
+
+/*-----------------------------------------------------------------
+ * We need to keep track of which region a bio is doing io for.
+ * To avoid a memory allocation to store just 5 or 6 bits, we
+ * ensure the 'struct io' pointer is aligned so enough low bits are
+ * always zero and then combine it with the region number directly in
+ * bi_private.
+ *---------------------------------------------------------------*/
+static void store_io_and_region_in_bio(struct bio *bio, struct io *io,
+ unsigned region)
+{
+ if (unlikely(!IS_ALIGNED((unsigned long)io, DM_IO_MAX_REGIONS))) {
+ DMCRIT("Unaligned struct io pointer %p", io);
+ BUG();
+ }
+
+ bio->bi_private = (void *)((unsigned long)io | region);
+}
+
+static void retrieve_io_and_region_from_bio(struct bio *bio, struct io **io,
+ unsigned *region)
+{
+ unsigned long val = (unsigned long)bio->bi_private;
+
+ *io = (void *)(val & -(unsigned long)DM_IO_MAX_REGIONS);
+ *region = val & (DM_IO_MAX_REGIONS - 1);
+}
+
+/*-----------------------------------------------------------------
+ * We need an io object to keep track of the number of bios that
+ * have been dispatched for a particular io.
+ *---------------------------------------------------------------*/
+static void complete_io(struct io *io)
+{
+ unsigned long error_bits = io->error_bits;
+ io_notify_fn fn = io->callback;
+ void *context = io->context;
+
+ if (io->vma_invalidate_size)
+ invalidate_kernel_vmap_range(io->vma_invalidate_address,
+ io->vma_invalidate_size);
+
+ mempool_free(io, io->client->pool);
+ fn(error_bits, context);
+}
+
+static void dec_count(struct io *io, unsigned int region, int error)
+{
+ if (error)
+ set_bit(region, &io->error_bits);
+
+ if (atomic_dec_and_test(&io->count))
+ complete_io(io);
+}
+
+static void endio(struct bio *bio, int error)
+{
+ struct io *io;
+ unsigned region;
+
+ if (error && bio_data_dir(bio) == READ)
+ zero_fill_bio(bio);
+
+ /*
+ * The bio destructor in bio_put() may use the io object.
+ */
+ retrieve_io_and_region_from_bio(bio, &io, &region);
+
+ bio_put(bio);
+
+ dec_count(io, region, error);
+}
+
+/*-----------------------------------------------------------------
+ * These little objects provide an abstraction for getting a new
+ * destination page for io.
+ *---------------------------------------------------------------*/
+struct dpages {
+ void (*get_page)(struct dpages *dp,
+ struct page **p, unsigned long *len, unsigned *offset);
+ void (*next_page)(struct dpages *dp);
+
+ unsigned context_u;
+ void *context_ptr;
+
+ void *vma_invalidate_address;
+ unsigned long vma_invalidate_size;
+};
+
+/*
+ * Functions for getting the pages from a list.
+ */
+static void list_get_page(struct dpages *dp,
+ struct page **p, unsigned long *len, unsigned *offset)
+{
+ unsigned o = dp->context_u;
+ struct page_list *pl = (struct page_list *) dp->context_ptr;
+
+ *p = pl->page;
+ *len = PAGE_SIZE - o;
+ *offset = o;
+}
+
+static void list_next_page(struct dpages *dp)
+{
+ struct page_list *pl = (struct page_list *) dp->context_ptr;
+ dp->context_ptr = pl->next;
+ dp->context_u = 0;
+}
+
+static void list_dp_init(struct dpages *dp, struct page_list *pl, unsigned offset)
+{
+ dp->get_page = list_get_page;
+ dp->next_page = list_next_page;
+ dp->context_u = offset;
+ dp->context_ptr = pl;
+}
+
+/*
+ * Functions for getting the pages from a bvec.
+ */
+static void bio_get_page(struct dpages *dp, struct page **p,
+ unsigned long *len, unsigned *offset)
+{
+ struct bio_vec *bvec = dp->context_ptr;
+ *p = bvec->bv_page;
+ *len = bvec->bv_len - dp->context_u;
+ *offset = bvec->bv_offset + dp->context_u;
+}
+
+static void bio_next_page(struct dpages *dp)
+{
+ struct bio_vec *bvec = dp->context_ptr;
+ dp->context_ptr = bvec + 1;
+ dp->context_u = 0;
+}
+
+static void bio_dp_init(struct dpages *dp, struct bio *bio)
+{
+ dp->get_page = bio_get_page;
+ dp->next_page = bio_next_page;
+ dp->context_ptr = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
+ dp->context_u = bio->bi_iter.bi_bvec_done;
+}
+
+/*
+ * Functions for getting the pages from a VMA.
+ */
+static void vm_get_page(struct dpages *dp,
+ struct page **p, unsigned long *len, unsigned *offset)
+{
+ *p = vmalloc_to_page(dp->context_ptr);
+ *offset = dp->context_u;
+ *len = PAGE_SIZE - dp->context_u;
+}
+
+static void vm_next_page(struct dpages *dp)
+{
+ dp->context_ptr += PAGE_SIZE - dp->context_u;
+ dp->context_u = 0;
+}
+
+static void vm_dp_init(struct dpages *dp, void *data)
+{
+ dp->get_page = vm_get_page;
+ dp->next_page = vm_next_page;
+ dp->context_u = ((unsigned long) data) & (PAGE_SIZE - 1);
+ dp->context_ptr = data;
+}
+
+/*
+ * Functions for getting the pages from kernel memory.
+ */
+static void km_get_page(struct dpages *dp, struct page **p, unsigned long *len,
+ unsigned *offset)
+{
+ *p = virt_to_page(dp->context_ptr);
+ *offset = dp->context_u;
+ *len = PAGE_SIZE - dp->context_u;
+}
+
+static void km_next_page(struct dpages *dp)
+{
+ dp->context_ptr += PAGE_SIZE - dp->context_u;
+ dp->context_u = 0;
+}
+
+static void km_dp_init(struct dpages *dp, void *data)
+{
+ dp->get_page = km_get_page;
+ dp->next_page = km_next_page;
+ dp->context_u = ((unsigned long) data) & (PAGE_SIZE - 1);
+ dp->context_ptr = data;
+}
+
+/*-----------------------------------------------------------------
+ * IO routines that accept a list of pages.
+ *---------------------------------------------------------------*/
+static void do_region(int rw, unsigned region, struct dm_io_region *where,
+ struct dpages *dp, struct io *io)
+{
+ struct bio *bio;
+ struct page *page;
+ unsigned long len;
+ unsigned offset;
+ unsigned num_bvecs;
+ sector_t remaining = where->count;
+ struct request_queue *q = bdev_get_queue(where->bdev);
+ unsigned short logical_block_size = queue_logical_block_size(q);
+ sector_t num_sectors;
+ unsigned int uninitialized_var(special_cmd_max_sectors);
+
+ /*
+ * Reject unsupported discard and write same requests.
+ */
+ if (rw & REQ_DISCARD)
+ special_cmd_max_sectors = q->limits.max_discard_sectors;
+ else if (rw & REQ_WRITE_SAME)
+ special_cmd_max_sectors = q->limits.max_write_same_sectors;
+ if ((rw & (REQ_DISCARD | REQ_WRITE_SAME)) && special_cmd_max_sectors == 0) {
+ dec_count(io, region, -EOPNOTSUPP);
+ return;
+ }
+
+ /*
+ * where->count may be zero if rw holds a flush and we need to
+ * send a zero-sized flush.
+ */
+ do {
+ /*
+ * Allocate a suitably sized-bio.
+ */
+ if ((rw & REQ_DISCARD) || (rw & REQ_WRITE_SAME))
+ num_bvecs = 1;
+ else
+ num_bvecs = min_t(int, bio_get_nr_vecs(where->bdev),
+ dm_sector_div_up(remaining, (PAGE_SIZE >> SECTOR_SHIFT)));
+
+ bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, io->client->bios);
+ bio->bi_iter.bi_sector = where->sector + (where->count - remaining);
+ bio->bi_bdev = where->bdev;
+ bio->bi_end_io = endio;
+ store_io_and_region_in_bio(bio, io, region);
+
+ if (rw & REQ_DISCARD) {
+ num_sectors = min_t(sector_t, special_cmd_max_sectors, remaining);
+ bio->bi_iter.bi_size = num_sectors << SECTOR_SHIFT;
+ remaining -= num_sectors;
+ } else if (rw & REQ_WRITE_SAME) {
+ /*
+ * WRITE SAME only uses a single page.
+ */
+ dp->get_page(dp, &page, &len, &offset);
+ bio_add_page(bio, page, logical_block_size, offset);
+ num_sectors = min_t(sector_t, special_cmd_max_sectors, remaining);
+ bio->bi_iter.bi_size = num_sectors << SECTOR_SHIFT;
+
+ offset = 0;
+ remaining -= num_sectors;
+ dp->next_page(dp);
+ } else while (remaining) {
+ /*
+ * Try and add as many pages as possible.
+ */
+ dp->get_page(dp, &page, &len, &offset);
+ len = min(len, to_bytes(remaining));
+ if (!bio_add_page(bio, page, len, offset))
+ break;
+
+ offset = 0;
+ remaining -= to_sector(len);
+ dp->next_page(dp);
+ }
+
+ atomic_inc(&io->count);
+ submit_bio(rw, bio);
+ } while (remaining);
+}
+
+static void dispatch_io(int rw, unsigned int num_regions,
+ struct dm_io_region *where, struct dpages *dp,
+ struct io *io, int sync)
+{
+ int i;
+ struct dpages old_pages = *dp;
+
+ BUG_ON(num_regions > DM_IO_MAX_REGIONS);
+
+ if (sync)
+ rw |= REQ_SYNC;
+
+ /*
+ * For multiple regions we need to be careful to rewind
+ * the dp object for each call to do_region.
+ */
+ for (i = 0; i < num_regions; i++) {
+ *dp = old_pages;
+ if (where[i].count || (rw & REQ_FLUSH))
+ do_region(rw, i, where + i, dp, io);
+ }
+
+ /*
+ * Drop the extra reference that we were holding to avoid
+ * the io being completed too early.
+ */
+ dec_count(io, 0, 0);
+}
+
+struct sync_io {
+ unsigned long error_bits;
+ struct completion wait;
+};
+
+static void sync_io_complete(unsigned long error, void *context)
+{
+ struct sync_io *sio = context;
+
+ sio->error_bits = error;
+ complete(&sio->wait);
+}
+
+static int sync_io(struct dm_io_client *client, unsigned int num_regions,
+ struct dm_io_region *where, int rw, struct dpages *dp,
+ unsigned long *error_bits)
+{
+ struct io *io;
+ struct sync_io sio;
+
+ if (num_regions > 1 && (rw & RW_MASK) != WRITE) {
+ WARN_ON(1);
+ return -EIO;
+ }
+
+ init_completion(&sio.wait);
+
+ io = mempool_alloc(client->pool, GFP_NOIO);
+ io->error_bits = 0;
+ atomic_set(&io->count, 1); /* see dispatch_io() */
+ io->client = client;
+ io->callback = sync_io_complete;
+ io->context = &sio;
+
+ io->vma_invalidate_address = dp->vma_invalidate_address;
+ io->vma_invalidate_size = dp->vma_invalidate_size;
+
+ dispatch_io(rw, num_regions, where, dp, io, 1);
+
+ wait_for_completion_io(&sio.wait);
+
+ if (error_bits)
+ *error_bits = sio.error_bits;
+
+ return sio.error_bits ? -EIO : 0;
+}
+
+static int async_io(struct dm_io_client *client, unsigned int num_regions,
+ struct dm_io_region *where, int rw, struct dpages *dp,
+ io_notify_fn fn, void *context)
+{
+ struct io *io;
+
+ if (num_regions > 1 && (rw & RW_MASK) != WRITE) {
+ WARN_ON(1);
+ fn(1, context);
+ return -EIO;
+ }
+
+ io = mempool_alloc(client->pool, GFP_NOIO);
+ io->error_bits = 0;
+ atomic_set(&io->count, 1); /* see dispatch_io() */
+ io->client = client;
+ io->callback = fn;
+ io->context = context;
+
+ io->vma_invalidate_address = dp->vma_invalidate_address;
+ io->vma_invalidate_size = dp->vma_invalidate_size;
+
+ dispatch_io(rw, num_regions, where, dp, io, 0);
+ return 0;
+}
+
+static int dp_init(struct dm_io_request *io_req, struct dpages *dp,
+ unsigned long size)
+{
+ /* Set up dpages based on memory type */
+
+ dp->vma_invalidate_address = NULL;
+ dp->vma_invalidate_size = 0;
+
+ switch (io_req->mem.type) {
+ case DM_IO_PAGE_LIST:
+ list_dp_init(dp, io_req->mem.ptr.pl, io_req->mem.offset);
+ break;
+
+ case DM_IO_BIO:
+ bio_dp_init(dp, io_req->mem.ptr.bio);
+ break;
+
+ case DM_IO_VMA:
+ flush_kernel_vmap_range(io_req->mem.ptr.vma, size);
+ if ((io_req->bi_rw & RW_MASK) == READ) {
+ dp->vma_invalidate_address = io_req->mem.ptr.vma;
+ dp->vma_invalidate_size = size;
+ }
+ vm_dp_init(dp, io_req->mem.ptr.vma);
+ break;
+
+ case DM_IO_KMEM:
+ km_dp_init(dp, io_req->mem.ptr.addr);
+ break;
+
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/*
+ * New collapsed (a)synchronous interface.
+ *
+ * If the IO is asynchronous (i.e. it has notify.fn), you must either unplug
+ * the queue with blk_unplug() some time later or set REQ_SYNC in io_req->bi_rw.
+ * If you fail to do one of these, the IO will be submitted to the disk after
+ * q->unplug_delay, which defaults to 3ms in blk-settings.c.
+ */
+int dm_io(struct dm_io_request *io_req, unsigned num_regions,
+ struct dm_io_region *where, unsigned long *sync_error_bits)
+{
+ int r;
+ struct dpages dp;
+
+ r = dp_init(io_req, &dp, (unsigned long)where->count << SECTOR_SHIFT);
+ if (r)
+ return r;
+
+ if (!io_req->notify.fn)
+ return sync_io(io_req->client, num_regions, where,
+ io_req->bi_rw, &dp, sync_error_bits);
+
+ return async_io(io_req->client, num_regions, where, io_req->bi_rw,
+ &dp, io_req->notify.fn, io_req->notify.context);
+}
+EXPORT_SYMBOL(dm_io);
+
+int __init dm_io_init(void)
+{
+ _dm_io_cache = KMEM_CACHE(io, 0);
+ if (!_dm_io_cache)
+ return -ENOMEM;
+
+ return 0;
+}
+
+void dm_io_exit(void)
+{
+ kmem_cache_destroy(_dm_io_cache);
+ _dm_io_cache = NULL;
+}
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
new file mode 100644
index 000000000..720ceeb7f
--- /dev/null
+++ b/drivers/md/dm-ioctl.c
@@ -0,0 +1,1958 @@
+/*
+ * Copyright (C) 2001, 2002 Sistina Software (UK) Limited.
+ * Copyright (C) 2004 - 2006 Red Hat, Inc. All rights reserved.
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm.h"
+
+#include <linux/module.h>
+#include <linux/vmalloc.h>
+#include <linux/miscdevice.h>
+#include <linux/init.h>
+#include <linux/wait.h>
+#include <linux/slab.h>
+#include <linux/dm-ioctl.h>
+#include <linux/hdreg.h>
+#include <linux/compat.h>
+
+#include <asm/uaccess.h>
+
+#define DM_MSG_PREFIX "ioctl"
+#define DM_DRIVER_EMAIL "dm-devel@redhat.com"
+
+/*-----------------------------------------------------------------
+ * The ioctl interface needs to be able to look up devices by
+ * name or uuid.
+ *---------------------------------------------------------------*/
+struct hash_cell {
+ struct list_head name_list;
+ struct list_head uuid_list;
+
+ char *name;
+ char *uuid;
+ struct mapped_device *md;
+ struct dm_table *new_map;
+};
+
+/*
+ * A dummy definition to make RCU happy.
+ * struct dm_table should never be dereferenced in this file.
+ */
+struct dm_table {
+ int undefined__;
+};
+
+struct vers_iter {
+ size_t param_size;
+ struct dm_target_versions *vers, *old_vers;
+ char *end;
+ uint32_t flags;
+};
+
+
+#define NUM_BUCKETS 64
+#define MASK_BUCKETS (NUM_BUCKETS - 1)
+static struct list_head _name_buckets[NUM_BUCKETS];
+static struct list_head _uuid_buckets[NUM_BUCKETS];
+
+static void dm_hash_remove_all(bool keep_open_devices, bool mark_deferred, bool only_deferred);
+
+/*
+ * Guards access to both hash tables.
+ */
+static DECLARE_RWSEM(_hash_lock);
+
+/*
+ * Protects use of mdptr to obtain hash cell name and uuid from mapped device.
+ */
+static DEFINE_MUTEX(dm_hash_cells_mutex);
+
+static void init_buckets(struct list_head *buckets)
+{
+ unsigned int i;
+
+ for (i = 0; i < NUM_BUCKETS; i++)
+ INIT_LIST_HEAD(buckets + i);
+}
+
+static int dm_hash_init(void)
+{
+ init_buckets(_name_buckets);
+ init_buckets(_uuid_buckets);
+ return 0;
+}
+
+static void dm_hash_exit(void)
+{
+ dm_hash_remove_all(false, false, false);
+}
+
+/*-----------------------------------------------------------------
+ * Hash function:
+ * We're not really concerned with the str hash function being
+ * fast since it's only used by the ioctl interface.
+ *---------------------------------------------------------------*/
+static unsigned int hash_str(const char *str)
+{
+ const unsigned int hash_mult = 2654435387U;
+ unsigned int h = 0;
+
+ while (*str)
+ h = (h + (unsigned int) *str++) * hash_mult;
+
+ return h & MASK_BUCKETS;
+}
+
+/*-----------------------------------------------------------------
+ * Code for looking up a device by name
+ *---------------------------------------------------------------*/
+static struct hash_cell *__get_name_cell(const char *str)
+{
+ struct hash_cell *hc;
+ unsigned int h = hash_str(str);
+
+ list_for_each_entry (hc, _name_buckets + h, name_list)
+ if (!strcmp(hc->name, str)) {
+ dm_get(hc->md);
+ return hc;
+ }
+
+ return NULL;
+}
+
+static struct hash_cell *__get_uuid_cell(const char *str)
+{
+ struct hash_cell *hc;
+ unsigned int h = hash_str(str);
+
+ list_for_each_entry (hc, _uuid_buckets + h, uuid_list)
+ if (!strcmp(hc->uuid, str)) {
+ dm_get(hc->md);
+ return hc;
+ }
+
+ return NULL;
+}
+
+static struct hash_cell *__get_dev_cell(uint64_t dev)
+{
+ struct mapped_device *md;
+ struct hash_cell *hc;
+
+ md = dm_get_md(huge_decode_dev(dev));
+ if (!md)
+ return NULL;
+
+ hc = dm_get_mdptr(md);
+ if (!hc) {
+ dm_put(md);
+ return NULL;
+ }
+
+ return hc;
+}
+
+/*-----------------------------------------------------------------
+ * Inserting, removing and renaming a device.
+ *---------------------------------------------------------------*/
+static struct hash_cell *alloc_cell(const char *name, const char *uuid,
+ struct mapped_device *md)
+{
+ struct hash_cell *hc;
+
+ hc = kmalloc(sizeof(*hc), GFP_KERNEL);
+ if (!hc)
+ return NULL;
+
+ hc->name = kstrdup(name, GFP_KERNEL);
+ if (!hc->name) {
+ kfree(hc);
+ return NULL;
+ }
+
+ if (!uuid)
+ hc->uuid = NULL;
+
+ else {
+ hc->uuid = kstrdup(uuid, GFP_KERNEL);
+ if (!hc->uuid) {
+ kfree(hc->name);
+ kfree(hc);
+ return NULL;
+ }
+ }
+
+ INIT_LIST_HEAD(&hc->name_list);
+ INIT_LIST_HEAD(&hc->uuid_list);
+ hc->md = md;
+ hc->new_map = NULL;
+ return hc;
+}
+
+static void free_cell(struct hash_cell *hc)
+{
+ if (hc) {
+ kfree(hc->name);
+ kfree(hc->uuid);
+ kfree(hc);
+ }
+}
+
+/*
+ * The kdev_t and uuid of a device can never change once it is
+ * initially inserted.
+ */
+static int dm_hash_insert(const char *name, const char *uuid, struct mapped_device *md)
+{
+ struct hash_cell *cell, *hc;
+
+ /*
+ * Allocate the new cells.
+ */
+ cell = alloc_cell(name, uuid, md);
+ if (!cell)
+ return -ENOMEM;
+
+ /*
+ * Insert the cell into both hash tables.
+ */
+ down_write(&_hash_lock);
+ hc = __get_name_cell(name);
+ if (hc) {
+ dm_put(hc->md);
+ goto bad;
+ }
+
+ list_add(&cell->name_list, _name_buckets + hash_str(name));
+
+ if (uuid) {
+ hc = __get_uuid_cell(uuid);
+ if (hc) {
+ list_del(&cell->name_list);
+ dm_put(hc->md);
+ goto bad;
+ }
+ list_add(&cell->uuid_list, _uuid_buckets + hash_str(uuid));
+ }
+ dm_get(md);
+ mutex_lock(&dm_hash_cells_mutex);
+ dm_set_mdptr(md, cell);
+ mutex_unlock(&dm_hash_cells_mutex);
+ up_write(&_hash_lock);
+
+ return 0;
+
+ bad:
+ up_write(&_hash_lock);
+ free_cell(cell);
+ return -EBUSY;
+}
+
+static struct dm_table *__hash_remove(struct hash_cell *hc)
+{
+ struct dm_table *table;
+ int srcu_idx;
+
+ /* remove from the dev hash */
+ list_del(&hc->uuid_list);
+ list_del(&hc->name_list);
+ mutex_lock(&dm_hash_cells_mutex);
+ dm_set_mdptr(hc->md, NULL);
+ mutex_unlock(&dm_hash_cells_mutex);
+
+ table = dm_get_live_table(hc->md, &srcu_idx);
+ if (table)
+ dm_table_event(table);
+ dm_put_live_table(hc->md, srcu_idx);
+
+ table = NULL;
+ if (hc->new_map)
+ table = hc->new_map;
+ dm_put(hc->md);
+ free_cell(hc);
+
+ return table;
+}
+
+static void dm_hash_remove_all(bool keep_open_devices, bool mark_deferred, bool only_deferred)
+{
+ int i, dev_skipped;
+ struct hash_cell *hc;
+ struct mapped_device *md;
+ struct dm_table *t;
+
+retry:
+ dev_skipped = 0;
+
+ down_write(&_hash_lock);
+
+ for (i = 0; i < NUM_BUCKETS; i++) {
+ list_for_each_entry(hc, _name_buckets + i, name_list) {
+ md = hc->md;
+ dm_get(md);
+
+ if (keep_open_devices &&
+ dm_lock_for_deletion(md, mark_deferred, only_deferred)) {
+ dm_put(md);
+ dev_skipped++;
+ continue;
+ }
+
+ t = __hash_remove(hc);
+
+ up_write(&_hash_lock);
+
+ if (t) {
+ dm_sync_table(md);
+ dm_table_destroy(t);
+ }
+ dm_put(md);
+ if (likely(keep_open_devices))
+ dm_destroy(md);
+ else
+ dm_destroy_immediate(md);
+
+ /*
+ * Some mapped devices may be using other mapped
+ * devices, so repeat until we make no further
+ * progress. If a new mapped device is created
+ * here it will also get removed.
+ */
+ goto retry;
+ }
+ }
+
+ up_write(&_hash_lock);
+
+ if (dev_skipped)
+ DMWARN("remove_all left %d open device(s)", dev_skipped);
+}
+
+/*
+ * Set the uuid of a hash_cell that isn't already set.
+ */
+static void __set_cell_uuid(struct hash_cell *hc, char *new_uuid)
+{
+ mutex_lock(&dm_hash_cells_mutex);
+ hc->uuid = new_uuid;
+ mutex_unlock(&dm_hash_cells_mutex);
+
+ list_add(&hc->uuid_list, _uuid_buckets + hash_str(new_uuid));
+}
+
+/*
+ * Changes the name of a hash_cell and returns the old name for
+ * the caller to free.
+ */
+static char *__change_cell_name(struct hash_cell *hc, char *new_name)
+{
+ char *old_name;
+
+ /*
+ * Rename and move the name cell.
+ */
+ list_del(&hc->name_list);
+ old_name = hc->name;
+
+ mutex_lock(&dm_hash_cells_mutex);
+ hc->name = new_name;
+ mutex_unlock(&dm_hash_cells_mutex);
+
+ list_add(&hc->name_list, _name_buckets + hash_str(new_name));
+
+ return old_name;
+}
+
+static struct mapped_device *dm_hash_rename(struct dm_ioctl *param,
+ const char *new)
+{
+ char *new_data, *old_name = NULL;
+ struct hash_cell *hc;
+ struct dm_table *table;
+ struct mapped_device *md;
+ unsigned change_uuid = (param->flags & DM_UUID_FLAG) ? 1 : 0;
+ int srcu_idx;
+
+ /*
+ * duplicate new.
+ */
+ new_data = kstrdup(new, GFP_KERNEL);
+ if (!new_data)
+ return ERR_PTR(-ENOMEM);
+
+ down_write(&_hash_lock);
+
+ /*
+ * Is new free ?
+ */
+ if (change_uuid)
+ hc = __get_uuid_cell(new);
+ else
+ hc = __get_name_cell(new);
+
+ if (hc) {
+ DMWARN("Unable to change %s on mapped device %s to one that "
+ "already exists: %s",
+ change_uuid ? "uuid" : "name",
+ param->name, new);
+ dm_put(hc->md);
+ up_write(&_hash_lock);
+ kfree(new_data);
+ return ERR_PTR(-EBUSY);
+ }
+
+ /*
+ * Is there such a device as 'old' ?
+ */
+ hc = __get_name_cell(param->name);
+ if (!hc) {
+ DMWARN("Unable to rename non-existent device, %s to %s%s",
+ param->name, change_uuid ? "uuid " : "", new);
+ up_write(&_hash_lock);
+ kfree(new_data);
+ return ERR_PTR(-ENXIO);
+ }
+
+ /*
+ * Does this device already have a uuid?
+ */
+ if (change_uuid && hc->uuid) {
+ DMWARN("Unable to change uuid of mapped device %s to %s "
+ "because uuid is already set to %s",
+ param->name, new, hc->uuid);
+ dm_put(hc->md);
+ up_write(&_hash_lock);
+ kfree(new_data);
+ return ERR_PTR(-EINVAL);
+ }
+
+ if (change_uuid)
+ __set_cell_uuid(hc, new_data);
+ else
+ old_name = __change_cell_name(hc, new_data);
+
+ /*
+ * Wake up any dm event waiters.
+ */
+ table = dm_get_live_table(hc->md, &srcu_idx);
+ if (table)
+ dm_table_event(table);
+ dm_put_live_table(hc->md, srcu_idx);
+
+ if (!dm_kobject_uevent(hc->md, KOBJ_CHANGE, param->event_nr))
+ param->flags |= DM_UEVENT_GENERATED_FLAG;
+
+ md = hc->md;
+ up_write(&_hash_lock);
+ kfree(old_name);
+
+ return md;
+}
+
+void dm_deferred_remove(void)
+{
+ dm_hash_remove_all(true, false, true);
+}
+
+/*-----------------------------------------------------------------
+ * Implementation of the ioctl commands
+ *---------------------------------------------------------------*/
+/*
+ * All the ioctl commands get dispatched to functions with this
+ * prototype.
+ */
+typedef int (*ioctl_fn)(struct dm_ioctl *param, size_t param_size);
+
+static int remove_all(struct dm_ioctl *param, size_t param_size)
+{
+ dm_hash_remove_all(true, !!(param->flags & DM_DEFERRED_REMOVE), false);
+ param->data_size = 0;
+ return 0;
+}
+
+/*
+ * Round up the ptr to an 8-byte boundary.
+ */
+#define ALIGN_MASK 7
+static inline void *align_ptr(void *ptr)
+{
+ return (void *) (((size_t) (ptr + ALIGN_MASK)) & ~ALIGN_MASK);
+}
+
+/*
+ * Retrieves the data payload buffer from an already allocated
+ * struct dm_ioctl.
+ */
+static void *get_result_buffer(struct dm_ioctl *param, size_t param_size,
+ size_t *len)
+{
+ param->data_start = align_ptr(param + 1) - (void *) param;
+
+ if (param->data_start < param_size)
+ *len = param_size - param->data_start;
+ else
+ *len = 0;
+
+ return ((void *) param) + param->data_start;
+}
+
+static int list_devices(struct dm_ioctl *param, size_t param_size)
+{
+ unsigned int i;
+ struct hash_cell *hc;
+ size_t len, needed = 0;
+ struct gendisk *disk;
+ struct dm_name_list *nl, *old_nl = NULL;
+
+ down_write(&_hash_lock);
+
+ /*
+ * Loop through all the devices working out how much
+ * space we need.
+ */
+ for (i = 0; i < NUM_BUCKETS; i++) {
+ list_for_each_entry (hc, _name_buckets + i, name_list) {
+ needed += sizeof(struct dm_name_list);
+ needed += strlen(hc->name) + 1;
+ needed += ALIGN_MASK;
+ }
+ }
+
+ /*
+ * Grab our output buffer.
+ */
+ nl = get_result_buffer(param, param_size, &len);
+ if (len < needed) {
+ param->flags |= DM_BUFFER_FULL_FLAG;
+ goto out;
+ }
+ param->data_size = param->data_start + needed;
+
+ nl->dev = 0; /* Flags no data */
+
+ /*
+ * Now loop through filling out the names.
+ */
+ for (i = 0; i < NUM_BUCKETS; i++) {
+ list_for_each_entry (hc, _name_buckets + i, name_list) {
+ if (old_nl)
+ old_nl->next = (uint32_t) ((void *) nl -
+ (void *) old_nl);
+ disk = dm_disk(hc->md);
+ nl->dev = huge_encode_dev(disk_devt(disk));
+ nl->next = 0;
+ strcpy(nl->name, hc->name);
+
+ old_nl = nl;
+ nl = align_ptr(((void *) ++nl) + strlen(hc->name) + 1);
+ }
+ }
+
+ out:
+ up_write(&_hash_lock);
+ return 0;
+}
+
+static void list_version_get_needed(struct target_type *tt, void *needed_param)
+{
+ size_t *needed = needed_param;
+
+ *needed += sizeof(struct dm_target_versions);
+ *needed += strlen(tt->name);
+ *needed += ALIGN_MASK;
+}
+
+static void list_version_get_info(struct target_type *tt, void *param)
+{
+ struct vers_iter *info = param;
+
+ /* Check space - it might have changed since the first iteration */
+ if ((char *)info->vers + sizeof(tt->version) + strlen(tt->name) + 1 >
+ info->end) {
+
+ info->flags = DM_BUFFER_FULL_FLAG;
+ return;
+ }
+
+ if (info->old_vers)
+ info->old_vers->next = (uint32_t) ((void *)info->vers -
+ (void *)info->old_vers);
+ info->vers->version[0] = tt->version[0];
+ info->vers->version[1] = tt->version[1];
+ info->vers->version[2] = tt->version[2];
+ info->vers->next = 0;
+ strcpy(info->vers->name, tt->name);
+
+ info->old_vers = info->vers;
+ info->vers = align_ptr(((void *) ++info->vers) + strlen(tt->name) + 1);
+}
+
+static int list_versions(struct dm_ioctl *param, size_t param_size)
+{
+ size_t len, needed = 0;
+ struct dm_target_versions *vers;
+ struct vers_iter iter_info;
+
+ /*
+ * Loop through all the devices working out how much
+ * space we need.
+ */
+ dm_target_iterate(list_version_get_needed, &needed);
+
+ /*
+ * Grab our output buffer.
+ */
+ vers = get_result_buffer(param, param_size, &len);
+ if (len < needed) {
+ param->flags |= DM_BUFFER_FULL_FLAG;
+ goto out;
+ }
+ param->data_size = param->data_start + needed;
+
+ iter_info.param_size = param_size;
+ iter_info.old_vers = NULL;
+ iter_info.vers = vers;
+ iter_info.flags = 0;
+ iter_info.end = (char *)vers+len;
+
+ /*
+ * Now loop through filling out the names & versions.
+ */
+ dm_target_iterate(list_version_get_info, &iter_info);
+ param->flags |= iter_info.flags;
+
+ out:
+ return 0;
+}
+
+static int check_name(const char *name)
+{
+ if (strchr(name, '/')) {
+ DMWARN("invalid device name");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/*
+ * On successful return, the caller must not attempt to acquire
+ * _hash_lock without first calling dm_put_live_table, because dm_table_destroy
+ * waits for this dm_put_live_table and could be called under this lock.
+ */
+static struct dm_table *dm_get_inactive_table(struct mapped_device *md, int *srcu_idx)
+{
+ struct hash_cell *hc;
+ struct dm_table *table = NULL;
+
+ /* increment rcu count, we don't care about the table pointer */
+ dm_get_live_table(md, srcu_idx);
+
+ down_read(&_hash_lock);
+ hc = dm_get_mdptr(md);
+ if (!hc || hc->md != md) {
+ DMWARN("device has been removed from the dev hash table.");
+ goto out;
+ }
+
+ table = hc->new_map;
+
+out:
+ up_read(&_hash_lock);
+
+ return table;
+}
+
+static struct dm_table *dm_get_live_or_inactive_table(struct mapped_device *md,
+ struct dm_ioctl *param,
+ int *srcu_idx)
+{
+ return (param->flags & DM_QUERY_INACTIVE_TABLE_FLAG) ?
+ dm_get_inactive_table(md, srcu_idx) : dm_get_live_table(md, srcu_idx);
+}
+
+/*
+ * Fills in a dm_ioctl structure, ready for sending back to
+ * userland.
+ */
+static void __dev_status(struct mapped_device *md, struct dm_ioctl *param)
+{
+ struct gendisk *disk = dm_disk(md);
+ struct dm_table *table;
+ int srcu_idx;
+
+ param->flags &= ~(DM_SUSPEND_FLAG | DM_READONLY_FLAG |
+ DM_ACTIVE_PRESENT_FLAG | DM_INTERNAL_SUSPEND_FLAG);
+
+ if (dm_suspended_md(md))
+ param->flags |= DM_SUSPEND_FLAG;
+
+ if (dm_suspended_internally_md(md))
+ param->flags |= DM_INTERNAL_SUSPEND_FLAG;
+
+ if (dm_test_deferred_remove_flag(md))
+ param->flags |= DM_DEFERRED_REMOVE;
+
+ param->dev = huge_encode_dev(disk_devt(disk));
+
+ /*
+ * Yes, this will be out of date by the time it gets back
+ * to userland, but it is still very useful for
+ * debugging.
+ */
+ param->open_count = dm_open_count(md);
+
+ param->event_nr = dm_get_event_nr(md);
+ param->target_count = 0;
+
+ table = dm_get_live_table(md, &srcu_idx);
+ if (table) {
+ if (!(param->flags & DM_QUERY_INACTIVE_TABLE_FLAG)) {
+ if (get_disk_ro(disk))
+ param->flags |= DM_READONLY_FLAG;
+ param->target_count = dm_table_get_num_targets(table);
+ }
+
+ param->flags |= DM_ACTIVE_PRESENT_FLAG;
+ }
+ dm_put_live_table(md, srcu_idx);
+
+ if (param->flags & DM_QUERY_INACTIVE_TABLE_FLAG) {
+ int srcu_idx;
+ table = dm_get_inactive_table(md, &srcu_idx);
+ if (table) {
+ if (!(dm_table_get_mode(table) & FMODE_WRITE))
+ param->flags |= DM_READONLY_FLAG;
+ param->target_count = dm_table_get_num_targets(table);
+ }
+ dm_put_live_table(md, srcu_idx);
+ }
+}
+
+static int dev_create(struct dm_ioctl *param, size_t param_size)
+{
+ int r, m = DM_ANY_MINOR;
+ struct mapped_device *md;
+
+ r = check_name(param->name);
+ if (r)
+ return r;
+
+ if (param->flags & DM_PERSISTENT_DEV_FLAG)
+ m = MINOR(huge_decode_dev(param->dev));
+
+ r = dm_create(m, &md);
+ if (r)
+ return r;
+
+ r = dm_hash_insert(param->name, *param->uuid ? param->uuid : NULL, md);
+ if (r) {
+ dm_put(md);
+ dm_destroy(md);
+ return r;
+ }
+
+ param->flags &= ~DM_INACTIVE_PRESENT_FLAG;
+
+ __dev_status(md, param);
+
+ dm_put(md);
+
+ return 0;
+}
+
+/*
+ * Always use UUID for lookups if it's present, otherwise use name or dev.
+ */
+static struct hash_cell *__find_device_hash_cell(struct dm_ioctl *param)
+{
+ struct hash_cell *hc = NULL;
+
+ if (*param->uuid) {
+ if (*param->name || param->dev)
+ return NULL;
+
+ hc = __get_uuid_cell(param->uuid);
+ if (!hc)
+ return NULL;
+ } else if (*param->name) {
+ if (param->dev)
+ return NULL;
+
+ hc = __get_name_cell(param->name);
+ if (!hc)
+ return NULL;
+ } else if (param->dev) {
+ hc = __get_dev_cell(param->dev);
+ if (!hc)
+ return NULL;
+ } else
+ return NULL;
+
+ /*
+ * Sneakily write in both the name and the uuid
+ * while we have the cell.
+ */
+ strlcpy(param->name, hc->name, sizeof(param->name));
+ if (hc->uuid)
+ strlcpy(param->uuid, hc->uuid, sizeof(param->uuid));
+ else
+ param->uuid[0] = '\0';
+
+ if (hc->new_map)
+ param->flags |= DM_INACTIVE_PRESENT_FLAG;
+ else
+ param->flags &= ~DM_INACTIVE_PRESENT_FLAG;
+
+ return hc;
+}
+
+static struct mapped_device *find_device(struct dm_ioctl *param)
+{
+ struct hash_cell *hc;
+ struct mapped_device *md = NULL;
+
+ down_read(&_hash_lock);
+ hc = __find_device_hash_cell(param);
+ if (hc)
+ md = hc->md;
+ up_read(&_hash_lock);
+
+ return md;
+}
+
+static int dev_remove(struct dm_ioctl *param, size_t param_size)
+{
+ struct hash_cell *hc;
+ struct mapped_device *md;
+ int r;
+ struct dm_table *t;
+
+ down_write(&_hash_lock);
+ hc = __find_device_hash_cell(param);
+
+ if (!hc) {
+ DMDEBUG_LIMIT("device doesn't appear to be in the dev hash table.");
+ up_write(&_hash_lock);
+ return -ENXIO;
+ }
+
+ md = hc->md;
+
+ /*
+ * Ensure the device is not open and nothing further can open it.
+ */
+ r = dm_lock_for_deletion(md, !!(param->flags & DM_DEFERRED_REMOVE), false);
+ if (r) {
+ if (r == -EBUSY && param->flags & DM_DEFERRED_REMOVE) {
+ up_write(&_hash_lock);
+ dm_put(md);
+ return 0;
+ }
+ DMDEBUG_LIMIT("unable to remove open device %s", hc->name);
+ up_write(&_hash_lock);
+ dm_put(md);
+ return r;
+ }
+
+ t = __hash_remove(hc);
+ up_write(&_hash_lock);
+
+ if (t) {
+ dm_sync_table(md);
+ dm_table_destroy(t);
+ }
+
+ param->flags &= ~DM_DEFERRED_REMOVE;
+
+ if (!dm_kobject_uevent(md, KOBJ_REMOVE, param->event_nr))
+ param->flags |= DM_UEVENT_GENERATED_FLAG;
+
+ dm_put(md);
+ dm_destroy(md);
+ return 0;
+}
+
+/*
+ * Check a string doesn't overrun the chunk of
+ * memory we copied from userland.
+ */
+static int invalid_str(char *str, void *end)
+{
+ while ((void *) str < end)
+ if (!*str++)
+ return 0;
+
+ return -EINVAL;
+}
+
+static int dev_rename(struct dm_ioctl *param, size_t param_size)
+{
+ int r;
+ char *new_data = (char *) param + param->data_start;
+ struct mapped_device *md;
+ unsigned change_uuid = (param->flags & DM_UUID_FLAG) ? 1 : 0;
+
+ if (new_data < param->data ||
+ invalid_str(new_data, (void *) param + param_size) || !*new_data ||
+ strlen(new_data) > (change_uuid ? DM_UUID_LEN - 1 : DM_NAME_LEN - 1)) {
+ DMWARN("Invalid new mapped device name or uuid string supplied.");
+ return -EINVAL;
+ }
+
+ if (!change_uuid) {
+ r = check_name(new_data);
+ if (r)
+ return r;
+ }
+
+ md = dm_hash_rename(param, new_data);
+ if (IS_ERR(md))
+ return PTR_ERR(md);
+
+ __dev_status(md, param);
+ dm_put(md);
+
+ return 0;
+}
+
+static int dev_set_geometry(struct dm_ioctl *param, size_t param_size)
+{
+ int r = -EINVAL, x;
+ struct mapped_device *md;
+ struct hd_geometry geometry;
+ unsigned long indata[4];
+ char *geostr = (char *) param + param->data_start;
+ char dummy;
+
+ md = find_device(param);
+ if (!md)
+ return -ENXIO;
+
+ if (geostr < param->data ||
+ invalid_str(geostr, (void *) param + param_size)) {
+ DMWARN("Invalid geometry supplied.");
+ goto out;
+ }
+
+ x = sscanf(geostr, "%lu %lu %lu %lu%c", indata,
+ indata + 1, indata + 2, indata + 3, &dummy);
+
+ if (x != 4) {
+ DMWARN("Unable to interpret geometry settings.");
+ goto out;
+ }
+
+ if (indata[0] > 65535 || indata[1] > 255 ||
+ indata[2] > 255 || indata[3] > ULONG_MAX) {
+ DMWARN("Geometry exceeds range limits.");
+ goto out;
+ }
+
+ geometry.cylinders = indata[0];
+ geometry.heads = indata[1];
+ geometry.sectors = indata[2];
+ geometry.start = indata[3];
+
+ r = dm_set_geometry(md, &geometry);
+
+ param->data_size = 0;
+
+out:
+ dm_put(md);
+ return r;
+}
+
+static int do_suspend(struct dm_ioctl *param)
+{
+ int r = 0;
+ unsigned suspend_flags = DM_SUSPEND_LOCKFS_FLAG;
+ struct mapped_device *md;
+
+ md = find_device(param);
+ if (!md)
+ return -ENXIO;
+
+ if (param->flags & DM_SKIP_LOCKFS_FLAG)
+ suspend_flags &= ~DM_SUSPEND_LOCKFS_FLAG;
+ if (param->flags & DM_NOFLUSH_FLAG)
+ suspend_flags |= DM_SUSPEND_NOFLUSH_FLAG;
+
+ if (!dm_suspended_md(md)) {
+ r = dm_suspend(md, suspend_flags);
+ if (r)
+ goto out;
+ }
+
+ __dev_status(md, param);
+
+out:
+ dm_put(md);
+
+ return r;
+}
+
+static int do_resume(struct dm_ioctl *param)
+{
+ int r = 0;
+ unsigned suspend_flags = DM_SUSPEND_LOCKFS_FLAG;
+ struct hash_cell *hc;
+ struct mapped_device *md;
+ struct dm_table *new_map, *old_map = NULL;
+
+ down_write(&_hash_lock);
+
+ hc = __find_device_hash_cell(param);
+ if (!hc) {
+ DMDEBUG_LIMIT("device doesn't appear to be in the dev hash table.");
+ up_write(&_hash_lock);
+ return -ENXIO;
+ }
+
+ md = hc->md;
+
+ new_map = hc->new_map;
+ hc->new_map = NULL;
+ param->flags &= ~DM_INACTIVE_PRESENT_FLAG;
+
+ up_write(&_hash_lock);
+
+ /* Do we need to load a new map ? */
+ if (new_map) {
+ /* Suspend if it isn't already suspended */
+ if (param->flags & DM_SKIP_LOCKFS_FLAG)
+ suspend_flags &= ~DM_SUSPEND_LOCKFS_FLAG;
+ if (param->flags & DM_NOFLUSH_FLAG)
+ suspend_flags |= DM_SUSPEND_NOFLUSH_FLAG;
+ if (!dm_suspended_md(md))
+ dm_suspend(md, suspend_flags);
+
+ old_map = dm_swap_table(md, new_map);
+ if (IS_ERR(old_map)) {
+ dm_sync_table(md);
+ dm_table_destroy(new_map);
+ dm_put(md);
+ return PTR_ERR(old_map);
+ }
+
+ if (dm_table_get_mode(new_map) & FMODE_WRITE)
+ set_disk_ro(dm_disk(md), 0);
+ else
+ set_disk_ro(dm_disk(md), 1);
+ }
+
+ if (dm_suspended_md(md)) {
+ r = dm_resume(md);
+ if (!r && !dm_kobject_uevent(md, KOBJ_CHANGE, param->event_nr))
+ param->flags |= DM_UEVENT_GENERATED_FLAG;
+ }
+
+ /*
+ * Since dm_swap_table synchronizes RCU, nobody should be in
+ * read-side critical section already.
+ */
+ if (old_map)
+ dm_table_destroy(old_map);
+
+ if (!r)
+ __dev_status(md, param);
+
+ dm_put(md);
+ return r;
+}
+
+/*
+ * Set or unset the suspension state of a device.
+ * If the device already is in the requested state we just return its status.
+ */
+static int dev_suspend(struct dm_ioctl *param, size_t param_size)
+{
+ if (param->flags & DM_SUSPEND_FLAG)
+ return do_suspend(param);
+
+ return do_resume(param);
+}
+
+/*
+ * Copies device info back to user space, used by
+ * the create and info ioctls.
+ */
+static int dev_status(struct dm_ioctl *param, size_t param_size)
+{
+ struct mapped_device *md;
+
+ md = find_device(param);
+ if (!md)
+ return -ENXIO;
+
+ __dev_status(md, param);
+ dm_put(md);
+
+ return 0;
+}
+
+/*
+ * Build up the status struct for each target
+ */
+static void retrieve_status(struct dm_table *table,
+ struct dm_ioctl *param, size_t param_size)
+{
+ unsigned int i, num_targets;
+ struct dm_target_spec *spec;
+ char *outbuf, *outptr;
+ status_type_t type;
+ size_t remaining, len, used = 0;
+ unsigned status_flags = 0;
+
+ outptr = outbuf = get_result_buffer(param, param_size, &len);
+
+ if (param->flags & DM_STATUS_TABLE_FLAG)
+ type = STATUSTYPE_TABLE;
+ else
+ type = STATUSTYPE_INFO;
+
+ /* Get all the target info */
+ num_targets = dm_table_get_num_targets(table);
+ for (i = 0; i < num_targets; i++) {
+ struct dm_target *ti = dm_table_get_target(table, i);
+ size_t l;
+
+ remaining = len - (outptr - outbuf);
+ if (remaining <= sizeof(struct dm_target_spec)) {
+ param->flags |= DM_BUFFER_FULL_FLAG;
+ break;
+ }
+
+ spec = (struct dm_target_spec *) outptr;
+
+ spec->status = 0;
+ spec->sector_start = ti->begin;
+ spec->length = ti->len;
+ strncpy(spec->target_type, ti->type->name,
+ sizeof(spec->target_type));
+
+ outptr += sizeof(struct dm_target_spec);
+ remaining = len - (outptr - outbuf);
+ if (remaining <= 0) {
+ param->flags |= DM_BUFFER_FULL_FLAG;
+ break;
+ }
+
+ /* Get the status/table string from the target driver */
+ if (ti->type->status) {
+ if (param->flags & DM_NOFLUSH_FLAG)
+ status_flags |= DM_STATUS_NOFLUSH_FLAG;
+ ti->type->status(ti, type, status_flags, outptr, remaining);
+ } else
+ outptr[0] = '\0';
+
+ l = strlen(outptr) + 1;
+ if (l == remaining) {
+ param->flags |= DM_BUFFER_FULL_FLAG;
+ break;
+ }
+
+ outptr += l;
+ used = param->data_start + (outptr - outbuf);
+
+ outptr = align_ptr(outptr);
+ spec->next = outptr - outbuf;
+ }
+
+ if (used)
+ param->data_size = used;
+
+ param->target_count = num_targets;
+}
+
+/*
+ * Wait for a device to report an event
+ */
+static int dev_wait(struct dm_ioctl *param, size_t param_size)
+{
+ int r = 0;
+ struct mapped_device *md;
+ struct dm_table *table;
+ int srcu_idx;
+
+ md = find_device(param);
+ if (!md)
+ return -ENXIO;
+
+ /*
+ * Wait for a notification event
+ */
+ if (dm_wait_event(md, param->event_nr)) {
+ r = -ERESTARTSYS;
+ goto out;
+ }
+
+ /*
+ * The userland program is going to want to know what
+ * changed to trigger the event, so we may as well tell
+ * him and save an ioctl.
+ */
+ __dev_status(md, param);
+
+ table = dm_get_live_or_inactive_table(md, param, &srcu_idx);
+ if (table)
+ retrieve_status(table, param, param_size);
+ dm_put_live_table(md, srcu_idx);
+
+out:
+ dm_put(md);
+
+ return r;
+}
+
+static inline fmode_t get_mode(struct dm_ioctl *param)
+{
+ fmode_t mode = FMODE_READ | FMODE_WRITE;
+
+ if (param->flags & DM_READONLY_FLAG)
+ mode = FMODE_READ;
+
+ return mode;
+}
+
+static int next_target(struct dm_target_spec *last, uint32_t next, void *end,
+ struct dm_target_spec **spec, char **target_params)
+{
+ *spec = (struct dm_target_spec *) ((unsigned char *) last + next);
+ *target_params = (char *) (*spec + 1);
+
+ if (*spec < (last + 1))
+ return -EINVAL;
+
+ return invalid_str(*target_params, end);
+}
+
+static int populate_table(struct dm_table *table,
+ struct dm_ioctl *param, size_t param_size)
+{
+ int r;
+ unsigned int i = 0;
+ struct dm_target_spec *spec = (struct dm_target_spec *) param;
+ uint32_t next = param->data_start;
+ void *end = (void *) param + param_size;
+ char *target_params;
+
+ if (!param->target_count) {
+ DMWARN("populate_table: no targets specified");
+ return -EINVAL;
+ }
+
+ for (i = 0; i < param->target_count; i++) {
+
+ r = next_target(spec, next, end, &spec, &target_params);
+ if (r) {
+ DMWARN("unable to find target");
+ return r;
+ }
+
+ r = dm_table_add_target(table, spec->target_type,
+ (sector_t) spec->sector_start,
+ (sector_t) spec->length,
+ target_params);
+ if (r) {
+ DMWARN("error adding target to table");
+ return r;
+ }
+
+ next = spec->next;
+ }
+
+ return dm_table_complete(table);
+}
+
+static int table_load(struct dm_ioctl *param, size_t param_size)
+{
+ int r;
+ struct hash_cell *hc;
+ struct dm_table *t, *old_map = NULL;
+ struct mapped_device *md;
+ struct target_type *immutable_target_type;
+
+ md = find_device(param);
+ if (!md)
+ return -ENXIO;
+
+ r = dm_table_create(&t, get_mode(param), param->target_count, md);
+ if (r)
+ goto err;
+
+ /* Protect md->type and md->queue against concurrent table loads. */
+ dm_lock_md_type(md);
+ r = populate_table(t, param, param_size);
+ if (r)
+ goto err_unlock_md_type;
+
+ immutable_target_type = dm_get_immutable_target_type(md);
+ if (immutable_target_type &&
+ (immutable_target_type != dm_table_get_immutable_target_type(t))) {
+ DMWARN("can't replace immutable target type %s",
+ immutable_target_type->name);
+ r = -EINVAL;
+ goto err_unlock_md_type;
+ }
+
+ if (dm_get_md_type(md) == DM_TYPE_NONE) {
+ /* Initial table load: acquire type of table. */
+ dm_set_md_type(md, dm_table_get_type(t));
+
+ /* setup md->queue to reflect md's type (may block) */
+ r = dm_setup_md_queue(md);
+ if (r) {
+ DMWARN("unable to set up device queue for new table.");
+ goto err_unlock_md_type;
+ }
+ } else if (dm_get_md_type(md) != dm_table_get_type(t)) {
+ DMWARN("can't change device type after initial table load.");
+ r = -EINVAL;
+ goto err_unlock_md_type;
+ }
+
+ dm_unlock_md_type(md);
+
+ /* stage inactive table */
+ down_write(&_hash_lock);
+ hc = dm_get_mdptr(md);
+ if (!hc || hc->md != md) {
+ DMWARN("device has been removed from the dev hash table.");
+ up_write(&_hash_lock);
+ r = -ENXIO;
+ goto err_destroy_table;
+ }
+
+ if (hc->new_map)
+ old_map = hc->new_map;
+ hc->new_map = t;
+ up_write(&_hash_lock);
+
+ param->flags |= DM_INACTIVE_PRESENT_FLAG;
+ __dev_status(md, param);
+
+ if (old_map) {
+ dm_sync_table(md);
+ dm_table_destroy(old_map);
+ }
+
+ dm_put(md);
+
+ return 0;
+
+err_unlock_md_type:
+ dm_unlock_md_type(md);
+err_destroy_table:
+ dm_table_destroy(t);
+err:
+ dm_put(md);
+
+ return r;
+}
+
+static int table_clear(struct dm_ioctl *param, size_t param_size)
+{
+ struct hash_cell *hc;
+ struct mapped_device *md;
+ struct dm_table *old_map = NULL;
+
+ down_write(&_hash_lock);
+
+ hc = __find_device_hash_cell(param);
+ if (!hc) {
+ DMDEBUG_LIMIT("device doesn't appear to be in the dev hash table.");
+ up_write(&_hash_lock);
+ return -ENXIO;
+ }
+
+ if (hc->new_map) {
+ old_map = hc->new_map;
+ hc->new_map = NULL;
+ }
+
+ param->flags &= ~DM_INACTIVE_PRESENT_FLAG;
+
+ __dev_status(hc->md, param);
+ md = hc->md;
+ up_write(&_hash_lock);
+ if (old_map) {
+ dm_sync_table(md);
+ dm_table_destroy(old_map);
+ }
+ dm_put(md);
+
+ return 0;
+}
+
+/*
+ * Retrieves a list of devices used by a particular dm device.
+ */
+static void retrieve_deps(struct dm_table *table,
+ struct dm_ioctl *param, size_t param_size)
+{
+ unsigned int count = 0;
+ struct list_head *tmp;
+ size_t len, needed;
+ struct dm_dev_internal *dd;
+ struct dm_target_deps *deps;
+
+ deps = get_result_buffer(param, param_size, &len);
+
+ /*
+ * Count the devices.
+ */
+ list_for_each (tmp, dm_table_get_devices(table))
+ count++;
+
+ /*
+ * Check we have enough space.
+ */
+ needed = sizeof(*deps) + (sizeof(*deps->dev) * count);
+ if (len < needed) {
+ param->flags |= DM_BUFFER_FULL_FLAG;
+ return;
+ }
+
+ /*
+ * Fill in the devices.
+ */
+ deps->count = count;
+ count = 0;
+ list_for_each_entry (dd, dm_table_get_devices(table), list)
+ deps->dev[count++] = huge_encode_dev(dd->dm_dev->bdev->bd_dev);
+
+ param->data_size = param->data_start + needed;
+}
+
+static int table_deps(struct dm_ioctl *param, size_t param_size)
+{
+ struct mapped_device *md;
+ struct dm_table *table;
+ int srcu_idx;
+
+ md = find_device(param);
+ if (!md)
+ return -ENXIO;
+
+ __dev_status(md, param);
+
+ table = dm_get_live_or_inactive_table(md, param, &srcu_idx);
+ if (table)
+ retrieve_deps(table, param, param_size);
+ dm_put_live_table(md, srcu_idx);
+
+ dm_put(md);
+
+ return 0;
+}
+
+/*
+ * Return the status of a device as a text string for each
+ * target.
+ */
+static int table_status(struct dm_ioctl *param, size_t param_size)
+{
+ struct mapped_device *md;
+ struct dm_table *table;
+ int srcu_idx;
+
+ md = find_device(param);
+ if (!md)
+ return -ENXIO;
+
+ __dev_status(md, param);
+
+ table = dm_get_live_or_inactive_table(md, param, &srcu_idx);
+ if (table)
+ retrieve_status(table, param, param_size);
+ dm_put_live_table(md, srcu_idx);
+
+ dm_put(md);
+
+ return 0;
+}
+
+/*
+ * Process device-mapper dependent messages. Messages prefixed with '@'
+ * are processed by the DM core. All others are delivered to the target.
+ * Returns a number <= 1 if message was processed by device mapper.
+ * Returns 2 if message should be delivered to the target.
+ */
+static int message_for_md(struct mapped_device *md, unsigned argc, char **argv,
+ char *result, unsigned maxlen)
+{
+ int r;
+
+ if (**argv != '@')
+ return 2; /* no '@' prefix, deliver to target */
+
+ if (!strcasecmp(argv[0], "@cancel_deferred_remove")) {
+ if (argc != 1) {
+ DMERR("Invalid arguments for @cancel_deferred_remove");
+ return -EINVAL;
+ }
+ return dm_cancel_deferred_remove(md);
+ }
+
+ r = dm_stats_message(md, argc, argv, result, maxlen);
+ if (r < 2)
+ return r;
+
+ DMERR("Unsupported message sent to DM core: %s", argv[0]);
+ return -EINVAL;
+}
+
+/*
+ * Pass a message to the target that's at the supplied device offset.
+ */
+static int target_message(struct dm_ioctl *param, size_t param_size)
+{
+ int r, argc;
+ char **argv;
+ struct mapped_device *md;
+ struct dm_table *table;
+ struct dm_target *ti;
+ struct dm_target_msg *tmsg = (void *) param + param->data_start;
+ size_t maxlen;
+ char *result = get_result_buffer(param, param_size, &maxlen);
+ int srcu_idx;
+
+ md = find_device(param);
+ if (!md)
+ return -ENXIO;
+
+ if (tmsg < (struct dm_target_msg *) param->data ||
+ invalid_str(tmsg->message, (void *) param + param_size)) {
+ DMWARN("Invalid target message parameters.");
+ r = -EINVAL;
+ goto out;
+ }
+
+ r = dm_split_args(&argc, &argv, tmsg->message);
+ if (r) {
+ DMWARN("Failed to split target message parameters");
+ goto out;
+ }
+
+ if (!argc) {
+ DMWARN("Empty message received.");
+ goto out_argv;
+ }
+
+ r = message_for_md(md, argc, argv, result, maxlen);
+ if (r <= 1)
+ goto out_argv;
+
+ table = dm_get_live_table(md, &srcu_idx);
+ if (!table)
+ goto out_table;
+
+ if (dm_deleting_md(md)) {
+ r = -ENXIO;
+ goto out_table;
+ }
+
+ ti = dm_table_find_target(table, tmsg->sector);
+ if (!dm_target_is_valid(ti)) {
+ DMWARN("Target message sector outside device.");
+ r = -EINVAL;
+ } else if (ti->type->message)
+ r = ti->type->message(ti, argc, argv);
+ else {
+ DMWARN("Target type does not support messages");
+ r = -EINVAL;
+ }
+
+ out_table:
+ dm_put_live_table(md, srcu_idx);
+ out_argv:
+ kfree(argv);
+ out:
+ if (r >= 0)
+ __dev_status(md, param);
+
+ if (r == 1) {
+ param->flags |= DM_DATA_OUT_FLAG;
+ if (dm_message_test_buffer_overflow(result, maxlen))
+ param->flags |= DM_BUFFER_FULL_FLAG;
+ else
+ param->data_size = param->data_start + strlen(result) + 1;
+ r = 0;
+ }
+
+ dm_put(md);
+ return r;
+}
+
+/*
+ * The ioctl parameter block consists of two parts, a dm_ioctl struct
+ * followed by a data buffer. This flag is set if the second part,
+ * which has a variable size, is not used by the function processing
+ * the ioctl.
+ */
+#define IOCTL_FLAGS_NO_PARAMS 1
+
+/*-----------------------------------------------------------------
+ * Implementation of open/close/ioctl on the special char
+ * device.
+ *---------------------------------------------------------------*/
+static ioctl_fn lookup_ioctl(unsigned int cmd, int *ioctl_flags)
+{
+ static struct {
+ int cmd;
+ int flags;
+ ioctl_fn fn;
+ } _ioctls[] = {
+ {DM_VERSION_CMD, 0, NULL}, /* version is dealt with elsewhere */
+ {DM_REMOVE_ALL_CMD, IOCTL_FLAGS_NO_PARAMS, remove_all},
+ {DM_LIST_DEVICES_CMD, 0, list_devices},
+
+ {DM_DEV_CREATE_CMD, IOCTL_FLAGS_NO_PARAMS, dev_create},
+ {DM_DEV_REMOVE_CMD, IOCTL_FLAGS_NO_PARAMS, dev_remove},
+ {DM_DEV_RENAME_CMD, 0, dev_rename},
+ {DM_DEV_SUSPEND_CMD, IOCTL_FLAGS_NO_PARAMS, dev_suspend},
+ {DM_DEV_STATUS_CMD, IOCTL_FLAGS_NO_PARAMS, dev_status},
+ {DM_DEV_WAIT_CMD, 0, dev_wait},
+
+ {DM_TABLE_LOAD_CMD, 0, table_load},
+ {DM_TABLE_CLEAR_CMD, IOCTL_FLAGS_NO_PARAMS, table_clear},
+ {DM_TABLE_DEPS_CMD, 0, table_deps},
+ {DM_TABLE_STATUS_CMD, 0, table_status},
+
+ {DM_LIST_VERSIONS_CMD, 0, list_versions},
+
+ {DM_TARGET_MSG_CMD, 0, target_message},
+ {DM_DEV_SET_GEOMETRY_CMD, 0, dev_set_geometry}
+ };
+
+ if (unlikely(cmd >= ARRAY_SIZE(_ioctls)))
+ return NULL;
+
+ *ioctl_flags = _ioctls[cmd].flags;
+ return _ioctls[cmd].fn;
+}
+
+/*
+ * As well as checking the version compatibility this always
+ * copies the kernel interface version out.
+ */
+static int check_version(unsigned int cmd, struct dm_ioctl __user *user)
+{
+ uint32_t version[3];
+ int r = 0;
+
+ if (copy_from_user(version, user->version, sizeof(version)))
+ return -EFAULT;
+
+ if ((DM_VERSION_MAJOR != version[0]) ||
+ (DM_VERSION_MINOR < version[1])) {
+ DMWARN("ioctl interface mismatch: "
+ "kernel(%u.%u.%u), user(%u.%u.%u), cmd(%d)",
+ DM_VERSION_MAJOR, DM_VERSION_MINOR,
+ DM_VERSION_PATCHLEVEL,
+ version[0], version[1], version[2], cmd);
+ r = -EINVAL;
+ }
+
+ /*
+ * Fill in the kernel version.
+ */
+ version[0] = DM_VERSION_MAJOR;
+ version[1] = DM_VERSION_MINOR;
+ version[2] = DM_VERSION_PATCHLEVEL;
+ if (copy_to_user(user->version, version, sizeof(version)))
+ return -EFAULT;
+
+ return r;
+}
+
+#define DM_PARAMS_KMALLOC 0x0001 /* Params alloced with kmalloc */
+#define DM_PARAMS_VMALLOC 0x0002 /* Params alloced with vmalloc */
+#define DM_WIPE_BUFFER 0x0010 /* Wipe input buffer before returning from ioctl */
+
+static void free_params(struct dm_ioctl *param, size_t param_size, int param_flags)
+{
+ if (param_flags & DM_WIPE_BUFFER)
+ memset(param, 0, param_size);
+
+ if (param_flags & DM_PARAMS_KMALLOC)
+ kfree(param);
+ if (param_flags & DM_PARAMS_VMALLOC)
+ vfree(param);
+}
+
+static int copy_params(struct dm_ioctl __user *user, struct dm_ioctl *param_kernel,
+ int ioctl_flags,
+ struct dm_ioctl **param, int *param_flags)
+{
+ struct dm_ioctl *dmi;
+ int secure_data;
+ const size_t minimum_data_size = sizeof(*param_kernel) - sizeof(param_kernel->data);
+
+ if (copy_from_user(param_kernel, user, minimum_data_size))
+ return -EFAULT;
+
+ if (param_kernel->data_size < minimum_data_size)
+ return -EINVAL;
+
+ secure_data = param_kernel->flags & DM_SECURE_DATA_FLAG;
+
+ *param_flags = secure_data ? DM_WIPE_BUFFER : 0;
+
+ if (ioctl_flags & IOCTL_FLAGS_NO_PARAMS) {
+ dmi = param_kernel;
+ dmi->data_size = minimum_data_size;
+ goto data_copied;
+ }
+
+ /*
+ * Try to avoid low memory issues when a device is suspended.
+ * Use kmalloc() rather than vmalloc() when we can.
+ */
+ dmi = NULL;
+ if (param_kernel->data_size <= KMALLOC_MAX_SIZE) {
+ dmi = kmalloc(param_kernel->data_size, GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
+ if (dmi)
+ *param_flags |= DM_PARAMS_KMALLOC;
+ }
+
+ if (!dmi) {
+ unsigned noio_flag;
+ noio_flag = memalloc_noio_save();
+ dmi = __vmalloc(param_kernel->data_size, GFP_NOIO | __GFP_REPEAT | __GFP_HIGH | __GFP_HIGHMEM, PAGE_KERNEL);
+ memalloc_noio_restore(noio_flag);
+ if (dmi)
+ *param_flags |= DM_PARAMS_VMALLOC;
+ }
+
+ if (!dmi) {
+ if (secure_data && clear_user(user, param_kernel->data_size))
+ return -EFAULT;
+ return -ENOMEM;
+ }
+
+ if (copy_from_user(dmi, user, param_kernel->data_size))
+ goto bad;
+
+data_copied:
+ /*
+ * Abort if something changed the ioctl data while it was being copied.
+ */
+ if (dmi->data_size != param_kernel->data_size) {
+ DMERR("rejecting ioctl: data size modified while processing parameters");
+ goto bad;
+ }
+
+ /* Wipe the user buffer so we do not return it to userspace */
+ if (secure_data && clear_user(user, param_kernel->data_size))
+ goto bad;
+
+ *param = dmi;
+ return 0;
+
+bad:
+ free_params(dmi, param_kernel->data_size, *param_flags);
+
+ return -EFAULT;
+}
+
+static int validate_params(uint cmd, struct dm_ioctl *param)
+{
+ /* Always clear this flag */
+ param->flags &= ~DM_BUFFER_FULL_FLAG;
+ param->flags &= ~DM_UEVENT_GENERATED_FLAG;
+ param->flags &= ~DM_SECURE_DATA_FLAG;
+ param->flags &= ~DM_DATA_OUT_FLAG;
+
+ /* Ignores parameters */
+ if (cmd == DM_REMOVE_ALL_CMD ||
+ cmd == DM_LIST_DEVICES_CMD ||
+ cmd == DM_LIST_VERSIONS_CMD)
+ return 0;
+
+ if ((cmd == DM_DEV_CREATE_CMD)) {
+ if (!*param->name) {
+ DMWARN("name not supplied when creating device");
+ return -EINVAL;
+ }
+ } else if ((*param->uuid && *param->name)) {
+ DMWARN("only supply one of name or uuid, cmd(%u)", cmd);
+ return -EINVAL;
+ }
+
+ /* Ensure strings are terminated */
+ param->name[DM_NAME_LEN - 1] = '\0';
+ param->uuid[DM_UUID_LEN - 1] = '\0';
+
+ return 0;
+}
+
+static int ctl_ioctl(uint command, struct dm_ioctl __user *user)
+{
+ int r = 0;
+ int ioctl_flags;
+ int param_flags;
+ unsigned int cmd;
+ struct dm_ioctl *uninitialized_var(param);
+ ioctl_fn fn = NULL;
+ size_t input_param_size;
+ struct dm_ioctl param_kernel;
+
+ /* only root can play with this */
+ if (!capable(CAP_SYS_ADMIN))
+ return -EACCES;
+
+ if (_IOC_TYPE(command) != DM_IOCTL)
+ return -ENOTTY;
+
+ cmd = _IOC_NR(command);
+
+ /*
+ * Check the interface version passed in. This also
+ * writes out the kernel's interface version.
+ */
+ r = check_version(cmd, user);
+ if (r)
+ return r;
+
+ /*
+ * Nothing more to do for the version command.
+ */
+ if (cmd == DM_VERSION_CMD)
+ return 0;
+
+ fn = lookup_ioctl(cmd, &ioctl_flags);
+ if (!fn) {
+ DMWARN("dm_ctl_ioctl: unknown command 0x%x", command);
+ return -ENOTTY;
+ }
+
+ /*
+ * Copy the parameters into kernel space.
+ */
+ r = copy_params(user, &param_kernel, ioctl_flags, &param, &param_flags);
+
+ if (r)
+ return r;
+
+ input_param_size = param->data_size;
+ r = validate_params(cmd, param);
+ if (r)
+ goto out;
+
+ param->data_size = sizeof(*param);
+ r = fn(param, input_param_size);
+
+ if (unlikely(param->flags & DM_BUFFER_FULL_FLAG) &&
+ unlikely(ioctl_flags & IOCTL_FLAGS_NO_PARAMS))
+ DMERR("ioctl %d tried to output some data but has IOCTL_FLAGS_NO_PARAMS set", cmd);
+
+ /*
+ * Copy the results back to userland.
+ */
+ if (!r && copy_to_user(user, param, param->data_size))
+ r = -EFAULT;
+
+out:
+ free_params(param, input_param_size, param_flags);
+ return r;
+}
+
+static long dm_ctl_ioctl(struct file *file, uint command, ulong u)
+{
+ return (long)ctl_ioctl(command, (struct dm_ioctl __user *)u);
+}
+
+#ifdef CONFIG_COMPAT
+static long dm_compat_ctl_ioctl(struct file *file, uint command, ulong u)
+{
+ return (long)dm_ctl_ioctl(file, command, (ulong) compat_ptr(u));
+}
+#else
+#define dm_compat_ctl_ioctl NULL
+#endif
+
+static const struct file_operations _ctl_fops = {
+ .open = nonseekable_open,
+ .unlocked_ioctl = dm_ctl_ioctl,
+ .compat_ioctl = dm_compat_ctl_ioctl,
+ .owner = THIS_MODULE,
+ .llseek = noop_llseek,
+};
+
+static struct miscdevice _dm_misc = {
+ .minor = MAPPER_CTRL_MINOR,
+ .name = DM_NAME,
+ .nodename = DM_DIR "/" DM_CONTROL_NODE,
+ .fops = &_ctl_fops
+};
+
+MODULE_ALIAS_MISCDEV(MAPPER_CTRL_MINOR);
+MODULE_ALIAS("devname:" DM_DIR "/" DM_CONTROL_NODE);
+
+/*
+ * Create misc character device and link to DM_DIR/control.
+ */
+int __init dm_interface_init(void)
+{
+ int r;
+
+ r = dm_hash_init();
+ if (r)
+ return r;
+
+ r = misc_register(&_dm_misc);
+ if (r) {
+ DMERR("misc_register failed for control device");
+ dm_hash_exit();
+ return r;
+ }
+
+ DMINFO("%d.%d.%d%s initialised: %s", DM_VERSION_MAJOR,
+ DM_VERSION_MINOR, DM_VERSION_PATCHLEVEL, DM_VERSION_EXTRA,
+ DM_DRIVER_EMAIL);
+ return 0;
+}
+
+void dm_interface_exit(void)
+{
+ if (misc_deregister(&_dm_misc) < 0)
+ DMERR("misc_deregister failed for control device");
+
+ dm_hash_exit();
+}
+
+/**
+ * dm_copy_name_and_uuid - Copy mapped device name & uuid into supplied buffers
+ * @md: Pointer to mapped_device
+ * @name: Buffer (size DM_NAME_LEN) for name
+ * @uuid: Buffer (size DM_UUID_LEN) for uuid or empty string if uuid not defined
+ */
+int dm_copy_name_and_uuid(struct mapped_device *md, char *name, char *uuid)
+{
+ int r = 0;
+ struct hash_cell *hc;
+
+ if (!md)
+ return -ENXIO;
+
+ mutex_lock(&dm_hash_cells_mutex);
+ hc = dm_get_mdptr(md);
+ if (!hc || hc->md != md) {
+ r = -ENXIO;
+ goto out;
+ }
+
+ if (name)
+ strcpy(name, hc->name);
+ if (uuid)
+ strcpy(uuid, hc->uuid ? : "");
+
+out:
+ mutex_unlock(&dm_hash_cells_mutex);
+
+ return r;
+}
diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c
new file mode 100644
index 000000000..3a7cade5e
--- /dev/null
+++ b/drivers/md/dm-kcopyd.c
@@ -0,0 +1,884 @@
+/*
+ * Copyright (C) 2002 Sistina Software (UK) Limited.
+ * Copyright (C) 2006 Red Hat GmbH
+ *
+ * This file is released under the GPL.
+ *
+ * Kcopyd provides a simple interface for copying an area of one
+ * block-device to one or more other block-devices, with an asynchronous
+ * completion notification.
+ */
+
+#include <linux/types.h>
+#include <linux/atomic.h>
+#include <linux/blkdev.h>
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/mempool.h>
+#include <linux/module.h>
+#include <linux/pagemap.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/workqueue.h>
+#include <linux/mutex.h>
+#include <linux/delay.h>
+#include <linux/device-mapper.h>
+#include <linux/dm-kcopyd.h>
+
+#include "dm.h"
+
+#define SUB_JOB_SIZE 128
+#define SPLIT_COUNT 8
+#define MIN_JOBS 8
+#define RESERVE_PAGES (DIV_ROUND_UP(SUB_JOB_SIZE << SECTOR_SHIFT, PAGE_SIZE))
+
+/*-----------------------------------------------------------------
+ * Each kcopyd client has its own little pool of preallocated
+ * pages for kcopyd io.
+ *---------------------------------------------------------------*/
+struct dm_kcopyd_client {
+ struct page_list *pages;
+ unsigned nr_reserved_pages;
+ unsigned nr_free_pages;
+
+ struct dm_io_client *io_client;
+
+ wait_queue_head_t destroyq;
+ atomic_t nr_jobs;
+
+ mempool_t *job_pool;
+
+ struct workqueue_struct *kcopyd_wq;
+ struct work_struct kcopyd_work;
+
+ struct dm_kcopyd_throttle *throttle;
+
+/*
+ * We maintain three lists of jobs:
+ *
+ * i) jobs waiting for pages
+ * ii) jobs that have pages, and are waiting for the io to be issued.
+ * iii) jobs that have completed.
+ *
+ * All three of these are protected by job_lock.
+ */
+ spinlock_t job_lock;
+ struct list_head complete_jobs;
+ struct list_head io_jobs;
+ struct list_head pages_jobs;
+};
+
+static struct page_list zero_page_list;
+
+static DEFINE_SPINLOCK(throttle_spinlock);
+
+/*
+ * IO/IDLE accounting slowly decays after (1 << ACCOUNT_INTERVAL_SHIFT) period.
+ * When total_period >= (1 << ACCOUNT_INTERVAL_SHIFT) the counters are divided
+ * by 2.
+ */
+#define ACCOUNT_INTERVAL_SHIFT SHIFT_HZ
+
+/*
+ * Sleep this number of milliseconds.
+ *
+ * The value was decided experimentally.
+ * Smaller values seem to cause an increased copy rate above the limit.
+ * The reason for this is unknown but possibly due to jiffies rounding errors
+ * or read/write cache inside the disk.
+ */
+#define SLEEP_MSEC 100
+
+/*
+ * Maximum number of sleep events. There is a theoretical livelock if more
+ * kcopyd clients do work simultaneously which this limit avoids.
+ */
+#define MAX_SLEEPS 10
+
+static void io_job_start(struct dm_kcopyd_throttle *t)
+{
+ unsigned throttle, now, difference;
+ int slept = 0, skew;
+
+ if (unlikely(!t))
+ return;
+
+try_again:
+ spin_lock_irq(&throttle_spinlock);
+
+ throttle = ACCESS_ONCE(t->throttle);
+
+ if (likely(throttle >= 100))
+ goto skip_limit;
+
+ now = jiffies;
+ difference = now - t->last_jiffies;
+ t->last_jiffies = now;
+ if (t->num_io_jobs)
+ t->io_period += difference;
+ t->total_period += difference;
+
+ /*
+ * Maintain sane values if we got a temporary overflow.
+ */
+ if (unlikely(t->io_period > t->total_period))
+ t->io_period = t->total_period;
+
+ if (unlikely(t->total_period >= (1 << ACCOUNT_INTERVAL_SHIFT))) {
+ int shift = fls(t->total_period >> ACCOUNT_INTERVAL_SHIFT);
+ t->total_period >>= shift;
+ t->io_period >>= shift;
+ }
+
+ skew = t->io_period - throttle * t->total_period / 100;
+
+ if (unlikely(skew > 0) && slept < MAX_SLEEPS) {
+ slept++;
+ spin_unlock_irq(&throttle_spinlock);
+ msleep(SLEEP_MSEC);
+ goto try_again;
+ }
+
+skip_limit:
+ t->num_io_jobs++;
+
+ spin_unlock_irq(&throttle_spinlock);
+}
+
+static void io_job_finish(struct dm_kcopyd_throttle *t)
+{
+ unsigned long flags;
+
+ if (unlikely(!t))
+ return;
+
+ spin_lock_irqsave(&throttle_spinlock, flags);
+
+ t->num_io_jobs--;
+
+ if (likely(ACCESS_ONCE(t->throttle) >= 100))
+ goto skip_limit;
+
+ if (!t->num_io_jobs) {
+ unsigned now, difference;
+
+ now = jiffies;
+ difference = now - t->last_jiffies;
+ t->last_jiffies = now;
+
+ t->io_period += difference;
+ t->total_period += difference;
+
+ /*
+ * Maintain sane values if we got a temporary overflow.
+ */
+ if (unlikely(t->io_period > t->total_period))
+ t->io_period = t->total_period;
+ }
+
+skip_limit:
+ spin_unlock_irqrestore(&throttle_spinlock, flags);
+}
+
+
+static void wake(struct dm_kcopyd_client *kc)
+{
+ queue_work(kc->kcopyd_wq, &kc->kcopyd_work);
+}
+
+/*
+ * Obtain one page for the use of kcopyd.
+ */
+static struct page_list *alloc_pl(gfp_t gfp)
+{
+ struct page_list *pl;
+
+ pl = kmalloc(sizeof(*pl), gfp);
+ if (!pl)
+ return NULL;
+
+ pl->page = alloc_page(gfp);
+ if (!pl->page) {
+ kfree(pl);
+ return NULL;
+ }
+
+ return pl;
+}
+
+static void free_pl(struct page_list *pl)
+{
+ __free_page(pl->page);
+ kfree(pl);
+}
+
+/*
+ * Add the provided pages to a client's free page list, releasing
+ * back to the system any beyond the reserved_pages limit.
+ */
+static void kcopyd_put_pages(struct dm_kcopyd_client *kc, struct page_list *pl)
+{
+ struct page_list *next;
+
+ do {
+ next = pl->next;
+
+ if (kc->nr_free_pages >= kc->nr_reserved_pages)
+ free_pl(pl);
+ else {
+ pl->next = kc->pages;
+ kc->pages = pl;
+ kc->nr_free_pages++;
+ }
+
+ pl = next;
+ } while (pl);
+}
+
+static int kcopyd_get_pages(struct dm_kcopyd_client *kc,
+ unsigned int nr, struct page_list **pages)
+{
+ struct page_list *pl;
+
+ *pages = NULL;
+
+ do {
+ pl = alloc_pl(__GFP_NOWARN | __GFP_NORETRY);
+ if (unlikely(!pl)) {
+ /* Use reserved pages */
+ pl = kc->pages;
+ if (unlikely(!pl))
+ goto out_of_memory;
+ kc->pages = pl->next;
+ kc->nr_free_pages--;
+ }
+ pl->next = *pages;
+ *pages = pl;
+ } while (--nr);
+
+ return 0;
+
+out_of_memory:
+ if (*pages)
+ kcopyd_put_pages(kc, *pages);
+ return -ENOMEM;
+}
+
+/*
+ * These three functions resize the page pool.
+ */
+static void drop_pages(struct page_list *pl)
+{
+ struct page_list *next;
+
+ while (pl) {
+ next = pl->next;
+ free_pl(pl);
+ pl = next;
+ }
+}
+
+/*
+ * Allocate and reserve nr_pages for the use of a specific client.
+ */
+static int client_reserve_pages(struct dm_kcopyd_client *kc, unsigned nr_pages)
+{
+ unsigned i;
+ struct page_list *pl = NULL, *next;
+
+ for (i = 0; i < nr_pages; i++) {
+ next = alloc_pl(GFP_KERNEL);
+ if (!next) {
+ if (pl)
+ drop_pages(pl);
+ return -ENOMEM;
+ }
+ next->next = pl;
+ pl = next;
+ }
+
+ kc->nr_reserved_pages += nr_pages;
+ kcopyd_put_pages(kc, pl);
+
+ return 0;
+}
+
+static void client_free_pages(struct dm_kcopyd_client *kc)
+{
+ BUG_ON(kc->nr_free_pages != kc->nr_reserved_pages);
+ drop_pages(kc->pages);
+ kc->pages = NULL;
+ kc->nr_free_pages = kc->nr_reserved_pages = 0;
+}
+
+/*-----------------------------------------------------------------
+ * kcopyd_jobs need to be allocated by the *clients* of kcopyd,
+ * for this reason we use a mempool to prevent the client from
+ * ever having to do io (which could cause a deadlock).
+ *---------------------------------------------------------------*/
+struct kcopyd_job {
+ struct dm_kcopyd_client *kc;
+ struct list_head list;
+ unsigned long flags;
+
+ /*
+ * Error state of the job.
+ */
+ int read_err;
+ unsigned long write_err;
+
+ /*
+ * Either READ or WRITE
+ */
+ int rw;
+ struct dm_io_region source;
+
+ /*
+ * The destinations for the transfer.
+ */
+ unsigned int num_dests;
+ struct dm_io_region dests[DM_KCOPYD_MAX_REGIONS];
+
+ struct page_list *pages;
+
+ /*
+ * Set this to ensure you are notified when the job has
+ * completed. 'context' is for callback to use.
+ */
+ dm_kcopyd_notify_fn fn;
+ void *context;
+
+ /*
+ * These fields are only used if the job has been split
+ * into more manageable parts.
+ */
+ struct mutex lock;
+ atomic_t sub_jobs;
+ sector_t progress;
+
+ struct kcopyd_job *master_job;
+};
+
+static struct kmem_cache *_job_cache;
+
+int __init dm_kcopyd_init(void)
+{
+ _job_cache = kmem_cache_create("kcopyd_job",
+ sizeof(struct kcopyd_job) * (SPLIT_COUNT + 1),
+ __alignof__(struct kcopyd_job), 0, NULL);
+ if (!_job_cache)
+ return -ENOMEM;
+
+ zero_page_list.next = &zero_page_list;
+ zero_page_list.page = ZERO_PAGE(0);
+
+ return 0;
+}
+
+void dm_kcopyd_exit(void)
+{
+ kmem_cache_destroy(_job_cache);
+ _job_cache = NULL;
+}
+
+/*
+ * Functions to push and pop a job onto the head of a given job
+ * list.
+ */
+static struct kcopyd_job *pop(struct list_head *jobs,
+ struct dm_kcopyd_client *kc)
+{
+ struct kcopyd_job *job = NULL;
+ unsigned long flags;
+
+ spin_lock_irqsave(&kc->job_lock, flags);
+
+ if (!list_empty(jobs)) {
+ job = list_entry(jobs->next, struct kcopyd_job, list);
+ list_del(&job->list);
+ }
+ spin_unlock_irqrestore(&kc->job_lock, flags);
+
+ return job;
+}
+
+static void push(struct list_head *jobs, struct kcopyd_job *job)
+{
+ unsigned long flags;
+ struct dm_kcopyd_client *kc = job->kc;
+
+ spin_lock_irqsave(&kc->job_lock, flags);
+ list_add_tail(&job->list, jobs);
+ spin_unlock_irqrestore(&kc->job_lock, flags);
+}
+
+
+static void push_head(struct list_head *jobs, struct kcopyd_job *job)
+{
+ unsigned long flags;
+ struct dm_kcopyd_client *kc = job->kc;
+
+ spin_lock_irqsave(&kc->job_lock, flags);
+ list_add(&job->list, jobs);
+ spin_unlock_irqrestore(&kc->job_lock, flags);
+}
+
+/*
+ * These three functions process 1 item from the corresponding
+ * job list.
+ *
+ * They return:
+ * < 0: error
+ * 0: success
+ * > 0: can't process yet.
+ */
+static int run_complete_job(struct kcopyd_job *job)
+{
+ void *context = job->context;
+ int read_err = job->read_err;
+ unsigned long write_err = job->write_err;
+ dm_kcopyd_notify_fn fn = job->fn;
+ struct dm_kcopyd_client *kc = job->kc;
+
+ if (job->pages && job->pages != &zero_page_list)
+ kcopyd_put_pages(kc, job->pages);
+ /*
+ * If this is the master job, the sub jobs have already
+ * completed so we can free everything.
+ */
+ if (job->master_job == job)
+ mempool_free(job, kc->job_pool);
+ fn(read_err, write_err, context);
+
+ if (atomic_dec_and_test(&kc->nr_jobs))
+ wake_up(&kc->destroyq);
+
+ return 0;
+}
+
+static void complete_io(unsigned long error, void *context)
+{
+ struct kcopyd_job *job = (struct kcopyd_job *) context;
+ struct dm_kcopyd_client *kc = job->kc;
+
+ io_job_finish(kc->throttle);
+
+ if (error) {
+ if (job->rw & WRITE)
+ job->write_err |= error;
+ else
+ job->read_err = 1;
+
+ if (!test_bit(DM_KCOPYD_IGNORE_ERROR, &job->flags)) {
+ push(&kc->complete_jobs, job);
+ wake(kc);
+ return;
+ }
+ }
+
+ if (job->rw & WRITE)
+ push(&kc->complete_jobs, job);
+
+ else {
+ job->rw = WRITE;
+ push(&kc->io_jobs, job);
+ }
+
+ wake(kc);
+}
+
+/*
+ * Request io on as many buffer heads as we can currently get for
+ * a particular job.
+ */
+static int run_io_job(struct kcopyd_job *job)
+{
+ int r;
+ struct dm_io_request io_req = {
+ .bi_rw = job->rw,
+ .mem.type = DM_IO_PAGE_LIST,
+ .mem.ptr.pl = job->pages,
+ .mem.offset = 0,
+ .notify.fn = complete_io,
+ .notify.context = job,
+ .client = job->kc->io_client,
+ };
+
+ io_job_start(job->kc->throttle);
+
+ if (job->rw == READ)
+ r = dm_io(&io_req, 1, &job->source, NULL);
+ else
+ r = dm_io(&io_req, job->num_dests, job->dests, NULL);
+
+ return r;
+}
+
+static int run_pages_job(struct kcopyd_job *job)
+{
+ int r;
+ unsigned nr_pages = dm_div_up(job->dests[0].count, PAGE_SIZE >> 9);
+
+ r = kcopyd_get_pages(job->kc, nr_pages, &job->pages);
+ if (!r) {
+ /* this job is ready for io */
+ push(&job->kc->io_jobs, job);
+ return 0;
+ }
+
+ if (r == -ENOMEM)
+ /* can't complete now */
+ return 1;
+
+ return r;
+}
+
+/*
+ * Run through a list for as long as possible. Returns the count
+ * of successful jobs.
+ */
+static int process_jobs(struct list_head *jobs, struct dm_kcopyd_client *kc,
+ int (*fn) (struct kcopyd_job *))
+{
+ struct kcopyd_job *job;
+ int r, count = 0;
+
+ while ((job = pop(jobs, kc))) {
+
+ r = fn(job);
+
+ if (r < 0) {
+ /* error this rogue job */
+ if (job->rw & WRITE)
+ job->write_err = (unsigned long) -1L;
+ else
+ job->read_err = 1;
+ push(&kc->complete_jobs, job);
+ break;
+ }
+
+ if (r > 0) {
+ /*
+ * We couldn't service this job ATM, so
+ * push this job back onto the list.
+ */
+ push_head(jobs, job);
+ break;
+ }
+
+ count++;
+ }
+
+ return count;
+}
+
+/*
+ * kcopyd does this every time it's woken up.
+ */
+static void do_work(struct work_struct *work)
+{
+ struct dm_kcopyd_client *kc = container_of(work,
+ struct dm_kcopyd_client, kcopyd_work);
+ struct blk_plug plug;
+
+ /*
+ * The order that these are called is *very* important.
+ * complete jobs can free some pages for pages jobs.
+ * Pages jobs when successful will jump onto the io jobs
+ * list. io jobs call wake when they complete and it all
+ * starts again.
+ */
+ blk_start_plug(&plug);
+ process_jobs(&kc->complete_jobs, kc, run_complete_job);
+ process_jobs(&kc->pages_jobs, kc, run_pages_job);
+ process_jobs(&kc->io_jobs, kc, run_io_job);
+ blk_finish_plug(&plug);
+}
+
+/*
+ * If we are copying a small region we just dispatch a single job
+ * to do the copy, otherwise the io has to be split up into many
+ * jobs.
+ */
+static void dispatch_job(struct kcopyd_job *job)
+{
+ struct dm_kcopyd_client *kc = job->kc;
+ atomic_inc(&kc->nr_jobs);
+ if (unlikely(!job->source.count))
+ push(&kc->complete_jobs, job);
+ else if (job->pages == &zero_page_list)
+ push(&kc->io_jobs, job);
+ else
+ push(&kc->pages_jobs, job);
+ wake(kc);
+}
+
+static void segment_complete(int read_err, unsigned long write_err,
+ void *context)
+{
+ /* FIXME: tidy this function */
+ sector_t progress = 0;
+ sector_t count = 0;
+ struct kcopyd_job *sub_job = (struct kcopyd_job *) context;
+ struct kcopyd_job *job = sub_job->master_job;
+ struct dm_kcopyd_client *kc = job->kc;
+
+ mutex_lock(&job->lock);
+
+ /* update the error */
+ if (read_err)
+ job->read_err = 1;
+
+ if (write_err)
+ job->write_err |= write_err;
+
+ /*
+ * Only dispatch more work if there hasn't been an error.
+ */
+ if ((!job->read_err && !job->write_err) ||
+ test_bit(DM_KCOPYD_IGNORE_ERROR, &job->flags)) {
+ /* get the next chunk of work */
+ progress = job->progress;
+ count = job->source.count - progress;
+ if (count) {
+ if (count > SUB_JOB_SIZE)
+ count = SUB_JOB_SIZE;
+
+ job->progress += count;
+ }
+ }
+ mutex_unlock(&job->lock);
+
+ if (count) {
+ int i;
+
+ *sub_job = *job;
+ sub_job->source.sector += progress;
+ sub_job->source.count = count;
+
+ for (i = 0; i < job->num_dests; i++) {
+ sub_job->dests[i].sector += progress;
+ sub_job->dests[i].count = count;
+ }
+
+ sub_job->fn = segment_complete;
+ sub_job->context = sub_job;
+ dispatch_job(sub_job);
+
+ } else if (atomic_dec_and_test(&job->sub_jobs)) {
+
+ /*
+ * Queue the completion callback to the kcopyd thread.
+ *
+ * Some callers assume that all the completions are called
+ * from a single thread and don't race with each other.
+ *
+ * We must not call the callback directly here because this
+ * code may not be executing in the thread.
+ */
+ push(&kc->complete_jobs, job);
+ wake(kc);
+ }
+}
+
+/*
+ * Create some sub jobs to share the work between them.
+ */
+static void split_job(struct kcopyd_job *master_job)
+{
+ int i;
+
+ atomic_inc(&master_job->kc->nr_jobs);
+
+ atomic_set(&master_job->sub_jobs, SPLIT_COUNT);
+ for (i = 0; i < SPLIT_COUNT; i++) {
+ master_job[i + 1].master_job = master_job;
+ segment_complete(0, 0u, &master_job[i + 1]);
+ }
+}
+
+int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from,
+ unsigned int num_dests, struct dm_io_region *dests,
+ unsigned int flags, dm_kcopyd_notify_fn fn, void *context)
+{
+ struct kcopyd_job *job;
+ int i;
+
+ /*
+ * Allocate an array of jobs consisting of one master job
+ * followed by SPLIT_COUNT sub jobs.
+ */
+ job = mempool_alloc(kc->job_pool, GFP_NOIO);
+
+ /*
+ * set up for the read.
+ */
+ job->kc = kc;
+ job->flags = flags;
+ job->read_err = 0;
+ job->write_err = 0;
+
+ job->num_dests = num_dests;
+ memcpy(&job->dests, dests, sizeof(*dests) * num_dests);
+
+ if (from) {
+ job->source = *from;
+ job->pages = NULL;
+ job->rw = READ;
+ } else {
+ memset(&job->source, 0, sizeof job->source);
+ job->source.count = job->dests[0].count;
+ job->pages = &zero_page_list;
+
+ /*
+ * Use WRITE SAME to optimize zeroing if all dests support it.
+ */
+ job->rw = WRITE | REQ_WRITE_SAME;
+ for (i = 0; i < job->num_dests; i++)
+ if (!bdev_write_same(job->dests[i].bdev)) {
+ job->rw = WRITE;
+ break;
+ }
+ }
+
+ job->fn = fn;
+ job->context = context;
+ job->master_job = job;
+
+ if (job->source.count <= SUB_JOB_SIZE)
+ dispatch_job(job);
+ else {
+ mutex_init(&job->lock);
+ job->progress = 0;
+ split_job(job);
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(dm_kcopyd_copy);
+
+int dm_kcopyd_zero(struct dm_kcopyd_client *kc,
+ unsigned num_dests, struct dm_io_region *dests,
+ unsigned flags, dm_kcopyd_notify_fn fn, void *context)
+{
+ return dm_kcopyd_copy(kc, NULL, num_dests, dests, flags, fn, context);
+}
+EXPORT_SYMBOL(dm_kcopyd_zero);
+
+void *dm_kcopyd_prepare_callback(struct dm_kcopyd_client *kc,
+ dm_kcopyd_notify_fn fn, void *context)
+{
+ struct kcopyd_job *job;
+
+ job = mempool_alloc(kc->job_pool, GFP_NOIO);
+
+ memset(job, 0, sizeof(struct kcopyd_job));
+ job->kc = kc;
+ job->fn = fn;
+ job->context = context;
+ job->master_job = job;
+
+ atomic_inc(&kc->nr_jobs);
+
+ return job;
+}
+EXPORT_SYMBOL(dm_kcopyd_prepare_callback);
+
+void dm_kcopyd_do_callback(void *j, int read_err, unsigned long write_err)
+{
+ struct kcopyd_job *job = j;
+ struct dm_kcopyd_client *kc = job->kc;
+
+ job->read_err = read_err;
+ job->write_err = write_err;
+
+ push(&kc->complete_jobs, job);
+ wake(kc);
+}
+EXPORT_SYMBOL(dm_kcopyd_do_callback);
+
+/*
+ * Cancels a kcopyd job, eg. someone might be deactivating a
+ * mirror.
+ */
+#if 0
+int kcopyd_cancel(struct kcopyd_job *job, int block)
+{
+ /* FIXME: finish */
+ return -1;
+}
+#endif /* 0 */
+
+/*-----------------------------------------------------------------
+ * Client setup
+ *---------------------------------------------------------------*/
+struct dm_kcopyd_client *dm_kcopyd_client_create(struct dm_kcopyd_throttle *throttle)
+{
+ int r = -ENOMEM;
+ struct dm_kcopyd_client *kc;
+
+ kc = kmalloc(sizeof(*kc), GFP_KERNEL);
+ if (!kc)
+ return ERR_PTR(-ENOMEM);
+
+ spin_lock_init(&kc->job_lock);
+ INIT_LIST_HEAD(&kc->complete_jobs);
+ INIT_LIST_HEAD(&kc->io_jobs);
+ INIT_LIST_HEAD(&kc->pages_jobs);
+ kc->throttle = throttle;
+
+ kc->job_pool = mempool_create_slab_pool(MIN_JOBS, _job_cache);
+ if (!kc->job_pool)
+ goto bad_slab;
+
+ INIT_WORK(&kc->kcopyd_work, do_work);
+ kc->kcopyd_wq = alloc_workqueue("kcopyd", WQ_MEM_RECLAIM, 0);
+ if (!kc->kcopyd_wq)
+ goto bad_workqueue;
+
+ kc->pages = NULL;
+ kc->nr_reserved_pages = kc->nr_free_pages = 0;
+ r = client_reserve_pages(kc, RESERVE_PAGES);
+ if (r)
+ goto bad_client_pages;
+
+ kc->io_client = dm_io_client_create();
+ if (IS_ERR(kc->io_client)) {
+ r = PTR_ERR(kc->io_client);
+ goto bad_io_client;
+ }
+
+ init_waitqueue_head(&kc->destroyq);
+ atomic_set(&kc->nr_jobs, 0);
+
+ return kc;
+
+bad_io_client:
+ client_free_pages(kc);
+bad_client_pages:
+ destroy_workqueue(kc->kcopyd_wq);
+bad_workqueue:
+ mempool_destroy(kc->job_pool);
+bad_slab:
+ kfree(kc);
+
+ return ERR_PTR(r);
+}
+EXPORT_SYMBOL(dm_kcopyd_client_create);
+
+void dm_kcopyd_client_destroy(struct dm_kcopyd_client *kc)
+{
+ /* Wait for completion of all jobs submitted by this client. */
+ wait_event(kc->destroyq, !atomic_read(&kc->nr_jobs));
+
+ BUG_ON(!list_empty(&kc->complete_jobs));
+ BUG_ON(!list_empty(&kc->io_jobs));
+ BUG_ON(!list_empty(&kc->pages_jobs));
+ destroy_workqueue(kc->kcopyd_wq);
+ dm_io_client_destroy(kc->io_client);
+ client_free_pages(kc);
+ mempool_destroy(kc->job_pool);
+ kfree(kc);
+}
+EXPORT_SYMBOL(dm_kcopyd_client_destroy);
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
new file mode 100644
index 000000000..53e848c10
--- /dev/null
+++ b/drivers/md/dm-linear.c
@@ -0,0 +1,182 @@
+/*
+ * Copyright (C) 2001-2003 Sistina Software (UK) Limited.
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm.h"
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/blkdev.h>
+#include <linux/bio.h>
+#include <linux/slab.h>
+#include <linux/device-mapper.h>
+
+#define DM_MSG_PREFIX "linear"
+
+/*
+ * Linear: maps a linear range of a device.
+ */
+struct linear_c {
+ struct dm_dev *dev;
+ sector_t start;
+};
+
+/*
+ * Construct a linear mapping: <dev_path> <offset>
+ */
+static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv)
+{
+ struct linear_c *lc;
+ unsigned long long tmp;
+ char dummy;
+
+ if (argc != 2) {
+ ti->error = "Invalid argument count";
+ return -EINVAL;
+ }
+
+ lc = kmalloc(sizeof(*lc), GFP_KERNEL);
+ if (lc == NULL) {
+ ti->error = "dm-linear: Cannot allocate linear context";
+ return -ENOMEM;
+ }
+
+ if (sscanf(argv[1], "%llu%c", &tmp, &dummy) != 1) {
+ ti->error = "dm-linear: Invalid device sector";
+ goto bad;
+ }
+ lc->start = tmp;
+
+ if (dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &lc->dev)) {
+ ti->error = "dm-linear: Device lookup failed";
+ goto bad;
+ }
+
+ ti->num_flush_bios = 1;
+ ti->num_discard_bios = 1;
+ ti->num_write_same_bios = 1;
+ ti->private = lc;
+ return 0;
+
+ bad:
+ kfree(lc);
+ return -EINVAL;
+}
+
+static void linear_dtr(struct dm_target *ti)
+{
+ struct linear_c *lc = (struct linear_c *) ti->private;
+
+ dm_put_device(ti, lc->dev);
+ kfree(lc);
+}
+
+static sector_t linear_map_sector(struct dm_target *ti, sector_t bi_sector)
+{
+ struct linear_c *lc = ti->private;
+
+ return lc->start + dm_target_offset(ti, bi_sector);
+}
+
+static void linear_map_bio(struct dm_target *ti, struct bio *bio)
+{
+ struct linear_c *lc = ti->private;
+
+ bio->bi_bdev = lc->dev->bdev;
+ if (bio_sectors(bio))
+ bio->bi_iter.bi_sector =
+ linear_map_sector(ti, bio->bi_iter.bi_sector);
+}
+
+static int linear_map(struct dm_target *ti, struct bio *bio)
+{
+ linear_map_bio(ti, bio);
+
+ return DM_MAPIO_REMAPPED;
+}
+
+static void linear_status(struct dm_target *ti, status_type_t type,
+ unsigned status_flags, char *result, unsigned maxlen)
+{
+ struct linear_c *lc = (struct linear_c *) ti->private;
+
+ switch (type) {
+ case STATUSTYPE_INFO:
+ result[0] = '\0';
+ break;
+
+ case STATUSTYPE_TABLE:
+ snprintf(result, maxlen, "%s %llu", lc->dev->name,
+ (unsigned long long)lc->start);
+ break;
+ }
+}
+
+static int linear_ioctl(struct dm_target *ti, unsigned int cmd,
+ unsigned long arg)
+{
+ struct linear_c *lc = (struct linear_c *) ti->private;
+ struct dm_dev *dev = lc->dev;
+ int r = 0;
+
+ /*
+ * Only pass ioctls through if the device sizes match exactly.
+ */
+ if (lc->start ||
+ ti->len != i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT)
+ r = scsi_verify_blk_ioctl(NULL, cmd);
+
+ return r ? : __blkdev_driver_ioctl(dev->bdev, dev->mode, cmd, arg);
+}
+
+static int linear_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
+ struct bio_vec *biovec, int max_size)
+{
+ struct linear_c *lc = ti->private;
+ struct request_queue *q = bdev_get_queue(lc->dev->bdev);
+
+ if (!q->merge_bvec_fn)
+ return max_size;
+
+ bvm->bi_bdev = lc->dev->bdev;
+ bvm->bi_sector = linear_map_sector(ti, bvm->bi_sector);
+
+ return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
+}
+
+static int linear_iterate_devices(struct dm_target *ti,
+ iterate_devices_callout_fn fn, void *data)
+{
+ struct linear_c *lc = ti->private;
+
+ return fn(ti, lc->dev, lc->start, ti->len, data);
+}
+
+static struct target_type linear_target = {
+ .name = "linear",
+ .version = {1, 2, 1},
+ .module = THIS_MODULE,
+ .ctr = linear_ctr,
+ .dtr = linear_dtr,
+ .map = linear_map,
+ .status = linear_status,
+ .ioctl = linear_ioctl,
+ .merge = linear_merge,
+ .iterate_devices = linear_iterate_devices,
+};
+
+int __init dm_linear_init(void)
+{
+ int r = dm_register_target(&linear_target);
+
+ if (r < 0)
+ DMERR("register failed %d", r);
+
+ return r;
+}
+
+void dm_linear_exit(void)
+{
+ dm_unregister_target(&linear_target);
+}
diff --git a/drivers/md/dm-log-userspace-base.c b/drivers/md/dm-log-userspace-base.c
new file mode 100644
index 000000000..058256d2e
--- /dev/null
+++ b/drivers/md/dm-log-userspace-base.c
@@ -0,0 +1,936 @@
+/*
+ * Copyright (C) 2006-2009 Red Hat, Inc.
+ *
+ * This file is released under the LGPL.
+ */
+
+#include <linux/bio.h>
+#include <linux/slab.h>
+#include <linux/jiffies.h>
+#include <linux/dm-dirty-log.h>
+#include <linux/device-mapper.h>
+#include <linux/dm-log-userspace.h>
+#include <linux/module.h>
+#include <linux/workqueue.h>
+
+#include "dm-log-userspace-transfer.h"
+
+#define DM_LOG_USERSPACE_VSN "1.3.0"
+
+#define FLUSH_ENTRY_POOL_SIZE 16
+
+struct dm_dirty_log_flush_entry {
+ int type;
+ region_t region;
+ struct list_head list;
+};
+
+/*
+ * This limit on the number of mark and clear request is, to a degree,
+ * arbitrary. However, there is some basis for the choice in the limits
+ * imposed on the size of data payload by dm-log-userspace-transfer.c:
+ * dm_consult_userspace().
+ */
+#define MAX_FLUSH_GROUP_COUNT 32
+
+struct log_c {
+ struct dm_target *ti;
+ struct dm_dev *log_dev;
+
+ char *usr_argv_str;
+ uint32_t usr_argc;
+
+ uint32_t region_size;
+ region_t region_count;
+ uint64_t luid;
+ char uuid[DM_UUID_LEN];
+
+ /*
+ * Mark and clear requests are held until a flush is issued
+ * so that we can group, and thereby limit, the amount of
+ * network traffic between kernel and userspace. The 'flush_lock'
+ * is used to protect these lists.
+ */
+ spinlock_t flush_lock;
+ struct list_head mark_list;
+ struct list_head clear_list;
+
+ /*
+ * in_sync_hint gets set when doing is_remote_recovering. It
+ * represents the first region that needs recovery. IOW, the
+ * first zero bit of sync_bits. This can be useful for to limit
+ * traffic for calls like is_remote_recovering and get_resync_work,
+ * but be take care in its use for anything else.
+ */
+ uint64_t in_sync_hint;
+
+ /*
+ * Workqueue for flush of clear region requests.
+ */
+ struct workqueue_struct *dmlog_wq;
+ struct delayed_work flush_log_work;
+ atomic_t sched_flush;
+
+ /*
+ * Combine userspace flush and mark requests for efficiency.
+ */
+ uint32_t integrated_flush;
+
+ mempool_t *flush_entry_pool;
+};
+
+static struct kmem_cache *_flush_entry_cache;
+
+static int userspace_do_request(struct log_c *lc, const char *uuid,
+ int request_type, char *data, size_t data_size,
+ char *rdata, size_t *rdata_size)
+{
+ int r;
+
+ /*
+ * If the server isn't there, -ESRCH is returned,
+ * and we must keep trying until the server is
+ * restored.
+ */
+retry:
+ r = dm_consult_userspace(uuid, lc->luid, request_type, data,
+ data_size, rdata, rdata_size);
+
+ if (r != -ESRCH)
+ return r;
+
+ DMERR(" Userspace log server not found.");
+ while (1) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ schedule_timeout(2*HZ);
+ DMWARN("Attempting to contact userspace log server...");
+ r = dm_consult_userspace(uuid, lc->luid, DM_ULOG_CTR,
+ lc->usr_argv_str,
+ strlen(lc->usr_argv_str) + 1,
+ NULL, NULL);
+ if (!r)
+ break;
+ }
+ DMINFO("Reconnected to userspace log server... DM_ULOG_CTR complete");
+ r = dm_consult_userspace(uuid, lc->luid, DM_ULOG_RESUME, NULL,
+ 0, NULL, NULL);
+ if (!r)
+ goto retry;
+
+ DMERR("Error trying to resume userspace log: %d", r);
+
+ return -ESRCH;
+}
+
+static int build_constructor_string(struct dm_target *ti,
+ unsigned argc, char **argv,
+ char **ctr_str)
+{
+ int i, str_size;
+ char *str = NULL;
+
+ *ctr_str = NULL;
+
+ /*
+ * Determine overall size of the string.
+ */
+ for (i = 0, str_size = 0; i < argc; i++)
+ str_size += strlen(argv[i]) + 1; /* +1 for space between args */
+
+ str_size += 20; /* Max number of chars in a printed u64 number */
+
+ str = kzalloc(str_size, GFP_KERNEL);
+ if (!str) {
+ DMWARN("Unable to allocate memory for constructor string");
+ return -ENOMEM;
+ }
+
+ str_size = sprintf(str, "%llu", (unsigned long long)ti->len);
+ for (i = 0; i < argc; i++)
+ str_size += sprintf(str + str_size, " %s", argv[i]);
+
+ *ctr_str = str;
+ return str_size;
+}
+
+static void do_flush(struct work_struct *work)
+{
+ int r;
+ struct log_c *lc = container_of(work, struct log_c, flush_log_work.work);
+
+ atomic_set(&lc->sched_flush, 0);
+
+ r = userspace_do_request(lc, lc->uuid, DM_ULOG_FLUSH, NULL, 0, NULL, NULL);
+
+ if (r)
+ dm_table_event(lc->ti->table);
+}
+
+/*
+ * userspace_ctr
+ *
+ * argv contains:
+ * <UUID> [integrated_flush] <other args>
+ * Where 'other args' are the userspace implementation-specific log
+ * arguments.
+ *
+ * Example:
+ * <UUID> [integrated_flush] clustered-disk <arg count> <log dev>
+ * <region_size> [[no]sync]
+ *
+ * This module strips off the <UUID> and uses it for identification
+ * purposes when communicating with userspace about a log.
+ *
+ * If integrated_flush is defined, the kernel combines flush
+ * and mark requests.
+ *
+ * The rest of the line, beginning with 'clustered-disk', is passed
+ * to the userspace ctr function.
+ */
+static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti,
+ unsigned argc, char **argv)
+{
+ int r = 0;
+ int str_size;
+ char *ctr_str = NULL;
+ struct log_c *lc = NULL;
+ uint64_t rdata;
+ size_t rdata_size = sizeof(rdata);
+ char *devices_rdata = NULL;
+ size_t devices_rdata_size = DM_NAME_LEN;
+
+ if (argc < 3) {
+ DMWARN("Too few arguments to userspace dirty log");
+ return -EINVAL;
+ }
+
+ lc = kzalloc(sizeof(*lc), GFP_KERNEL);
+ if (!lc) {
+ DMWARN("Unable to allocate userspace log context.");
+ return -ENOMEM;
+ }
+
+ /* The ptr value is sufficient for local unique id */
+ lc->luid = (unsigned long)lc;
+
+ lc->ti = ti;
+
+ if (strlen(argv[0]) > (DM_UUID_LEN - 1)) {
+ DMWARN("UUID argument too long.");
+ kfree(lc);
+ return -EINVAL;
+ }
+
+ lc->usr_argc = argc;
+
+ strncpy(lc->uuid, argv[0], DM_UUID_LEN);
+ argc--;
+ argv++;
+ spin_lock_init(&lc->flush_lock);
+ INIT_LIST_HEAD(&lc->mark_list);
+ INIT_LIST_HEAD(&lc->clear_list);
+
+ if (!strcasecmp(argv[0], "integrated_flush")) {
+ lc->integrated_flush = 1;
+ argc--;
+ argv++;
+ }
+
+ str_size = build_constructor_string(ti, argc, argv, &ctr_str);
+ if (str_size < 0) {
+ kfree(lc);
+ return str_size;
+ }
+
+ devices_rdata = kzalloc(devices_rdata_size, GFP_KERNEL);
+ if (!devices_rdata) {
+ DMERR("Failed to allocate memory for device information");
+ r = -ENOMEM;
+ goto out;
+ }
+
+ lc->flush_entry_pool = mempool_create_slab_pool(FLUSH_ENTRY_POOL_SIZE,
+ _flush_entry_cache);
+ if (!lc->flush_entry_pool) {
+ DMERR("Failed to create flush_entry_pool");
+ r = -ENOMEM;
+ goto out;
+ }
+
+ /*
+ * Send table string and get back any opened device.
+ */
+ r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_CTR,
+ ctr_str, str_size,
+ devices_rdata, &devices_rdata_size);
+
+ if (r < 0) {
+ if (r == -ESRCH)
+ DMERR("Userspace log server not found");
+ else
+ DMERR("Userspace log server failed to create log");
+ goto out;
+ }
+
+ /* Since the region size does not change, get it now */
+ rdata_size = sizeof(rdata);
+ r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_GET_REGION_SIZE,
+ NULL, 0, (char *)&rdata, &rdata_size);
+
+ if (r) {
+ DMERR("Failed to get region size of dirty log");
+ goto out;
+ }
+
+ lc->region_size = (uint32_t)rdata;
+ lc->region_count = dm_sector_div_up(ti->len, lc->region_size);
+
+ if (devices_rdata_size) {
+ if (devices_rdata[devices_rdata_size - 1] != '\0') {
+ DMERR("DM_ULOG_CTR device return string not properly terminated");
+ r = -EINVAL;
+ goto out;
+ }
+ r = dm_get_device(ti, devices_rdata,
+ dm_table_get_mode(ti->table), &lc->log_dev);
+ if (r)
+ DMERR("Failed to register %s with device-mapper",
+ devices_rdata);
+ }
+
+ if (lc->integrated_flush) {
+ lc->dmlog_wq = alloc_workqueue("dmlogd", WQ_MEM_RECLAIM, 0);
+ if (!lc->dmlog_wq) {
+ DMERR("couldn't start dmlogd");
+ r = -ENOMEM;
+ goto out;
+ }
+
+ INIT_DELAYED_WORK(&lc->flush_log_work, do_flush);
+ atomic_set(&lc->sched_flush, 0);
+ }
+
+out:
+ kfree(devices_rdata);
+ if (r) {
+ if (lc->flush_entry_pool)
+ mempool_destroy(lc->flush_entry_pool);
+ kfree(lc);
+ kfree(ctr_str);
+ } else {
+ lc->usr_argv_str = ctr_str;
+ log->context = lc;
+ }
+
+ return r;
+}
+
+static void userspace_dtr(struct dm_dirty_log *log)
+{
+ struct log_c *lc = log->context;
+
+ if (lc->integrated_flush) {
+ /* flush workqueue */
+ if (atomic_read(&lc->sched_flush))
+ flush_delayed_work(&lc->flush_log_work);
+
+ destroy_workqueue(lc->dmlog_wq);
+ }
+
+ (void) dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_DTR,
+ NULL, 0, NULL, NULL);
+
+ if (lc->log_dev)
+ dm_put_device(lc->ti, lc->log_dev);
+
+ mempool_destroy(lc->flush_entry_pool);
+
+ kfree(lc->usr_argv_str);
+ kfree(lc);
+
+ return;
+}
+
+static int userspace_presuspend(struct dm_dirty_log *log)
+{
+ int r;
+ struct log_c *lc = log->context;
+
+ r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_PRESUSPEND,
+ NULL, 0, NULL, NULL);
+
+ return r;
+}
+
+static int userspace_postsuspend(struct dm_dirty_log *log)
+{
+ int r;
+ struct log_c *lc = log->context;
+
+ /*
+ * Run planned flush earlier.
+ */
+ if (lc->integrated_flush && atomic_read(&lc->sched_flush))
+ flush_delayed_work(&lc->flush_log_work);
+
+ r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_POSTSUSPEND,
+ NULL, 0, NULL, NULL);
+
+ return r;
+}
+
+static int userspace_resume(struct dm_dirty_log *log)
+{
+ int r;
+ struct log_c *lc = log->context;
+
+ lc->in_sync_hint = 0;
+ r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_RESUME,
+ NULL, 0, NULL, NULL);
+
+ return r;
+}
+
+static uint32_t userspace_get_region_size(struct dm_dirty_log *log)
+{
+ struct log_c *lc = log->context;
+
+ return lc->region_size;
+}
+
+/*
+ * userspace_is_clean
+ *
+ * Check whether a region is clean. If there is any sort of
+ * failure when consulting the server, we return not clean.
+ *
+ * Returns: 1 if clean, 0 otherwise
+ */
+static int userspace_is_clean(struct dm_dirty_log *log, region_t region)
+{
+ int r;
+ uint64_t region64 = (uint64_t)region;
+ int64_t is_clean;
+ size_t rdata_size;
+ struct log_c *lc = log->context;
+
+ rdata_size = sizeof(is_clean);
+ r = userspace_do_request(lc, lc->uuid, DM_ULOG_IS_CLEAN,
+ (char *)&region64, sizeof(region64),
+ (char *)&is_clean, &rdata_size);
+
+ return (r) ? 0 : (int)is_clean;
+}
+
+/*
+ * userspace_in_sync
+ *
+ * Check if the region is in-sync. If there is any sort
+ * of failure when consulting the server, we assume that
+ * the region is not in sync.
+ *
+ * If 'can_block' is set, return immediately
+ *
+ * Returns: 1 if in-sync, 0 if not-in-sync, -EWOULDBLOCK
+ */
+static int userspace_in_sync(struct dm_dirty_log *log, region_t region,
+ int can_block)
+{
+ int r;
+ uint64_t region64 = region;
+ int64_t in_sync;
+ size_t rdata_size;
+ struct log_c *lc = log->context;
+
+ /*
+ * We can never respond directly - even if in_sync_hint is
+ * set. This is because another machine could see a device
+ * failure and mark the region out-of-sync. If we don't go
+ * to userspace to ask, we might think the region is in-sync
+ * and allow a read to pick up data that is stale. (This is
+ * very unlikely if a device actually fails; but it is very
+ * likely if a connection to one device from one machine fails.)
+ *
+ * There still might be a problem if the mirror caches the region
+ * state as in-sync... but then this call would not be made. So,
+ * that is a mirror problem.
+ */
+ if (!can_block)
+ return -EWOULDBLOCK;
+
+ rdata_size = sizeof(in_sync);
+ r = userspace_do_request(lc, lc->uuid, DM_ULOG_IN_SYNC,
+ (char *)&region64, sizeof(region64),
+ (char *)&in_sync, &rdata_size);
+ return (r) ? 0 : (int)in_sync;
+}
+
+static int flush_one_by_one(struct log_c *lc, struct list_head *flush_list)
+{
+ int r = 0;
+ struct dm_dirty_log_flush_entry *fe;
+
+ list_for_each_entry(fe, flush_list, list) {
+ r = userspace_do_request(lc, lc->uuid, fe->type,
+ (char *)&fe->region,
+ sizeof(fe->region),
+ NULL, NULL);
+ if (r)
+ break;
+ }
+
+ return r;
+}
+
+static int flush_by_group(struct log_c *lc, struct list_head *flush_list,
+ int flush_with_payload)
+{
+ int r = 0;
+ int count;
+ uint32_t type = 0;
+ struct dm_dirty_log_flush_entry *fe, *tmp_fe;
+ LIST_HEAD(tmp_list);
+ uint64_t group[MAX_FLUSH_GROUP_COUNT];
+
+ /*
+ * Group process the requests
+ */
+ while (!list_empty(flush_list)) {
+ count = 0;
+
+ list_for_each_entry_safe(fe, tmp_fe, flush_list, list) {
+ group[count] = fe->region;
+ count++;
+
+ list_move(&fe->list, &tmp_list);
+
+ type = fe->type;
+ if (count >= MAX_FLUSH_GROUP_COUNT)
+ break;
+ }
+
+ if (flush_with_payload) {
+ r = userspace_do_request(lc, lc->uuid, DM_ULOG_FLUSH,
+ (char *)(group),
+ count * sizeof(uint64_t),
+ NULL, NULL);
+ /*
+ * Integrated flush failed.
+ */
+ if (r)
+ break;
+ } else {
+ r = userspace_do_request(lc, lc->uuid, type,
+ (char *)(group),
+ count * sizeof(uint64_t),
+ NULL, NULL);
+ if (r) {
+ /*
+ * Group send failed. Attempt one-by-one.
+ */
+ list_splice_init(&tmp_list, flush_list);
+ r = flush_one_by_one(lc, flush_list);
+ break;
+ }
+ }
+ }
+
+ /*
+ * Must collect flush_entrys that were successfully processed
+ * as a group so that they will be free'd by the caller.
+ */
+ list_splice_init(&tmp_list, flush_list);
+
+ return r;
+}
+
+/*
+ * userspace_flush
+ *
+ * This function is ok to block.
+ * The flush happens in two stages. First, it sends all
+ * clear/mark requests that are on the list. Then it
+ * tells the server to commit them. This gives the
+ * server a chance to optimise the commit, instead of
+ * doing it for every request.
+ *
+ * Additionally, we could implement another thread that
+ * sends the requests up to the server - reducing the
+ * load on flush. Then the flush would have less in
+ * the list and be responsible for the finishing commit.
+ *
+ * Returns: 0 on success, < 0 on failure
+ */
+static int userspace_flush(struct dm_dirty_log *log)
+{
+ int r = 0;
+ unsigned long flags;
+ struct log_c *lc = log->context;
+ LIST_HEAD(mark_list);
+ LIST_HEAD(clear_list);
+ int mark_list_is_empty;
+ int clear_list_is_empty;
+ struct dm_dirty_log_flush_entry *fe, *tmp_fe;
+ mempool_t *flush_entry_pool = lc->flush_entry_pool;
+
+ spin_lock_irqsave(&lc->flush_lock, flags);
+ list_splice_init(&lc->mark_list, &mark_list);
+ list_splice_init(&lc->clear_list, &clear_list);
+ spin_unlock_irqrestore(&lc->flush_lock, flags);
+
+ mark_list_is_empty = list_empty(&mark_list);
+ clear_list_is_empty = list_empty(&clear_list);
+
+ if (mark_list_is_empty && clear_list_is_empty)
+ return 0;
+
+ r = flush_by_group(lc, &clear_list, 0);
+ if (r)
+ goto out;
+
+ if (!lc->integrated_flush) {
+ r = flush_by_group(lc, &mark_list, 0);
+ if (r)
+ goto out;
+ r = userspace_do_request(lc, lc->uuid, DM_ULOG_FLUSH,
+ NULL, 0, NULL, NULL);
+ goto out;
+ }
+
+ /*
+ * Send integrated flush request with mark_list as payload.
+ */
+ r = flush_by_group(lc, &mark_list, 1);
+ if (r)
+ goto out;
+
+ if (mark_list_is_empty && !atomic_read(&lc->sched_flush)) {
+ /*
+ * When there are only clear region requests,
+ * we schedule a flush in the future.
+ */
+ queue_delayed_work(lc->dmlog_wq, &lc->flush_log_work, 3 * HZ);
+ atomic_set(&lc->sched_flush, 1);
+ } else {
+ /*
+ * Cancel pending flush because we
+ * have already flushed in mark_region.
+ */
+ cancel_delayed_work(&lc->flush_log_work);
+ atomic_set(&lc->sched_flush, 0);
+ }
+
+out:
+ /*
+ * We can safely remove these entries, even after failure.
+ * Calling code will receive an error and will know that
+ * the log facility has failed.
+ */
+ list_for_each_entry_safe(fe, tmp_fe, &mark_list, list) {
+ list_del(&fe->list);
+ mempool_free(fe, flush_entry_pool);
+ }
+ list_for_each_entry_safe(fe, tmp_fe, &clear_list, list) {
+ list_del(&fe->list);
+ mempool_free(fe, flush_entry_pool);
+ }
+
+ if (r)
+ dm_table_event(lc->ti->table);
+
+ return r;
+}
+
+/*
+ * userspace_mark_region
+ *
+ * This function should avoid blocking unless absolutely required.
+ * (Memory allocation is valid for blocking.)
+ */
+static void userspace_mark_region(struct dm_dirty_log *log, region_t region)
+{
+ unsigned long flags;
+ struct log_c *lc = log->context;
+ struct dm_dirty_log_flush_entry *fe;
+
+ /* Wait for an allocation, but _never_ fail */
+ fe = mempool_alloc(lc->flush_entry_pool, GFP_NOIO);
+ BUG_ON(!fe);
+
+ spin_lock_irqsave(&lc->flush_lock, flags);
+ fe->type = DM_ULOG_MARK_REGION;
+ fe->region = region;
+ list_add(&fe->list, &lc->mark_list);
+ spin_unlock_irqrestore(&lc->flush_lock, flags);
+
+ return;
+}
+
+/*
+ * userspace_clear_region
+ *
+ * This function must not block.
+ * So, the alloc can't block. In the worst case, it is ok to
+ * fail. It would simply mean we can't clear the region.
+ * Does nothing to current sync context, but does mean
+ * the region will be re-sync'ed on a reload of the mirror
+ * even though it is in-sync.
+ */
+static void userspace_clear_region(struct dm_dirty_log *log, region_t region)
+{
+ unsigned long flags;
+ struct log_c *lc = log->context;
+ struct dm_dirty_log_flush_entry *fe;
+
+ /*
+ * If we fail to allocate, we skip the clearing of
+ * the region. This doesn't hurt us in any way, except
+ * to cause the region to be resync'ed when the
+ * device is activated next time.
+ */
+ fe = mempool_alloc(lc->flush_entry_pool, GFP_ATOMIC);
+ if (!fe) {
+ DMERR("Failed to allocate memory to clear region.");
+ return;
+ }
+
+ spin_lock_irqsave(&lc->flush_lock, flags);
+ fe->type = DM_ULOG_CLEAR_REGION;
+ fe->region = region;
+ list_add(&fe->list, &lc->clear_list);
+ spin_unlock_irqrestore(&lc->flush_lock, flags);
+
+ return;
+}
+
+/*
+ * userspace_get_resync_work
+ *
+ * Get a region that needs recovery. It is valid to return
+ * an error for this function.
+ *
+ * Returns: 1 if region filled, 0 if no work, <0 on error
+ */
+static int userspace_get_resync_work(struct dm_dirty_log *log, region_t *region)
+{
+ int r;
+ size_t rdata_size;
+ struct log_c *lc = log->context;
+ struct {
+ int64_t i; /* 64-bit for mix arch compatibility */
+ region_t r;
+ } pkg;
+
+ if (lc->in_sync_hint >= lc->region_count)
+ return 0;
+
+ rdata_size = sizeof(pkg);
+ r = userspace_do_request(lc, lc->uuid, DM_ULOG_GET_RESYNC_WORK,
+ NULL, 0, (char *)&pkg, &rdata_size);
+
+ *region = pkg.r;
+ return (r) ? r : (int)pkg.i;
+}
+
+/*
+ * userspace_set_region_sync
+ *
+ * Set the sync status of a given region. This function
+ * must not fail.
+ */
+static void userspace_set_region_sync(struct dm_dirty_log *log,
+ region_t region, int in_sync)
+{
+ struct log_c *lc = log->context;
+ struct {
+ region_t r;
+ int64_t i;
+ } pkg;
+
+ pkg.r = region;
+ pkg.i = (int64_t)in_sync;
+
+ (void) userspace_do_request(lc, lc->uuid, DM_ULOG_SET_REGION_SYNC,
+ (char *)&pkg, sizeof(pkg), NULL, NULL);
+
+ /*
+ * It would be nice to be able to report failures.
+ * However, it is easy enough to detect and resolve.
+ */
+ return;
+}
+
+/*
+ * userspace_get_sync_count
+ *
+ * If there is any sort of failure when consulting the server,
+ * we assume that the sync count is zero.
+ *
+ * Returns: sync count on success, 0 on failure
+ */
+static region_t userspace_get_sync_count(struct dm_dirty_log *log)
+{
+ int r;
+ size_t rdata_size;
+ uint64_t sync_count;
+ struct log_c *lc = log->context;
+
+ rdata_size = sizeof(sync_count);
+ r = userspace_do_request(lc, lc->uuid, DM_ULOG_GET_SYNC_COUNT,
+ NULL, 0, (char *)&sync_count, &rdata_size);
+
+ if (r)
+ return 0;
+
+ if (sync_count >= lc->region_count)
+ lc->in_sync_hint = lc->region_count;
+
+ return (region_t)sync_count;
+}
+
+/*
+ * userspace_status
+ *
+ * Returns: amount of space consumed
+ */
+static int userspace_status(struct dm_dirty_log *log, status_type_t status_type,
+ char *result, unsigned maxlen)
+{
+ int r = 0;
+ char *table_args;
+ size_t sz = (size_t)maxlen;
+ struct log_c *lc = log->context;
+
+ switch (status_type) {
+ case STATUSTYPE_INFO:
+ r = userspace_do_request(lc, lc->uuid, DM_ULOG_STATUS_INFO,
+ NULL, 0, result, &sz);
+
+ if (r) {
+ sz = 0;
+ DMEMIT("%s 1 COM_FAILURE", log->type->name);
+ }
+ break;
+ case STATUSTYPE_TABLE:
+ sz = 0;
+ table_args = strchr(lc->usr_argv_str, ' ');
+ BUG_ON(!table_args); /* There will always be a ' ' */
+ table_args++;
+
+ DMEMIT("%s %u %s ", log->type->name, lc->usr_argc, lc->uuid);
+ if (lc->integrated_flush)
+ DMEMIT("integrated_flush ");
+ DMEMIT("%s ", table_args);
+ break;
+ }
+ return (r) ? 0 : (int)sz;
+}
+
+/*
+ * userspace_is_remote_recovering
+ *
+ * Returns: 1 if region recovering, 0 otherwise
+ */
+static int userspace_is_remote_recovering(struct dm_dirty_log *log,
+ region_t region)
+{
+ int r;
+ uint64_t region64 = region;
+ struct log_c *lc = log->context;
+ static unsigned long limit;
+ struct {
+ int64_t is_recovering;
+ uint64_t in_sync_hint;
+ } pkg;
+ size_t rdata_size = sizeof(pkg);
+
+ /*
+ * Once the mirror has been reported to be in-sync,
+ * it will never again ask for recovery work. So,
+ * we can safely say there is not a remote machine
+ * recovering if the device is in-sync. (in_sync_hint
+ * must be reset at resume time.)
+ */
+ if (region < lc->in_sync_hint)
+ return 0;
+ else if (time_after(limit, jiffies))
+ return 1;
+
+ limit = jiffies + (HZ / 4);
+ r = userspace_do_request(lc, lc->uuid, DM_ULOG_IS_REMOTE_RECOVERING,
+ (char *)&region64, sizeof(region64),
+ (char *)&pkg, &rdata_size);
+ if (r)
+ return 1;
+
+ lc->in_sync_hint = pkg.in_sync_hint;
+
+ return (int)pkg.is_recovering;
+}
+
+static struct dm_dirty_log_type _userspace_type = {
+ .name = "userspace",
+ .module = THIS_MODULE,
+ .ctr = userspace_ctr,
+ .dtr = userspace_dtr,
+ .presuspend = userspace_presuspend,
+ .postsuspend = userspace_postsuspend,
+ .resume = userspace_resume,
+ .get_region_size = userspace_get_region_size,
+ .is_clean = userspace_is_clean,
+ .in_sync = userspace_in_sync,
+ .flush = userspace_flush,
+ .mark_region = userspace_mark_region,
+ .clear_region = userspace_clear_region,
+ .get_resync_work = userspace_get_resync_work,
+ .set_region_sync = userspace_set_region_sync,
+ .get_sync_count = userspace_get_sync_count,
+ .status = userspace_status,
+ .is_remote_recovering = userspace_is_remote_recovering,
+};
+
+static int __init userspace_dirty_log_init(void)
+{
+ int r = 0;
+
+ _flush_entry_cache = KMEM_CACHE(dm_dirty_log_flush_entry, 0);
+ if (!_flush_entry_cache) {
+ DMWARN("Unable to create flush_entry_cache: No memory.");
+ return -ENOMEM;
+ }
+
+ r = dm_ulog_tfr_init();
+ if (r) {
+ DMWARN("Unable to initialize userspace log communications");
+ kmem_cache_destroy(_flush_entry_cache);
+ return r;
+ }
+
+ r = dm_dirty_log_type_register(&_userspace_type);
+ if (r) {
+ DMWARN("Couldn't register userspace dirty log type");
+ dm_ulog_tfr_exit();
+ kmem_cache_destroy(_flush_entry_cache);
+ return r;
+ }
+
+ DMINFO("version " DM_LOG_USERSPACE_VSN " loaded");
+ return 0;
+}
+
+static void __exit userspace_dirty_log_exit(void)
+{
+ dm_dirty_log_type_unregister(&_userspace_type);
+ dm_ulog_tfr_exit();
+ kmem_cache_destroy(_flush_entry_cache);
+
+ DMINFO("version " DM_LOG_USERSPACE_VSN " unloaded");
+ return;
+}
+
+module_init(userspace_dirty_log_init);
+module_exit(userspace_dirty_log_exit);
+
+MODULE_DESCRIPTION(DM_NAME " userspace dirty log link");
+MODULE_AUTHOR("Jonathan Brassow <dm-devel@redhat.com>");
+MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm-log-userspace-transfer.c b/drivers/md/dm-log-userspace-transfer.c
new file mode 100644
index 000000000..fdf8ec304
--- /dev/null
+++ b/drivers/md/dm-log-userspace-transfer.c
@@ -0,0 +1,287 @@
+/*
+ * Copyright (C) 2006-2009 Red Hat, Inc.
+ *
+ * This file is released under the LGPL.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <net/sock.h>
+#include <linux/workqueue.h>
+#include <linux/connector.h>
+#include <linux/device-mapper.h>
+#include <linux/dm-log-userspace.h>
+
+#include "dm-log-userspace-transfer.h"
+
+static uint32_t dm_ulog_seq;
+
+/*
+ * Netlink/Connector is an unreliable protocol. How long should
+ * we wait for a response before assuming it was lost and retrying?
+ * (If we do receive a response after this time, it will be discarded
+ * and the response to the resent request will be waited for.
+ */
+#define DM_ULOG_RETRY_TIMEOUT (15 * HZ)
+
+/*
+ * Pre-allocated space for speed
+ */
+#define DM_ULOG_PREALLOCED_SIZE 512
+static struct cn_msg *prealloced_cn_msg;
+static struct dm_ulog_request *prealloced_ulog_tfr;
+
+static struct cb_id ulog_cn_id = {
+ .idx = CN_IDX_DM,
+ .val = CN_VAL_DM_USERSPACE_LOG
+};
+
+static DEFINE_MUTEX(dm_ulog_lock);
+
+struct receiving_pkg {
+ struct list_head list;
+ struct completion complete;
+
+ uint32_t seq;
+
+ int error;
+ size_t *data_size;
+ char *data;
+};
+
+static DEFINE_SPINLOCK(receiving_list_lock);
+static struct list_head receiving_list;
+
+static int dm_ulog_sendto_server(struct dm_ulog_request *tfr)
+{
+ int r;
+ struct cn_msg *msg = prealloced_cn_msg;
+
+ memset(msg, 0, sizeof(struct cn_msg));
+
+ msg->id.idx = ulog_cn_id.idx;
+ msg->id.val = ulog_cn_id.val;
+ msg->ack = 0;
+ msg->seq = tfr->seq;
+ msg->len = sizeof(struct dm_ulog_request) + tfr->data_size;
+
+ r = cn_netlink_send(msg, 0, 0, gfp_any());
+
+ return r;
+}
+
+/*
+ * Parameters for this function can be either msg or tfr, but not
+ * both. This function fills in the reply for a waiting request.
+ * If just msg is given, then the reply is simply an ACK from userspace
+ * that the request was received.
+ *
+ * Returns: 0 on success, -ENOENT on failure
+ */
+static int fill_pkg(struct cn_msg *msg, struct dm_ulog_request *tfr)
+{
+ uint32_t rtn_seq = (msg) ? msg->seq : (tfr) ? tfr->seq : 0;
+ struct receiving_pkg *pkg;
+
+ /*
+ * The 'receiving_pkg' entries in this list are statically
+ * allocated on the stack in 'dm_consult_userspace'.
+ * Each process that is waiting for a reply from the user
+ * space server will have an entry in this list.
+ *
+ * We are safe to do it this way because the stack space
+ * is unique to each process, but still addressable by
+ * other processes.
+ */
+ list_for_each_entry(pkg, &receiving_list, list) {
+ if (rtn_seq != pkg->seq)
+ continue;
+
+ if (msg) {
+ pkg->error = -msg->ack;
+ /*
+ * If we are trying again, we will need to know our
+ * storage capacity. Otherwise, along with the
+ * error code, we make explicit that we have no data.
+ */
+ if (pkg->error != -EAGAIN)
+ *(pkg->data_size) = 0;
+ } else if (tfr->data_size > *(pkg->data_size)) {
+ DMERR("Insufficient space to receive package [%u] "
+ "(%u vs %zu)", tfr->request_type,
+ tfr->data_size, *(pkg->data_size));
+
+ *(pkg->data_size) = 0;
+ pkg->error = -ENOSPC;
+ } else {
+ pkg->error = tfr->error;
+ memcpy(pkg->data, tfr->data, tfr->data_size);
+ *(pkg->data_size) = tfr->data_size;
+ }
+ complete(&pkg->complete);
+ return 0;
+ }
+
+ return -ENOENT;
+}
+
+/*
+ * This is the connector callback that delivers data
+ * that was sent from userspace.
+ */
+static void cn_ulog_callback(struct cn_msg *msg, struct netlink_skb_parms *nsp)
+{
+ struct dm_ulog_request *tfr = (struct dm_ulog_request *)(msg + 1);
+
+ if (!capable(CAP_SYS_ADMIN))
+ return;
+
+ spin_lock(&receiving_list_lock);
+ if (msg->len == 0)
+ fill_pkg(msg, NULL);
+ else if (msg->len < sizeof(*tfr))
+ DMERR("Incomplete message received (expected %u, got %u): [%u]",
+ (unsigned)sizeof(*tfr), msg->len, msg->seq);
+ else
+ fill_pkg(NULL, tfr);
+ spin_unlock(&receiving_list_lock);
+}
+
+/**
+ * dm_consult_userspace
+ * @uuid: log's universal unique identifier (must be DM_UUID_LEN in size)
+ * @luid: log's local unique identifier
+ * @request_type: found in include/linux/dm-log-userspace.h
+ * @data: data to tx to the server
+ * @data_size: size of data in bytes
+ * @rdata: place to put return data from server
+ * @rdata_size: value-result (amount of space given/amount of space used)
+ *
+ * rdata_size is undefined on failure.
+ *
+ * Memory used to communicate with userspace is zero'ed
+ * before populating to ensure that no unwanted bits leak
+ * from kernel space to user-space. All userspace log communications
+ * between kernel and user space go through this function.
+ *
+ * Returns: 0 on success, -EXXX on failure
+ **/
+int dm_consult_userspace(const char *uuid, uint64_t luid, int request_type,
+ char *data, size_t data_size,
+ char *rdata, size_t *rdata_size)
+{
+ int r = 0;
+ unsigned long tmo;
+ size_t dummy = 0;
+ int overhead_size = sizeof(struct dm_ulog_request) + sizeof(struct cn_msg);
+ struct dm_ulog_request *tfr = prealloced_ulog_tfr;
+ struct receiving_pkg pkg;
+
+ /*
+ * Given the space needed to hold the 'struct cn_msg' and
+ * 'struct dm_ulog_request' - do we have enough payload
+ * space remaining?
+ */
+ if (data_size > (DM_ULOG_PREALLOCED_SIZE - overhead_size)) {
+ DMINFO("Size of tfr exceeds preallocated size");
+ return -EINVAL;
+ }
+
+ if (!rdata_size)
+ rdata_size = &dummy;
+resend:
+ /*
+ * We serialize the sending of requests so we can
+ * use the preallocated space.
+ */
+ mutex_lock(&dm_ulog_lock);
+
+ memset(tfr, 0, DM_ULOG_PREALLOCED_SIZE - sizeof(struct cn_msg));
+ memcpy(tfr->uuid, uuid, DM_UUID_LEN);
+ tfr->version = DM_ULOG_REQUEST_VERSION;
+ tfr->luid = luid;
+ tfr->seq = dm_ulog_seq++;
+
+ /*
+ * Must be valid request type (all other bits set to
+ * zero). This reserves other bits for possible future
+ * use.
+ */
+ tfr->request_type = request_type & DM_ULOG_REQUEST_MASK;
+
+ tfr->data_size = data_size;
+ if (data && data_size)
+ memcpy(tfr->data, data, data_size);
+
+ memset(&pkg, 0, sizeof(pkg));
+ init_completion(&pkg.complete);
+ pkg.seq = tfr->seq;
+ pkg.data_size = rdata_size;
+ pkg.data = rdata;
+ spin_lock(&receiving_list_lock);
+ list_add(&(pkg.list), &receiving_list);
+ spin_unlock(&receiving_list_lock);
+
+ r = dm_ulog_sendto_server(tfr);
+
+ mutex_unlock(&dm_ulog_lock);
+
+ if (r) {
+ DMERR("Unable to send log request [%u] to userspace: %d",
+ request_type, r);
+ spin_lock(&receiving_list_lock);
+ list_del_init(&(pkg.list));
+ spin_unlock(&receiving_list_lock);
+
+ goto out;
+ }
+
+ tmo = wait_for_completion_timeout(&(pkg.complete), DM_ULOG_RETRY_TIMEOUT);
+ spin_lock(&receiving_list_lock);
+ list_del_init(&(pkg.list));
+ spin_unlock(&receiving_list_lock);
+ if (!tmo) {
+ DMWARN("[%s] Request timed out: [%u/%u] - retrying",
+ (strlen(uuid) > 8) ?
+ (uuid + (strlen(uuid) - 8)) : (uuid),
+ request_type, pkg.seq);
+ goto resend;
+ }
+
+ r = pkg.error;
+ if (r == -EAGAIN)
+ goto resend;
+
+out:
+ return r;
+}
+
+int dm_ulog_tfr_init(void)
+{
+ int r;
+ void *prealloced;
+
+ INIT_LIST_HEAD(&receiving_list);
+
+ prealloced = kmalloc(DM_ULOG_PREALLOCED_SIZE, GFP_KERNEL);
+ if (!prealloced)
+ return -ENOMEM;
+
+ prealloced_cn_msg = prealloced;
+ prealloced_ulog_tfr = prealloced + sizeof(struct cn_msg);
+
+ r = cn_add_callback(&ulog_cn_id, "dmlogusr", cn_ulog_callback);
+ if (r) {
+ kfree(prealloced_cn_msg);
+ return r;
+ }
+
+ return 0;
+}
+
+void dm_ulog_tfr_exit(void)
+{
+ cn_del_callback(&ulog_cn_id);
+ kfree(prealloced_cn_msg);
+}
diff --git a/drivers/md/dm-log-userspace-transfer.h b/drivers/md/dm-log-userspace-transfer.h
new file mode 100644
index 000000000..04ee874f9
--- /dev/null
+++ b/drivers/md/dm-log-userspace-transfer.h
@@ -0,0 +1,18 @@
+/*
+ * Copyright (C) 2006-2009 Red Hat, Inc.
+ *
+ * This file is released under the LGPL.
+ */
+
+#ifndef __DM_LOG_USERSPACE_TRANSFER_H__
+#define __DM_LOG_USERSPACE_TRANSFER_H__
+
+#define DM_MSG_PREFIX "dm-log-userspace"
+
+int dm_ulog_tfr_init(void);
+void dm_ulog_tfr_exit(void);
+int dm_consult_userspace(const char *uuid, uint64_t luid, int request_type,
+ char *data, size_t data_size,
+ char *rdata, size_t *rdata_size);
+
+#endif /* __DM_LOG_USERSPACE_TRANSFER_H__ */
diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c
new file mode 100644
index 000000000..93e08446a
--- /dev/null
+++ b/drivers/md/dm-log-writes.c
@@ -0,0 +1,825 @@
+/*
+ * Copyright (C) 2014 Facebook. All rights reserved.
+ *
+ * This file is released under the GPL.
+ */
+
+#include <linux/device-mapper.h>
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/blkdev.h>
+#include <linux/bio.h>
+#include <linux/slab.h>
+#include <linux/kthread.h>
+#include <linux/freezer.h>
+
+#define DM_MSG_PREFIX "log-writes"
+
+/*
+ * This target will sequentially log all writes to the target device onto the
+ * log device. This is helpful for replaying writes to check for fs consistency
+ * at all times. This target provides a mechanism to mark specific events to
+ * check data at a later time. So for example you would:
+ *
+ * write data
+ * fsync
+ * dmsetup message /dev/whatever mark mymark
+ * unmount /mnt/test
+ *
+ * Then replay the log up to mymark and check the contents of the replay to
+ * verify it matches what was written.
+ *
+ * We log writes only after they have been flushed, this makes the log describe
+ * close to the order in which the data hits the actual disk, not its cache. So
+ * for example the following sequence (W means write, C means complete)
+ *
+ * Wa,Wb,Wc,Cc,Ca,FLUSH,FUAd,Cb,CFLUSH,CFUAd
+ *
+ * Would result in the log looking like this:
+ *
+ * c,a,flush,fuad,b,<other writes>,<next flush>
+ *
+ * This is meant to help expose problems where file systems do not properly wait
+ * on data being written before invoking a FLUSH. FUA bypasses cache so once it
+ * completes it is added to the log as it should be on disk.
+ *
+ * We treat DISCARDs as if they don't bypass cache so that they are logged in
+ * order of completion along with the normal writes. If we didn't do it this
+ * way we would process all the discards first and then write all the data, when
+ * in fact we want to do the data and the discard in the order that they
+ * completed.
+ */
+#define LOG_FLUSH_FLAG (1 << 0)
+#define LOG_FUA_FLAG (1 << 1)
+#define LOG_DISCARD_FLAG (1 << 2)
+#define LOG_MARK_FLAG (1 << 3)
+
+#define WRITE_LOG_VERSION 1
+#define WRITE_LOG_MAGIC 0x6a736677736872
+
+/*
+ * The disk format for this is braindead simple.
+ *
+ * At byte 0 we have our super, followed by the following sequence for
+ * nr_entries:
+ *
+ * [ 1 sector ][ entry->nr_sectors ]
+ * [log_write_entry][ data written ]
+ *
+ * The log_write_entry takes up a full sector so we can have arbitrary length
+ * marks and it leaves us room for extra content in the future.
+ */
+
+/*
+ * Basic info about the log for userspace.
+ */
+struct log_write_super {
+ __le64 magic;
+ __le64 version;
+ __le64 nr_entries;
+ __le32 sectorsize;
+};
+
+/*
+ * sector - the sector we wrote.
+ * nr_sectors - the number of sectors we wrote.
+ * flags - flags for this log entry.
+ * data_len - the size of the data in this log entry, this is for private log
+ * entry stuff, the MARK data provided by userspace for example.
+ */
+struct log_write_entry {
+ __le64 sector;
+ __le64 nr_sectors;
+ __le64 flags;
+ __le64 data_len;
+};
+
+struct log_writes_c {
+ struct dm_dev *dev;
+ struct dm_dev *logdev;
+ u64 logged_entries;
+ u32 sectorsize;
+ atomic_t io_blocks;
+ atomic_t pending_blocks;
+ sector_t next_sector;
+ sector_t end_sector;
+ bool logging_enabled;
+ bool device_supports_discard;
+ spinlock_t blocks_lock;
+ struct list_head unflushed_blocks;
+ struct list_head logging_blocks;
+ wait_queue_head_t wait;
+ struct task_struct *log_kthread;
+};
+
+struct pending_block {
+ int vec_cnt;
+ u64 flags;
+ sector_t sector;
+ sector_t nr_sectors;
+ char *data;
+ u32 datalen;
+ struct list_head list;
+ struct bio_vec vecs[0];
+};
+
+struct per_bio_data {
+ struct pending_block *block;
+};
+
+static void put_pending_block(struct log_writes_c *lc)
+{
+ if (atomic_dec_and_test(&lc->pending_blocks)) {
+ smp_mb__after_atomic();
+ if (waitqueue_active(&lc->wait))
+ wake_up(&lc->wait);
+ }
+}
+
+static void put_io_block(struct log_writes_c *lc)
+{
+ if (atomic_dec_and_test(&lc->io_blocks)) {
+ smp_mb__after_atomic();
+ if (waitqueue_active(&lc->wait))
+ wake_up(&lc->wait);
+ }
+}
+
+static void log_end_io(struct bio *bio, int err)
+{
+ struct log_writes_c *lc = bio->bi_private;
+ struct bio_vec *bvec;
+ int i;
+
+ if (err) {
+ unsigned long flags;
+
+ DMERR("Error writing log block, error=%d", err);
+ spin_lock_irqsave(&lc->blocks_lock, flags);
+ lc->logging_enabled = false;
+ spin_unlock_irqrestore(&lc->blocks_lock, flags);
+ }
+
+ bio_for_each_segment_all(bvec, bio, i)
+ __free_page(bvec->bv_page);
+
+ put_io_block(lc);
+ bio_put(bio);
+}
+
+/*
+ * Meant to be called if there is an error, it will free all the pages
+ * associated with the block.
+ */
+static void free_pending_block(struct log_writes_c *lc,
+ struct pending_block *block)
+{
+ int i;
+
+ for (i = 0; i < block->vec_cnt; i++) {
+ if (block->vecs[i].bv_page)
+ __free_page(block->vecs[i].bv_page);
+ }
+ kfree(block->data);
+ kfree(block);
+ put_pending_block(lc);
+}
+
+static int write_metadata(struct log_writes_c *lc, void *entry,
+ size_t entrylen, void *data, size_t datalen,
+ sector_t sector)
+{
+ struct bio *bio;
+ struct page *page;
+ void *ptr;
+ size_t ret;
+
+ bio = bio_alloc(GFP_KERNEL, 1);
+ if (!bio) {
+ DMERR("Couldn't alloc log bio");
+ goto error;
+ }
+ bio->bi_iter.bi_size = 0;
+ bio->bi_iter.bi_sector = sector;
+ bio->bi_bdev = lc->logdev->bdev;
+ bio->bi_end_io = log_end_io;
+ bio->bi_private = lc;
+ set_bit(BIO_UPTODATE, &bio->bi_flags);
+
+ page = alloc_page(GFP_KERNEL);
+ if (!page) {
+ DMERR("Couldn't alloc log page");
+ bio_put(bio);
+ goto error;
+ }
+
+ ptr = kmap_atomic(page);
+ memcpy(ptr, entry, entrylen);
+ if (datalen)
+ memcpy(ptr + entrylen, data, datalen);
+ memset(ptr + entrylen + datalen, 0,
+ lc->sectorsize - entrylen - datalen);
+ kunmap_atomic(ptr);
+
+ ret = bio_add_page(bio, page, lc->sectorsize, 0);
+ if (ret != lc->sectorsize) {
+ DMERR("Couldn't add page to the log block");
+ goto error_bio;
+ }
+ submit_bio(WRITE, bio);
+ return 0;
+error_bio:
+ bio_put(bio);
+ __free_page(page);
+error:
+ put_io_block(lc);
+ return -1;
+}
+
+static int log_one_block(struct log_writes_c *lc,
+ struct pending_block *block, sector_t sector)
+{
+ struct bio *bio;
+ struct log_write_entry entry;
+ size_t ret;
+ int i;
+
+ entry.sector = cpu_to_le64(block->sector);
+ entry.nr_sectors = cpu_to_le64(block->nr_sectors);
+ entry.flags = cpu_to_le64(block->flags);
+ entry.data_len = cpu_to_le64(block->datalen);
+ if (write_metadata(lc, &entry, sizeof(entry), block->data,
+ block->datalen, sector)) {
+ free_pending_block(lc, block);
+ return -1;
+ }
+
+ if (!block->vec_cnt)
+ goto out;
+ sector++;
+
+ bio = bio_alloc(GFP_KERNEL, block->vec_cnt);
+ if (!bio) {
+ DMERR("Couldn't alloc log bio");
+ goto error;
+ }
+ atomic_inc(&lc->io_blocks);
+ bio->bi_iter.bi_size = 0;
+ bio->bi_iter.bi_sector = sector;
+ bio->bi_bdev = lc->logdev->bdev;
+ bio->bi_end_io = log_end_io;
+ bio->bi_private = lc;
+ set_bit(BIO_UPTODATE, &bio->bi_flags);
+
+ for (i = 0; i < block->vec_cnt; i++) {
+ /*
+ * The page offset is always 0 because we allocate a new page
+ * for every bvec in the original bio for simplicity sake.
+ */
+ ret = bio_add_page(bio, block->vecs[i].bv_page,
+ block->vecs[i].bv_len, 0);
+ if (ret != block->vecs[i].bv_len) {
+ atomic_inc(&lc->io_blocks);
+ submit_bio(WRITE, bio);
+ bio = bio_alloc(GFP_KERNEL, block->vec_cnt - i);
+ if (!bio) {
+ DMERR("Couldn't alloc log bio");
+ goto error;
+ }
+ bio->bi_iter.bi_size = 0;
+ bio->bi_iter.bi_sector = sector;
+ bio->bi_bdev = lc->logdev->bdev;
+ bio->bi_end_io = log_end_io;
+ bio->bi_private = lc;
+ set_bit(BIO_UPTODATE, &bio->bi_flags);
+
+ ret = bio_add_page(bio, block->vecs[i].bv_page,
+ block->vecs[i].bv_len, 0);
+ if (ret != block->vecs[i].bv_len) {
+ DMERR("Couldn't add page on new bio?");
+ bio_put(bio);
+ goto error;
+ }
+ }
+ sector += block->vecs[i].bv_len >> SECTOR_SHIFT;
+ }
+ submit_bio(WRITE, bio);
+out:
+ kfree(block->data);
+ kfree(block);
+ put_pending_block(lc);
+ return 0;
+error:
+ free_pending_block(lc, block);
+ put_io_block(lc);
+ return -1;
+}
+
+static int log_super(struct log_writes_c *lc)
+{
+ struct log_write_super super;
+
+ super.magic = cpu_to_le64(WRITE_LOG_MAGIC);
+ super.version = cpu_to_le64(WRITE_LOG_VERSION);
+ super.nr_entries = cpu_to_le64(lc->logged_entries);
+ super.sectorsize = cpu_to_le32(lc->sectorsize);
+
+ if (write_metadata(lc, &super, sizeof(super), NULL, 0, 0)) {
+ DMERR("Couldn't write super");
+ return -1;
+ }
+
+ return 0;
+}
+
+static inline sector_t logdev_last_sector(struct log_writes_c *lc)
+{
+ return i_size_read(lc->logdev->bdev->bd_inode) >> SECTOR_SHIFT;
+}
+
+static int log_writes_kthread(void *arg)
+{
+ struct log_writes_c *lc = (struct log_writes_c *)arg;
+ sector_t sector = 0;
+
+ while (!kthread_should_stop()) {
+ bool super = false;
+ bool logging_enabled;
+ struct pending_block *block = NULL;
+ int ret;
+
+ spin_lock_irq(&lc->blocks_lock);
+ if (!list_empty(&lc->logging_blocks)) {
+ block = list_first_entry(&lc->logging_blocks,
+ struct pending_block, list);
+ list_del_init(&block->list);
+ if (!lc->logging_enabled)
+ goto next;
+
+ sector = lc->next_sector;
+ if (block->flags & LOG_DISCARD_FLAG)
+ lc->next_sector++;
+ else
+ lc->next_sector += block->nr_sectors + 1;
+
+ /*
+ * Apparently the size of the device may not be known
+ * right away, so handle this properly.
+ */
+ if (!lc->end_sector)
+ lc->end_sector = logdev_last_sector(lc);
+ if (lc->end_sector &&
+ lc->next_sector >= lc->end_sector) {
+ DMERR("Ran out of space on the logdev");
+ lc->logging_enabled = false;
+ goto next;
+ }
+ lc->logged_entries++;
+ atomic_inc(&lc->io_blocks);
+
+ super = (block->flags & (LOG_FUA_FLAG | LOG_MARK_FLAG));
+ if (super)
+ atomic_inc(&lc->io_blocks);
+ }
+next:
+ logging_enabled = lc->logging_enabled;
+ spin_unlock_irq(&lc->blocks_lock);
+ if (block) {
+ if (logging_enabled) {
+ ret = log_one_block(lc, block, sector);
+ if (!ret && super)
+ ret = log_super(lc);
+ if (ret) {
+ spin_lock_irq(&lc->blocks_lock);
+ lc->logging_enabled = false;
+ spin_unlock_irq(&lc->blocks_lock);
+ }
+ } else
+ free_pending_block(lc, block);
+ continue;
+ }
+
+ if (!try_to_freeze()) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ if (!kthread_should_stop() &&
+ !atomic_read(&lc->pending_blocks))
+ schedule();
+ __set_current_state(TASK_RUNNING);
+ }
+ }
+ return 0;
+}
+
+/*
+ * Construct a log-writes mapping:
+ * log-writes <dev_path> <log_dev_path>
+ */
+static int log_writes_ctr(struct dm_target *ti, unsigned int argc, char **argv)
+{
+ struct log_writes_c *lc;
+ struct dm_arg_set as;
+ const char *devname, *logdevname;
+
+ as.argc = argc;
+ as.argv = argv;
+
+ if (argc < 2) {
+ ti->error = "Invalid argument count";
+ return -EINVAL;
+ }
+
+ lc = kzalloc(sizeof(struct log_writes_c), GFP_KERNEL);
+ if (!lc) {
+ ti->error = "Cannot allocate context";
+ return -ENOMEM;
+ }
+ spin_lock_init(&lc->blocks_lock);
+ INIT_LIST_HEAD(&lc->unflushed_blocks);
+ INIT_LIST_HEAD(&lc->logging_blocks);
+ init_waitqueue_head(&lc->wait);
+ lc->sectorsize = 1 << SECTOR_SHIFT;
+ atomic_set(&lc->io_blocks, 0);
+ atomic_set(&lc->pending_blocks, 0);
+
+ devname = dm_shift_arg(&as);
+ if (dm_get_device(ti, devname, dm_table_get_mode(ti->table), &lc->dev)) {
+ ti->error = "Device lookup failed";
+ goto bad;
+ }
+
+ logdevname = dm_shift_arg(&as);
+ if (dm_get_device(ti, logdevname, dm_table_get_mode(ti->table), &lc->logdev)) {
+ ti->error = "Log device lookup failed";
+ dm_put_device(ti, lc->dev);
+ goto bad;
+ }
+
+ lc->log_kthread = kthread_run(log_writes_kthread, lc, "log-write");
+ if (!lc->log_kthread) {
+ ti->error = "Couldn't alloc kthread";
+ dm_put_device(ti, lc->dev);
+ dm_put_device(ti, lc->logdev);
+ goto bad;
+ }
+
+ /* We put the super at sector 0, start logging at sector 1 */
+ lc->next_sector = 1;
+ lc->logging_enabled = true;
+ lc->end_sector = logdev_last_sector(lc);
+ lc->device_supports_discard = true;
+
+ ti->num_flush_bios = 1;
+ ti->flush_supported = true;
+ ti->num_discard_bios = 1;
+ ti->discards_supported = true;
+ ti->per_bio_data_size = sizeof(struct per_bio_data);
+ ti->private = lc;
+ return 0;
+
+bad:
+ kfree(lc);
+ return -EINVAL;
+}
+
+static int log_mark(struct log_writes_c *lc, char *data)
+{
+ struct pending_block *block;
+ size_t maxsize = lc->sectorsize - sizeof(struct log_write_entry);
+
+ block = kzalloc(sizeof(struct pending_block), GFP_KERNEL);
+ if (!block) {
+ DMERR("Error allocating pending block");
+ return -ENOMEM;
+ }
+
+ block->data = kstrndup(data, maxsize, GFP_KERNEL);
+ if (!block->data) {
+ DMERR("Error copying mark data");
+ kfree(block);
+ return -ENOMEM;
+ }
+ atomic_inc(&lc->pending_blocks);
+ block->datalen = strlen(block->data);
+ block->flags |= LOG_MARK_FLAG;
+ spin_lock_irq(&lc->blocks_lock);
+ list_add_tail(&block->list, &lc->logging_blocks);
+ spin_unlock_irq(&lc->blocks_lock);
+ wake_up_process(lc->log_kthread);
+ return 0;
+}
+
+static void log_writes_dtr(struct dm_target *ti)
+{
+ struct log_writes_c *lc = ti->private;
+
+ spin_lock_irq(&lc->blocks_lock);
+ list_splice_init(&lc->unflushed_blocks, &lc->logging_blocks);
+ spin_unlock_irq(&lc->blocks_lock);
+
+ /*
+ * This is just nice to have since it'll update the super to include the
+ * unflushed blocks, if it fails we don't really care.
+ */
+ log_mark(lc, "dm-log-writes-end");
+ wake_up_process(lc->log_kthread);
+ wait_event(lc->wait, !atomic_read(&lc->io_blocks) &&
+ !atomic_read(&lc->pending_blocks));
+ kthread_stop(lc->log_kthread);
+
+ WARN_ON(!list_empty(&lc->logging_blocks));
+ WARN_ON(!list_empty(&lc->unflushed_blocks));
+ dm_put_device(ti, lc->dev);
+ dm_put_device(ti, lc->logdev);
+ kfree(lc);
+}
+
+static void normal_map_bio(struct dm_target *ti, struct bio *bio)
+{
+ struct log_writes_c *lc = ti->private;
+
+ bio->bi_bdev = lc->dev->bdev;
+}
+
+static int log_writes_map(struct dm_target *ti, struct bio *bio)
+{
+ struct log_writes_c *lc = ti->private;
+ struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data));
+ struct pending_block *block;
+ struct bvec_iter iter;
+ struct bio_vec bv;
+ size_t alloc_size;
+ int i = 0;
+ bool flush_bio = (bio->bi_rw & REQ_FLUSH);
+ bool fua_bio = (bio->bi_rw & REQ_FUA);
+ bool discard_bio = (bio->bi_rw & REQ_DISCARD);
+
+ pb->block = NULL;
+
+ /* Don't bother doing anything if logging has been disabled */
+ if (!lc->logging_enabled)
+ goto map_bio;
+
+ /*
+ * Map reads as normal.
+ */
+ if (bio_data_dir(bio) == READ)
+ goto map_bio;
+
+ /* No sectors and not a flush? Don't care */
+ if (!bio_sectors(bio) && !flush_bio)
+ goto map_bio;
+
+ /*
+ * Discards will have bi_size set but there's no actual data, so just
+ * allocate the size of the pending block.
+ */
+ if (discard_bio)
+ alloc_size = sizeof(struct pending_block);
+ else
+ alloc_size = sizeof(struct pending_block) + sizeof(struct bio_vec) * bio_segments(bio);
+
+ block = kzalloc(alloc_size, GFP_NOIO);
+ if (!block) {
+ DMERR("Error allocating pending block");
+ spin_lock_irq(&lc->blocks_lock);
+ lc->logging_enabled = false;
+ spin_unlock_irq(&lc->blocks_lock);
+ return -ENOMEM;
+ }
+ INIT_LIST_HEAD(&block->list);
+ pb->block = block;
+ atomic_inc(&lc->pending_blocks);
+
+ if (flush_bio)
+ block->flags |= LOG_FLUSH_FLAG;
+ if (fua_bio)
+ block->flags |= LOG_FUA_FLAG;
+ if (discard_bio)
+ block->flags |= LOG_DISCARD_FLAG;
+
+ block->sector = bio->bi_iter.bi_sector;
+ block->nr_sectors = bio_sectors(bio);
+
+ /* We don't need the data, just submit */
+ if (discard_bio) {
+ WARN_ON(flush_bio || fua_bio);
+ if (lc->device_supports_discard)
+ goto map_bio;
+ bio_endio(bio, 0);
+ return DM_MAPIO_SUBMITTED;
+ }
+
+ /* Flush bio, splice the unflushed blocks onto this list and submit */
+ if (flush_bio && !bio_sectors(bio)) {
+ spin_lock_irq(&lc->blocks_lock);
+ list_splice_init(&lc->unflushed_blocks, &block->list);
+ spin_unlock_irq(&lc->blocks_lock);
+ goto map_bio;
+ }
+
+ /*
+ * We will write this bio somewhere else way later so we need to copy
+ * the actual contents into new pages so we know the data will always be
+ * there.
+ *
+ * We do this because this could be a bio from O_DIRECT in which case we
+ * can't just hold onto the page until some later point, we have to
+ * manually copy the contents.
+ */
+ bio_for_each_segment(bv, bio, iter) {
+ struct page *page;
+ void *src, *dst;
+
+ page = alloc_page(GFP_NOIO);
+ if (!page) {
+ DMERR("Error allocing page");
+ free_pending_block(lc, block);
+ spin_lock_irq(&lc->blocks_lock);
+ lc->logging_enabled = false;
+ spin_unlock_irq(&lc->blocks_lock);
+ return -ENOMEM;
+ }
+
+ src = kmap_atomic(bv.bv_page);
+ dst = kmap_atomic(page);
+ memcpy(dst, src + bv.bv_offset, bv.bv_len);
+ kunmap_atomic(dst);
+ kunmap_atomic(src);
+ block->vecs[i].bv_page = page;
+ block->vecs[i].bv_len = bv.bv_len;
+ block->vec_cnt++;
+ i++;
+ }
+
+ /* Had a flush with data in it, weird */
+ if (flush_bio) {
+ spin_lock_irq(&lc->blocks_lock);
+ list_splice_init(&lc->unflushed_blocks, &block->list);
+ spin_unlock_irq(&lc->blocks_lock);
+ }
+map_bio:
+ normal_map_bio(ti, bio);
+ return DM_MAPIO_REMAPPED;
+}
+
+static int normal_end_io(struct dm_target *ti, struct bio *bio, int error)
+{
+ struct log_writes_c *lc = ti->private;
+ struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data));
+
+ if (bio_data_dir(bio) == WRITE && pb->block) {
+ struct pending_block *block = pb->block;
+ unsigned long flags;
+
+ spin_lock_irqsave(&lc->blocks_lock, flags);
+ if (block->flags & LOG_FLUSH_FLAG) {
+ list_splice_tail_init(&block->list, &lc->logging_blocks);
+ list_add_tail(&block->list, &lc->logging_blocks);
+ wake_up_process(lc->log_kthread);
+ } else if (block->flags & LOG_FUA_FLAG) {
+ list_add_tail(&block->list, &lc->logging_blocks);
+ wake_up_process(lc->log_kthread);
+ } else
+ list_add_tail(&block->list, &lc->unflushed_blocks);
+ spin_unlock_irqrestore(&lc->blocks_lock, flags);
+ }
+
+ return error;
+}
+
+/*
+ * INFO format: <logged entries> <highest allocated sector>
+ */
+static void log_writes_status(struct dm_target *ti, status_type_t type,
+ unsigned status_flags, char *result,
+ unsigned maxlen)
+{
+ unsigned sz = 0;
+ struct log_writes_c *lc = ti->private;
+
+ switch (type) {
+ case STATUSTYPE_INFO:
+ DMEMIT("%llu %llu", lc->logged_entries,
+ (unsigned long long)lc->next_sector - 1);
+ if (!lc->logging_enabled)
+ DMEMIT(" logging_disabled");
+ break;
+
+ case STATUSTYPE_TABLE:
+ DMEMIT("%s %s", lc->dev->name, lc->logdev->name);
+ break;
+ }
+}
+
+static int log_writes_ioctl(struct dm_target *ti, unsigned int cmd,
+ unsigned long arg)
+{
+ struct log_writes_c *lc = ti->private;
+ struct dm_dev *dev = lc->dev;
+ int r = 0;
+
+ /*
+ * Only pass ioctls through if the device sizes match exactly.
+ */
+ if (ti->len != i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT)
+ r = scsi_verify_blk_ioctl(NULL, cmd);
+
+ return r ? : __blkdev_driver_ioctl(dev->bdev, dev->mode, cmd, arg);
+}
+
+static int log_writes_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
+ struct bio_vec *biovec, int max_size)
+{
+ struct log_writes_c *lc = ti->private;
+ struct request_queue *q = bdev_get_queue(lc->dev->bdev);
+
+ if (!q->merge_bvec_fn)
+ return max_size;
+
+ bvm->bi_bdev = lc->dev->bdev;
+ bvm->bi_sector = dm_target_offset(ti, bvm->bi_sector);
+
+ return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
+}
+
+static int log_writes_iterate_devices(struct dm_target *ti,
+ iterate_devices_callout_fn fn,
+ void *data)
+{
+ struct log_writes_c *lc = ti->private;
+
+ return fn(ti, lc->dev, 0, ti->len, data);
+}
+
+/*
+ * Messages supported:
+ * mark <mark data> - specify the marked data.
+ */
+static int log_writes_message(struct dm_target *ti, unsigned argc, char **argv)
+{
+ int r = -EINVAL;
+ struct log_writes_c *lc = ti->private;
+
+ if (argc != 2) {
+ DMWARN("Invalid log-writes message arguments, expect 2 arguments, got %d", argc);
+ return r;
+ }
+
+ if (!strcasecmp(argv[0], "mark"))
+ r = log_mark(lc, argv[1]);
+ else
+ DMWARN("Unrecognised log writes target message received: %s", argv[0]);
+
+ return r;
+}
+
+static void log_writes_io_hints(struct dm_target *ti, struct queue_limits *limits)
+{
+ struct log_writes_c *lc = ti->private;
+ struct request_queue *q = bdev_get_queue(lc->dev->bdev);
+
+ if (!q || !blk_queue_discard(q)) {
+ lc->device_supports_discard = false;
+ limits->discard_granularity = 1 << SECTOR_SHIFT;
+ limits->max_discard_sectors = (UINT_MAX >> SECTOR_SHIFT);
+ }
+}
+
+static struct target_type log_writes_target = {
+ .name = "log-writes",
+ .version = {1, 0, 0},
+ .module = THIS_MODULE,
+ .ctr = log_writes_ctr,
+ .dtr = log_writes_dtr,
+ .map = log_writes_map,
+ .end_io = normal_end_io,
+ .status = log_writes_status,
+ .ioctl = log_writes_ioctl,
+ .merge = log_writes_merge,
+ .message = log_writes_message,
+ .iterate_devices = log_writes_iterate_devices,
+ .io_hints = log_writes_io_hints,
+};
+
+static int __init dm_log_writes_init(void)
+{
+ int r = dm_register_target(&log_writes_target);
+
+ if (r < 0)
+ DMERR("register failed %d", r);
+
+ return r;
+}
+
+static void __exit dm_log_writes_exit(void)
+{
+ dm_unregister_target(&log_writes_target);
+}
+
+module_init(dm_log_writes_init);
+module_exit(dm_log_writes_exit);
+
+MODULE_DESCRIPTION(DM_NAME " log writes target");
+MODULE_AUTHOR("Josef Bacik <jbacik@fb.com>");
+MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c
new file mode 100644
index 000000000..627d19186
--- /dev/null
+++ b/drivers/md/dm-log.c
@@ -0,0 +1,888 @@
+/*
+ * Copyright (C) 2003 Sistina Software
+ * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
+ *
+ * This file is released under the LGPL.
+ */
+
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/vmalloc.h>
+#include <linux/dm-io.h>
+#include <linux/dm-dirty-log.h>
+
+#include <linux/device-mapper.h>
+
+#define DM_MSG_PREFIX "dirty region log"
+
+static LIST_HEAD(_log_types);
+static DEFINE_SPINLOCK(_lock);
+
+static struct dm_dirty_log_type *__find_dirty_log_type(const char *name)
+{
+ struct dm_dirty_log_type *log_type;
+
+ list_for_each_entry(log_type, &_log_types, list)
+ if (!strcmp(name, log_type->name))
+ return log_type;
+
+ return NULL;
+}
+
+static struct dm_dirty_log_type *_get_dirty_log_type(const char *name)
+{
+ struct dm_dirty_log_type *log_type;
+
+ spin_lock(&_lock);
+
+ log_type = __find_dirty_log_type(name);
+ if (log_type && !try_module_get(log_type->module))
+ log_type = NULL;
+
+ spin_unlock(&_lock);
+
+ return log_type;
+}
+
+/*
+ * get_type
+ * @type_name
+ *
+ * Attempt to retrieve the dm_dirty_log_type by name. If not already
+ * available, attempt to load the appropriate module.
+ *
+ * Log modules are named "dm-log-" followed by the 'type_name'.
+ * Modules may contain multiple types.
+ * This function will first try the module "dm-log-<type_name>",
+ * then truncate 'type_name' on the last '-' and try again.
+ *
+ * For example, if type_name was "clustered-disk", it would search
+ * 'dm-log-clustered-disk' then 'dm-log-clustered'.
+ *
+ * Returns: dirty_log_type* on success, NULL on failure
+ */
+static struct dm_dirty_log_type *get_type(const char *type_name)
+{
+ char *p, *type_name_dup;
+ struct dm_dirty_log_type *log_type;
+
+ if (!type_name)
+ return NULL;
+
+ log_type = _get_dirty_log_type(type_name);
+ if (log_type)
+ return log_type;
+
+ type_name_dup = kstrdup(type_name, GFP_KERNEL);
+ if (!type_name_dup) {
+ DMWARN("No memory left to attempt log module load for \"%s\"",
+ type_name);
+ return NULL;
+ }
+
+ while (request_module("dm-log-%s", type_name_dup) ||
+ !(log_type = _get_dirty_log_type(type_name))) {
+ p = strrchr(type_name_dup, '-');
+ if (!p)
+ break;
+ p[0] = '\0';
+ }
+
+ if (!log_type)
+ DMWARN("Module for logging type \"%s\" not found.", type_name);
+
+ kfree(type_name_dup);
+
+ return log_type;
+}
+
+static void put_type(struct dm_dirty_log_type *type)
+{
+ if (!type)
+ return;
+
+ spin_lock(&_lock);
+ if (!__find_dirty_log_type(type->name))
+ goto out;
+
+ module_put(type->module);
+
+out:
+ spin_unlock(&_lock);
+}
+
+int dm_dirty_log_type_register(struct dm_dirty_log_type *type)
+{
+ int r = 0;
+
+ spin_lock(&_lock);
+ if (!__find_dirty_log_type(type->name))
+ list_add(&type->list, &_log_types);
+ else
+ r = -EEXIST;
+ spin_unlock(&_lock);
+
+ return r;
+}
+EXPORT_SYMBOL(dm_dirty_log_type_register);
+
+int dm_dirty_log_type_unregister(struct dm_dirty_log_type *type)
+{
+ spin_lock(&_lock);
+
+ if (!__find_dirty_log_type(type->name)) {
+ spin_unlock(&_lock);
+ return -EINVAL;
+ }
+
+ list_del(&type->list);
+
+ spin_unlock(&_lock);
+
+ return 0;
+}
+EXPORT_SYMBOL(dm_dirty_log_type_unregister);
+
+struct dm_dirty_log *dm_dirty_log_create(const char *type_name,
+ struct dm_target *ti,
+ int (*flush_callback_fn)(struct dm_target *ti),
+ unsigned int argc, char **argv)
+{
+ struct dm_dirty_log_type *type;
+ struct dm_dirty_log *log;
+
+ log = kmalloc(sizeof(*log), GFP_KERNEL);
+ if (!log)
+ return NULL;
+
+ type = get_type(type_name);
+ if (!type) {
+ kfree(log);
+ return NULL;
+ }
+
+ log->flush_callback_fn = flush_callback_fn;
+ log->type = type;
+ if (type->ctr(log, ti, argc, argv)) {
+ kfree(log);
+ put_type(type);
+ return NULL;
+ }
+
+ return log;
+}
+EXPORT_SYMBOL(dm_dirty_log_create);
+
+void dm_dirty_log_destroy(struct dm_dirty_log *log)
+{
+ log->type->dtr(log);
+ put_type(log->type);
+ kfree(log);
+}
+EXPORT_SYMBOL(dm_dirty_log_destroy);
+
+/*-----------------------------------------------------------------
+ * Persistent and core logs share a lot of their implementation.
+ * FIXME: need a reload method to be called from a resume
+ *---------------------------------------------------------------*/
+/*
+ * Magic for persistent mirrors: "MiRr"
+ */
+#define MIRROR_MAGIC 0x4D695272
+
+/*
+ * The on-disk version of the metadata.
+ */
+#define MIRROR_DISK_VERSION 2
+#define LOG_OFFSET 2
+
+struct log_header_disk {
+ __le32 magic;
+
+ /*
+ * Simple, incrementing version. no backward
+ * compatibility.
+ */
+ __le32 version;
+ __le64 nr_regions;
+} __packed;
+
+struct log_header_core {
+ uint32_t magic;
+ uint32_t version;
+ uint64_t nr_regions;
+};
+
+struct log_c {
+ struct dm_target *ti;
+ int touched_dirtied;
+ int touched_cleaned;
+ int flush_failed;
+ uint32_t region_size;
+ unsigned int region_count;
+ region_t sync_count;
+
+ unsigned bitset_uint32_count;
+ uint32_t *clean_bits;
+ uint32_t *sync_bits;
+ uint32_t *recovering_bits; /* FIXME: this seems excessive */
+
+ int sync_search;
+
+ /* Resync flag */
+ enum sync {
+ DEFAULTSYNC, /* Synchronize if necessary */
+ NOSYNC, /* Devices known to be already in sync */
+ FORCESYNC, /* Force a sync to happen */
+ } sync;
+
+ struct dm_io_request io_req;
+
+ /*
+ * Disk log fields
+ */
+ int log_dev_failed;
+ int log_dev_flush_failed;
+ struct dm_dev *log_dev;
+ struct log_header_core header;
+
+ struct dm_io_region header_location;
+ struct log_header_disk *disk_header;
+};
+
+/*
+ * The touched member needs to be updated every time we access
+ * one of the bitsets.
+ */
+static inline int log_test_bit(uint32_t *bs, unsigned bit)
+{
+ return test_bit_le(bit, bs) ? 1 : 0;
+}
+
+static inline void log_set_bit(struct log_c *l,
+ uint32_t *bs, unsigned bit)
+{
+ __set_bit_le(bit, bs);
+ l->touched_cleaned = 1;
+}
+
+static inline void log_clear_bit(struct log_c *l,
+ uint32_t *bs, unsigned bit)
+{
+ __clear_bit_le(bit, bs);
+ l->touched_dirtied = 1;
+}
+
+/*----------------------------------------------------------------
+ * Header IO
+ *--------------------------------------------------------------*/
+static void header_to_disk(struct log_header_core *core, struct log_header_disk *disk)
+{
+ disk->magic = cpu_to_le32(core->magic);
+ disk->version = cpu_to_le32(core->version);
+ disk->nr_regions = cpu_to_le64(core->nr_regions);
+}
+
+static void header_from_disk(struct log_header_core *core, struct log_header_disk *disk)
+{
+ core->magic = le32_to_cpu(disk->magic);
+ core->version = le32_to_cpu(disk->version);
+ core->nr_regions = le64_to_cpu(disk->nr_regions);
+}
+
+static int rw_header(struct log_c *lc, int rw)
+{
+ lc->io_req.bi_rw = rw;
+
+ return dm_io(&lc->io_req, 1, &lc->header_location, NULL);
+}
+
+static int flush_header(struct log_c *lc)
+{
+ struct dm_io_region null_location = {
+ .bdev = lc->header_location.bdev,
+ .sector = 0,
+ .count = 0,
+ };
+
+ lc->io_req.bi_rw = WRITE_FLUSH;
+
+ return dm_io(&lc->io_req, 1, &null_location, NULL);
+}
+
+static int read_header(struct log_c *log)
+{
+ int r;
+
+ r = rw_header(log, READ);
+ if (r)
+ return r;
+
+ header_from_disk(&log->header, log->disk_header);
+
+ /* New log required? */
+ if (log->sync != DEFAULTSYNC || log->header.magic != MIRROR_MAGIC) {
+ log->header.magic = MIRROR_MAGIC;
+ log->header.version = MIRROR_DISK_VERSION;
+ log->header.nr_regions = 0;
+ }
+
+#ifdef __LITTLE_ENDIAN
+ if (log->header.version == 1)
+ log->header.version = 2;
+#endif
+
+ if (log->header.version != MIRROR_DISK_VERSION) {
+ DMWARN("incompatible disk log version");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int _check_region_size(struct dm_target *ti, uint32_t region_size)
+{
+ if (region_size < 2 || region_size > ti->len)
+ return 0;
+
+ if (!is_power_of_2(region_size))
+ return 0;
+
+ return 1;
+}
+
+/*----------------------------------------------------------------
+ * core log constructor/destructor
+ *
+ * argv contains region_size followed optionally by [no]sync
+ *--------------------------------------------------------------*/
+#define BYTE_SHIFT 3
+static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti,
+ unsigned int argc, char **argv,
+ struct dm_dev *dev)
+{
+ enum sync sync = DEFAULTSYNC;
+
+ struct log_c *lc;
+ uint32_t region_size;
+ unsigned int region_count;
+ size_t bitset_size, buf_size;
+ int r;
+ char dummy;
+
+ if (argc < 1 || argc > 2) {
+ DMWARN("wrong number of arguments to dirty region log");
+ return -EINVAL;
+ }
+
+ if (argc > 1) {
+ if (!strcmp(argv[1], "sync"))
+ sync = FORCESYNC;
+ else if (!strcmp(argv[1], "nosync"))
+ sync = NOSYNC;
+ else {
+ DMWARN("unrecognised sync argument to "
+ "dirty region log: %s", argv[1]);
+ return -EINVAL;
+ }
+ }
+
+ if (sscanf(argv[0], "%u%c", &region_size, &dummy) != 1 ||
+ !_check_region_size(ti, region_size)) {
+ DMWARN("invalid region size %s", argv[0]);
+ return -EINVAL;
+ }
+
+ region_count = dm_sector_div_up(ti->len, region_size);
+
+ lc = kmalloc(sizeof(*lc), GFP_KERNEL);
+ if (!lc) {
+ DMWARN("couldn't allocate core log");
+ return -ENOMEM;
+ }
+
+ lc->ti = ti;
+ lc->touched_dirtied = 0;
+ lc->touched_cleaned = 0;
+ lc->flush_failed = 0;
+ lc->region_size = region_size;
+ lc->region_count = region_count;
+ lc->sync = sync;
+
+ /*
+ * Work out how many "unsigned long"s we need to hold the bitset.
+ */
+ bitset_size = dm_round_up(region_count,
+ sizeof(*lc->clean_bits) << BYTE_SHIFT);
+ bitset_size >>= BYTE_SHIFT;
+
+ lc->bitset_uint32_count = bitset_size / sizeof(*lc->clean_bits);
+
+ /*
+ * Disk log?
+ */
+ if (!dev) {
+ lc->clean_bits = vmalloc(bitset_size);
+ if (!lc->clean_bits) {
+ DMWARN("couldn't allocate clean bitset");
+ kfree(lc);
+ return -ENOMEM;
+ }
+ lc->disk_header = NULL;
+ } else {
+ lc->log_dev = dev;
+ lc->log_dev_failed = 0;
+ lc->log_dev_flush_failed = 0;
+ lc->header_location.bdev = lc->log_dev->bdev;
+ lc->header_location.sector = 0;
+
+ /*
+ * Buffer holds both header and bitset.
+ */
+ buf_size =
+ dm_round_up((LOG_OFFSET << SECTOR_SHIFT) + bitset_size,
+ bdev_logical_block_size(lc->header_location.
+ bdev));
+
+ if (buf_size > i_size_read(dev->bdev->bd_inode)) {
+ DMWARN("log device %s too small: need %llu bytes",
+ dev->name, (unsigned long long)buf_size);
+ kfree(lc);
+ return -EINVAL;
+ }
+
+ lc->header_location.count = buf_size >> SECTOR_SHIFT;
+
+ lc->io_req.mem.type = DM_IO_VMA;
+ lc->io_req.notify.fn = NULL;
+ lc->io_req.client = dm_io_client_create();
+ if (IS_ERR(lc->io_req.client)) {
+ r = PTR_ERR(lc->io_req.client);
+ DMWARN("couldn't allocate disk io client");
+ kfree(lc);
+ return r;
+ }
+
+ lc->disk_header = vmalloc(buf_size);
+ if (!lc->disk_header) {
+ DMWARN("couldn't allocate disk log buffer");
+ dm_io_client_destroy(lc->io_req.client);
+ kfree(lc);
+ return -ENOMEM;
+ }
+
+ lc->io_req.mem.ptr.vma = lc->disk_header;
+ lc->clean_bits = (void *)lc->disk_header +
+ (LOG_OFFSET << SECTOR_SHIFT);
+ }
+
+ memset(lc->clean_bits, -1, bitset_size);
+
+ lc->sync_bits = vmalloc(bitset_size);
+ if (!lc->sync_bits) {
+ DMWARN("couldn't allocate sync bitset");
+ if (!dev)
+ vfree(lc->clean_bits);
+ else
+ dm_io_client_destroy(lc->io_req.client);
+ vfree(lc->disk_header);
+ kfree(lc);
+ return -ENOMEM;
+ }
+ memset(lc->sync_bits, (sync == NOSYNC) ? -1 : 0, bitset_size);
+ lc->sync_count = (sync == NOSYNC) ? region_count : 0;
+
+ lc->recovering_bits = vzalloc(bitset_size);
+ if (!lc->recovering_bits) {
+ DMWARN("couldn't allocate sync bitset");
+ vfree(lc->sync_bits);
+ if (!dev)
+ vfree(lc->clean_bits);
+ else
+ dm_io_client_destroy(lc->io_req.client);
+ vfree(lc->disk_header);
+ kfree(lc);
+ return -ENOMEM;
+ }
+ lc->sync_search = 0;
+ log->context = lc;
+
+ return 0;
+}
+
+static int core_ctr(struct dm_dirty_log *log, struct dm_target *ti,
+ unsigned int argc, char **argv)
+{
+ return create_log_context(log, ti, argc, argv, NULL);
+}
+
+static void destroy_log_context(struct log_c *lc)
+{
+ vfree(lc->sync_bits);
+ vfree(lc->recovering_bits);
+ kfree(lc);
+}
+
+static void core_dtr(struct dm_dirty_log *log)
+{
+ struct log_c *lc = (struct log_c *) log->context;
+
+ vfree(lc->clean_bits);
+ destroy_log_context(lc);
+}
+
+/*----------------------------------------------------------------
+ * disk log constructor/destructor
+ *
+ * argv contains log_device region_size followed optionally by [no]sync
+ *--------------------------------------------------------------*/
+static int disk_ctr(struct dm_dirty_log *log, struct dm_target *ti,
+ unsigned int argc, char **argv)
+{
+ int r;
+ struct dm_dev *dev;
+
+ if (argc < 2 || argc > 3) {
+ DMWARN("wrong number of arguments to disk dirty region log");
+ return -EINVAL;
+ }
+
+ r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &dev);
+ if (r)
+ return r;
+
+ r = create_log_context(log, ti, argc - 1, argv + 1, dev);
+ if (r) {
+ dm_put_device(ti, dev);
+ return r;
+ }
+
+ return 0;
+}
+
+static void disk_dtr(struct dm_dirty_log *log)
+{
+ struct log_c *lc = (struct log_c *) log->context;
+
+ dm_put_device(lc->ti, lc->log_dev);
+ vfree(lc->disk_header);
+ dm_io_client_destroy(lc->io_req.client);
+ destroy_log_context(lc);
+}
+
+static void fail_log_device(struct log_c *lc)
+{
+ if (lc->log_dev_failed)
+ return;
+
+ lc->log_dev_failed = 1;
+ dm_table_event(lc->ti->table);
+}
+
+static int disk_resume(struct dm_dirty_log *log)
+{
+ int r;
+ unsigned i;
+ struct log_c *lc = (struct log_c *) log->context;
+ size_t size = lc->bitset_uint32_count * sizeof(uint32_t);
+
+ /* read the disk header */
+ r = read_header(lc);
+ if (r) {
+ DMWARN("%s: Failed to read header on dirty region log device",
+ lc->log_dev->name);
+ fail_log_device(lc);
+ /*
+ * If the log device cannot be read, we must assume
+ * all regions are out-of-sync. If we simply return
+ * here, the state will be uninitialized and could
+ * lead us to return 'in-sync' status for regions
+ * that are actually 'out-of-sync'.
+ */
+ lc->header.nr_regions = 0;
+ }
+
+ /* set or clear any new bits -- device has grown */
+ if (lc->sync == NOSYNC)
+ for (i = lc->header.nr_regions; i < lc->region_count; i++)
+ /* FIXME: amazingly inefficient */
+ log_set_bit(lc, lc->clean_bits, i);
+ else
+ for (i = lc->header.nr_regions; i < lc->region_count; i++)
+ /* FIXME: amazingly inefficient */
+ log_clear_bit(lc, lc->clean_bits, i);
+
+ /* clear any old bits -- device has shrunk */
+ for (i = lc->region_count; i % (sizeof(*lc->clean_bits) << BYTE_SHIFT); i++)
+ log_clear_bit(lc, lc->clean_bits, i);
+
+ /* copy clean across to sync */
+ memcpy(lc->sync_bits, lc->clean_bits, size);
+ lc->sync_count = memweight(lc->clean_bits,
+ lc->bitset_uint32_count * sizeof(uint32_t));
+ lc->sync_search = 0;
+
+ /* set the correct number of regions in the header */
+ lc->header.nr_regions = lc->region_count;
+
+ header_to_disk(&lc->header, lc->disk_header);
+
+ /* write the new header */
+ r = rw_header(lc, WRITE);
+ if (!r) {
+ r = flush_header(lc);
+ if (r)
+ lc->log_dev_flush_failed = 1;
+ }
+ if (r) {
+ DMWARN("%s: Failed to write header on dirty region log device",
+ lc->log_dev->name);
+ fail_log_device(lc);
+ }
+
+ return r;
+}
+
+static uint32_t core_get_region_size(struct dm_dirty_log *log)
+{
+ struct log_c *lc = (struct log_c *) log->context;
+ return lc->region_size;
+}
+
+static int core_resume(struct dm_dirty_log *log)
+{
+ struct log_c *lc = (struct log_c *) log->context;
+ lc->sync_search = 0;
+ return 0;
+}
+
+static int core_is_clean(struct dm_dirty_log *log, region_t region)
+{
+ struct log_c *lc = (struct log_c *) log->context;
+ return log_test_bit(lc->clean_bits, region);
+}
+
+static int core_in_sync(struct dm_dirty_log *log, region_t region, int block)
+{
+ struct log_c *lc = (struct log_c *) log->context;
+ return log_test_bit(lc->sync_bits, region);
+}
+
+static int core_flush(struct dm_dirty_log *log)
+{
+ /* no op */
+ return 0;
+}
+
+static int disk_flush(struct dm_dirty_log *log)
+{
+ int r, i;
+ struct log_c *lc = log->context;
+
+ /* only write if the log has changed */
+ if (!lc->touched_cleaned && !lc->touched_dirtied)
+ return 0;
+
+ if (lc->touched_cleaned && log->flush_callback_fn &&
+ log->flush_callback_fn(lc->ti)) {
+ /*
+ * At this point it is impossible to determine which
+ * regions are clean and which are dirty (without
+ * re-reading the log off disk). So mark all of them
+ * dirty.
+ */
+ lc->flush_failed = 1;
+ for (i = 0; i < lc->region_count; i++)
+ log_clear_bit(lc, lc->clean_bits, i);
+ }
+
+ r = rw_header(lc, WRITE);
+ if (r)
+ fail_log_device(lc);
+ else {
+ if (lc->touched_dirtied) {
+ r = flush_header(lc);
+ if (r) {
+ lc->log_dev_flush_failed = 1;
+ fail_log_device(lc);
+ } else
+ lc->touched_dirtied = 0;
+ }
+ lc->touched_cleaned = 0;
+ }
+
+ return r;
+}
+
+static void core_mark_region(struct dm_dirty_log *log, region_t region)
+{
+ struct log_c *lc = (struct log_c *) log->context;
+ log_clear_bit(lc, lc->clean_bits, region);
+}
+
+static void core_clear_region(struct dm_dirty_log *log, region_t region)
+{
+ struct log_c *lc = (struct log_c *) log->context;
+ if (likely(!lc->flush_failed))
+ log_set_bit(lc, lc->clean_bits, region);
+}
+
+static int core_get_resync_work(struct dm_dirty_log *log, region_t *region)
+{
+ struct log_c *lc = (struct log_c *) log->context;
+
+ if (lc->sync_search >= lc->region_count)
+ return 0;
+
+ do {
+ *region = find_next_zero_bit_le(lc->sync_bits,
+ lc->region_count,
+ lc->sync_search);
+ lc->sync_search = *region + 1;
+
+ if (*region >= lc->region_count)
+ return 0;
+
+ } while (log_test_bit(lc->recovering_bits, *region));
+
+ log_set_bit(lc, lc->recovering_bits, *region);
+ return 1;
+}
+
+static void core_set_region_sync(struct dm_dirty_log *log, region_t region,
+ int in_sync)
+{
+ struct log_c *lc = (struct log_c *) log->context;
+
+ log_clear_bit(lc, lc->recovering_bits, region);
+ if (in_sync) {
+ log_set_bit(lc, lc->sync_bits, region);
+ lc->sync_count++;
+ } else if (log_test_bit(lc->sync_bits, region)) {
+ lc->sync_count--;
+ log_clear_bit(lc, lc->sync_bits, region);
+ }
+}
+
+static region_t core_get_sync_count(struct dm_dirty_log *log)
+{
+ struct log_c *lc = (struct log_c *) log->context;
+
+ return lc->sync_count;
+}
+
+#define DMEMIT_SYNC \
+ if (lc->sync != DEFAULTSYNC) \
+ DMEMIT("%ssync ", lc->sync == NOSYNC ? "no" : "")
+
+static int core_status(struct dm_dirty_log *log, status_type_t status,
+ char *result, unsigned int maxlen)
+{
+ int sz = 0;
+ struct log_c *lc = log->context;
+
+ switch(status) {
+ case STATUSTYPE_INFO:
+ DMEMIT("1 %s", log->type->name);
+ break;
+
+ case STATUSTYPE_TABLE:
+ DMEMIT("%s %u %u ", log->type->name,
+ lc->sync == DEFAULTSYNC ? 1 : 2, lc->region_size);
+ DMEMIT_SYNC;
+ }
+
+ return sz;
+}
+
+static int disk_status(struct dm_dirty_log *log, status_type_t status,
+ char *result, unsigned int maxlen)
+{
+ int sz = 0;
+ struct log_c *lc = log->context;
+
+ switch(status) {
+ case STATUSTYPE_INFO:
+ DMEMIT("3 %s %s %c", log->type->name, lc->log_dev->name,
+ lc->log_dev_flush_failed ? 'F' :
+ lc->log_dev_failed ? 'D' :
+ 'A');
+ break;
+
+ case STATUSTYPE_TABLE:
+ DMEMIT("%s %u %s %u ", log->type->name,
+ lc->sync == DEFAULTSYNC ? 2 : 3, lc->log_dev->name,
+ lc->region_size);
+ DMEMIT_SYNC;
+ }
+
+ return sz;
+}
+
+static struct dm_dirty_log_type _core_type = {
+ .name = "core",
+ .module = THIS_MODULE,
+ .ctr = core_ctr,
+ .dtr = core_dtr,
+ .resume = core_resume,
+ .get_region_size = core_get_region_size,
+ .is_clean = core_is_clean,
+ .in_sync = core_in_sync,
+ .flush = core_flush,
+ .mark_region = core_mark_region,
+ .clear_region = core_clear_region,
+ .get_resync_work = core_get_resync_work,
+ .set_region_sync = core_set_region_sync,
+ .get_sync_count = core_get_sync_count,
+ .status = core_status,
+};
+
+static struct dm_dirty_log_type _disk_type = {
+ .name = "disk",
+ .module = THIS_MODULE,
+ .ctr = disk_ctr,
+ .dtr = disk_dtr,
+ .postsuspend = disk_flush,
+ .resume = disk_resume,
+ .get_region_size = core_get_region_size,
+ .is_clean = core_is_clean,
+ .in_sync = core_in_sync,
+ .flush = disk_flush,
+ .mark_region = core_mark_region,
+ .clear_region = core_clear_region,
+ .get_resync_work = core_get_resync_work,
+ .set_region_sync = core_set_region_sync,
+ .get_sync_count = core_get_sync_count,
+ .status = disk_status,
+};
+
+static int __init dm_dirty_log_init(void)
+{
+ int r;
+
+ r = dm_dirty_log_type_register(&_core_type);
+ if (r)
+ DMWARN("couldn't register core log");
+
+ r = dm_dirty_log_type_register(&_disk_type);
+ if (r) {
+ DMWARN("couldn't register disk type");
+ dm_dirty_log_type_unregister(&_core_type);
+ }
+
+ return r;
+}
+
+static void __exit dm_dirty_log_exit(void)
+{
+ dm_dirty_log_type_unregister(&_disk_type);
+ dm_dirty_log_type_unregister(&_core_type);
+}
+
+module_init(dm_dirty_log_init);
+module_exit(dm_dirty_log_exit);
+
+MODULE_DESCRIPTION(DM_NAME " dirty region log");
+MODULE_AUTHOR("Joe Thornber, Heinz Mauelshagen <dm-devel@redhat.com>");
+MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
new file mode 100644
index 000000000..eff7bdd77
--- /dev/null
+++ b/drivers/md/dm-mpath.c
@@ -0,0 +1,1793 @@
+/*
+ * Copyright (C) 2003 Sistina Software Limited.
+ * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
+ *
+ * This file is released under the GPL.
+ */
+
+#include <linux/device-mapper.h>
+
+#include "dm.h"
+#include "dm-path-selector.h"
+#include "dm-uevent.h"
+
+#include <linux/blkdev.h>
+#include <linux/ctype.h>
+#include <linux/init.h>
+#include <linux/mempool.h>
+#include <linux/module.h>
+#include <linux/pagemap.h>
+#include <linux/slab.h>
+#include <linux/time.h>
+#include <linux/workqueue.h>
+#include <linux/delay.h>
+#include <scsi/scsi_dh.h>
+#include <linux/atomic.h>
+
+#define DM_MSG_PREFIX "multipath"
+#define DM_PG_INIT_DELAY_MSECS 2000
+#define DM_PG_INIT_DELAY_DEFAULT ((unsigned) -1)
+
+/* Path properties */
+struct pgpath {
+ struct list_head list;
+
+ struct priority_group *pg; /* Owning PG */
+ unsigned is_active; /* Path status */
+ unsigned fail_count; /* Cumulative failure count */
+
+ struct dm_path path;
+ struct delayed_work activate_path;
+};
+
+#define path_to_pgpath(__pgp) container_of((__pgp), struct pgpath, path)
+
+/*
+ * Paths are grouped into Priority Groups and numbered from 1 upwards.
+ * Each has a path selector which controls which path gets used.
+ */
+struct priority_group {
+ struct list_head list;
+
+ struct multipath *m; /* Owning multipath instance */
+ struct path_selector ps;
+
+ unsigned pg_num; /* Reference number */
+ unsigned bypassed; /* Temporarily bypass this PG? */
+
+ unsigned nr_pgpaths; /* Number of paths in PG */
+ struct list_head pgpaths;
+};
+
+/* Multipath context */
+struct multipath {
+ struct list_head list;
+ struct dm_target *ti;
+
+ const char *hw_handler_name;
+ char *hw_handler_params;
+
+ spinlock_t lock;
+
+ unsigned nr_priority_groups;
+ struct list_head priority_groups;
+
+ wait_queue_head_t pg_init_wait; /* Wait for pg_init completion */
+
+ unsigned pg_init_required; /* pg_init needs calling? */
+ unsigned pg_init_in_progress; /* Only one pg_init allowed at once */
+ unsigned pg_init_delay_retry; /* Delay pg_init retry? */
+
+ unsigned nr_valid_paths; /* Total number of usable paths */
+ struct pgpath *current_pgpath;
+ struct priority_group *current_pg;
+ struct priority_group *next_pg; /* Switch to this PG if set */
+ unsigned repeat_count; /* I/Os left before calling PS again */
+
+ unsigned queue_io:1; /* Must we queue all I/O? */
+ unsigned queue_if_no_path:1; /* Queue I/O if last path fails? */
+ unsigned saved_queue_if_no_path:1; /* Saved state during suspension */
+ unsigned retain_attached_hw_handler:1; /* If there's already a hw_handler present, don't change it. */
+ unsigned pg_init_disabled:1; /* pg_init is not currently allowed */
+
+ unsigned pg_init_retries; /* Number of times to retry pg_init */
+ unsigned pg_init_count; /* Number of times pg_init called */
+ unsigned pg_init_delay_msecs; /* Number of msecs before pg_init retry */
+
+ struct work_struct trigger_event;
+
+ /*
+ * We must use a mempool of dm_mpath_io structs so that we
+ * can resubmit bios on error.
+ */
+ mempool_t *mpio_pool;
+
+ struct mutex work_mutex;
+};
+
+/*
+ * Context information attached to each bio we process.
+ */
+struct dm_mpath_io {
+ struct pgpath *pgpath;
+ size_t nr_bytes;
+};
+
+typedef int (*action_fn) (struct pgpath *pgpath);
+
+static struct kmem_cache *_mpio_cache;
+
+static struct workqueue_struct *kmultipathd, *kmpath_handlerd;
+static void trigger_event(struct work_struct *work);
+static void activate_path(struct work_struct *work);
+static int __pgpath_busy(struct pgpath *pgpath);
+
+
+/*-----------------------------------------------
+ * Allocation routines
+ *-----------------------------------------------*/
+
+static struct pgpath *alloc_pgpath(void)
+{
+ struct pgpath *pgpath = kzalloc(sizeof(*pgpath), GFP_KERNEL);
+
+ if (pgpath) {
+ pgpath->is_active = 1;
+ INIT_DELAYED_WORK(&pgpath->activate_path, activate_path);
+ }
+
+ return pgpath;
+}
+
+static void free_pgpath(struct pgpath *pgpath)
+{
+ kfree(pgpath);
+}
+
+static struct priority_group *alloc_priority_group(void)
+{
+ struct priority_group *pg;
+
+ pg = kzalloc(sizeof(*pg), GFP_KERNEL);
+
+ if (pg)
+ INIT_LIST_HEAD(&pg->pgpaths);
+
+ return pg;
+}
+
+static void free_pgpaths(struct list_head *pgpaths, struct dm_target *ti)
+{
+ struct pgpath *pgpath, *tmp;
+ struct multipath *m = ti->private;
+
+ list_for_each_entry_safe(pgpath, tmp, pgpaths, list) {
+ list_del(&pgpath->list);
+ if (m->hw_handler_name)
+ scsi_dh_detach(bdev_get_queue(pgpath->path.dev->bdev));
+ dm_put_device(ti, pgpath->path.dev);
+ free_pgpath(pgpath);
+ }
+}
+
+static void free_priority_group(struct priority_group *pg,
+ struct dm_target *ti)
+{
+ struct path_selector *ps = &pg->ps;
+
+ if (ps->type) {
+ ps->type->destroy(ps);
+ dm_put_path_selector(ps->type);
+ }
+
+ free_pgpaths(&pg->pgpaths, ti);
+ kfree(pg);
+}
+
+static struct multipath *alloc_multipath(struct dm_target *ti)
+{
+ struct multipath *m;
+ unsigned min_ios = dm_get_reserved_rq_based_ios();
+
+ m = kzalloc(sizeof(*m), GFP_KERNEL);
+ if (m) {
+ INIT_LIST_HEAD(&m->priority_groups);
+ spin_lock_init(&m->lock);
+ m->queue_io = 1;
+ m->pg_init_delay_msecs = DM_PG_INIT_DELAY_DEFAULT;
+ INIT_WORK(&m->trigger_event, trigger_event);
+ init_waitqueue_head(&m->pg_init_wait);
+ mutex_init(&m->work_mutex);
+ m->mpio_pool = mempool_create_slab_pool(min_ios, _mpio_cache);
+ if (!m->mpio_pool) {
+ kfree(m);
+ return NULL;
+ }
+ m->ti = ti;
+ ti->private = m;
+ }
+
+ return m;
+}
+
+static void free_multipath(struct multipath *m)
+{
+ struct priority_group *pg, *tmp;
+
+ list_for_each_entry_safe(pg, tmp, &m->priority_groups, list) {
+ list_del(&pg->list);
+ free_priority_group(pg, m->ti);
+ }
+
+ kfree(m->hw_handler_name);
+ kfree(m->hw_handler_params);
+ mempool_destroy(m->mpio_pool);
+ kfree(m);
+}
+
+static int set_mapinfo(struct multipath *m, union map_info *info)
+{
+ struct dm_mpath_io *mpio;
+
+ mpio = mempool_alloc(m->mpio_pool, GFP_ATOMIC);
+ if (!mpio)
+ return -ENOMEM;
+
+ memset(mpio, 0, sizeof(*mpio));
+ info->ptr = mpio;
+
+ return 0;
+}
+
+static void clear_mapinfo(struct multipath *m, union map_info *info)
+{
+ struct dm_mpath_io *mpio = info->ptr;
+
+ info->ptr = NULL;
+ mempool_free(mpio, m->mpio_pool);
+}
+
+/*-----------------------------------------------
+ * Path selection
+ *-----------------------------------------------*/
+
+static int __pg_init_all_paths(struct multipath *m)
+{
+ struct pgpath *pgpath;
+ unsigned long pg_init_delay = 0;
+
+ if (m->pg_init_in_progress || m->pg_init_disabled)
+ return 0;
+
+ m->pg_init_count++;
+ m->pg_init_required = 0;
+
+ /* Check here to reset pg_init_required */
+ if (!m->current_pg)
+ return 0;
+
+ if (m->pg_init_delay_retry)
+ pg_init_delay = msecs_to_jiffies(m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT ?
+ m->pg_init_delay_msecs : DM_PG_INIT_DELAY_MSECS);
+ list_for_each_entry(pgpath, &m->current_pg->pgpaths, list) {
+ /* Skip failed paths */
+ if (!pgpath->is_active)
+ continue;
+ if (queue_delayed_work(kmpath_handlerd, &pgpath->activate_path,
+ pg_init_delay))
+ m->pg_init_in_progress++;
+ }
+ return m->pg_init_in_progress;
+}
+
+static void __switch_pg(struct multipath *m, struct pgpath *pgpath)
+{
+ m->current_pg = pgpath->pg;
+
+ /* Must we initialise the PG first, and queue I/O till it's ready? */
+ if (m->hw_handler_name) {
+ m->pg_init_required = 1;
+ m->queue_io = 1;
+ } else {
+ m->pg_init_required = 0;
+ m->queue_io = 0;
+ }
+
+ m->pg_init_count = 0;
+}
+
+static int __choose_path_in_pg(struct multipath *m, struct priority_group *pg,
+ size_t nr_bytes)
+{
+ struct dm_path *path;
+
+ path = pg->ps.type->select_path(&pg->ps, &m->repeat_count, nr_bytes);
+ if (!path)
+ return -ENXIO;
+
+ m->current_pgpath = path_to_pgpath(path);
+
+ if (m->current_pg != pg)
+ __switch_pg(m, m->current_pgpath);
+
+ return 0;
+}
+
+static void __choose_pgpath(struct multipath *m, size_t nr_bytes)
+{
+ struct priority_group *pg;
+ unsigned bypassed = 1;
+
+ if (!m->nr_valid_paths) {
+ m->queue_io = 0;
+ goto failed;
+ }
+
+ /* Were we instructed to switch PG? */
+ if (m->next_pg) {
+ pg = m->next_pg;
+ m->next_pg = NULL;
+ if (!__choose_path_in_pg(m, pg, nr_bytes))
+ return;
+ }
+
+ /* Don't change PG until it has no remaining paths */
+ if (m->current_pg && !__choose_path_in_pg(m, m->current_pg, nr_bytes))
+ return;
+
+ /*
+ * Loop through priority groups until we find a valid path.
+ * First time we skip PGs marked 'bypassed'.
+ * Second time we only try the ones we skipped, but set
+ * pg_init_delay_retry so we do not hammer controllers.
+ */
+ do {
+ list_for_each_entry(pg, &m->priority_groups, list) {
+ if (pg->bypassed == bypassed)
+ continue;
+ if (!__choose_path_in_pg(m, pg, nr_bytes)) {
+ if (!bypassed)
+ m->pg_init_delay_retry = 1;
+ return;
+ }
+ }
+ } while (bypassed--);
+
+failed:
+ m->current_pgpath = NULL;
+ m->current_pg = NULL;
+}
+
+/*
+ * Check whether bios must be queued in the device-mapper core rather
+ * than here in the target.
+ *
+ * m->lock must be held on entry.
+ *
+ * If m->queue_if_no_path and m->saved_queue_if_no_path hold the
+ * same value then we are not between multipath_presuspend()
+ * and multipath_resume() calls and we have no need to check
+ * for the DMF_NOFLUSH_SUSPENDING flag.
+ */
+static int __must_push_back(struct multipath *m)
+{
+ return (m->queue_if_no_path ||
+ (m->queue_if_no_path != m->saved_queue_if_no_path &&
+ dm_noflush_suspending(m->ti)));
+}
+
+/*
+ * Map cloned requests
+ */
+static int __multipath_map(struct dm_target *ti, struct request *clone,
+ union map_info *map_context,
+ struct request *rq, struct request **__clone)
+{
+ struct multipath *m = (struct multipath *) ti->private;
+ int r = DM_MAPIO_REQUEUE;
+ size_t nr_bytes = clone ? blk_rq_bytes(clone) : blk_rq_bytes(rq);
+ struct pgpath *pgpath;
+ struct block_device *bdev;
+ struct dm_mpath_io *mpio;
+
+ spin_lock_irq(&m->lock);
+
+ /* Do we need to select a new pgpath? */
+ if (!m->current_pgpath ||
+ (!m->queue_io && (m->repeat_count && --m->repeat_count == 0)))
+ __choose_pgpath(m, nr_bytes);
+
+ pgpath = m->current_pgpath;
+
+ if (!pgpath) {
+ if (!__must_push_back(m))
+ r = -EIO; /* Failed */
+ goto out_unlock;
+ } else if (m->queue_io || m->pg_init_required) {
+ __pg_init_all_paths(m);
+ goto out_unlock;
+ }
+
+ if (set_mapinfo(m, map_context) < 0)
+ /* ENOMEM, requeue */
+ goto out_unlock;
+
+ mpio = map_context->ptr;
+ mpio->pgpath = pgpath;
+ mpio->nr_bytes = nr_bytes;
+
+ bdev = pgpath->path.dev->bdev;
+
+ spin_unlock_irq(&m->lock);
+
+ if (clone) {
+ /* Old request-based interface: allocated clone is passed in */
+ clone->q = bdev_get_queue(bdev);
+ clone->rq_disk = bdev->bd_disk;
+ clone->cmd_flags |= REQ_FAILFAST_TRANSPORT;
+ } else {
+ /* blk-mq request-based interface */
+ *__clone = blk_get_request(bdev_get_queue(bdev),
+ rq_data_dir(rq), GFP_ATOMIC);
+ if (IS_ERR(*__clone)) {
+ /* ENOMEM, requeue */
+ clear_mapinfo(m, map_context);
+ return r;
+ }
+ (*__clone)->bio = (*__clone)->biotail = NULL;
+ (*__clone)->rq_disk = bdev->bd_disk;
+ (*__clone)->cmd_flags |= REQ_FAILFAST_TRANSPORT;
+ }
+
+ if (pgpath->pg->ps.type->start_io)
+ pgpath->pg->ps.type->start_io(&pgpath->pg->ps,
+ &pgpath->path,
+ nr_bytes);
+ return DM_MAPIO_REMAPPED;
+
+out_unlock:
+ spin_unlock_irq(&m->lock);
+
+ return r;
+}
+
+static int multipath_map(struct dm_target *ti, struct request *clone,
+ union map_info *map_context)
+{
+ return __multipath_map(ti, clone, map_context, NULL, NULL);
+}
+
+static int multipath_clone_and_map(struct dm_target *ti, struct request *rq,
+ union map_info *map_context,
+ struct request **clone)
+{
+ return __multipath_map(ti, NULL, map_context, rq, clone);
+}
+
+static void multipath_release_clone(struct request *clone)
+{
+ blk_put_request(clone);
+}
+
+/*
+ * If we run out of usable paths, should we queue I/O or error it?
+ */
+static int queue_if_no_path(struct multipath *m, unsigned queue_if_no_path,
+ unsigned save_old_value)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&m->lock, flags);
+
+ if (save_old_value)
+ m->saved_queue_if_no_path = m->queue_if_no_path;
+ else
+ m->saved_queue_if_no_path = queue_if_no_path;
+ m->queue_if_no_path = queue_if_no_path;
+ spin_unlock_irqrestore(&m->lock, flags);
+
+ if (!queue_if_no_path)
+ dm_table_run_md_queue_async(m->ti->table);
+
+ return 0;
+}
+
+/*
+ * An event is triggered whenever a path is taken out of use.
+ * Includes path failure and PG bypass.
+ */
+static void trigger_event(struct work_struct *work)
+{
+ struct multipath *m =
+ container_of(work, struct multipath, trigger_event);
+
+ dm_table_event(m->ti->table);
+}
+
+/*-----------------------------------------------------------------
+ * Constructor/argument parsing:
+ * <#multipath feature args> [<arg>]*
+ * <#hw_handler args> [hw_handler [<arg>]*]
+ * <#priority groups>
+ * <initial priority group>
+ * [<selector> <#selector args> [<arg>]*
+ * <#paths> <#per-path selector args>
+ * [<path> [<arg>]* ]+ ]+
+ *---------------------------------------------------------------*/
+static int parse_path_selector(struct dm_arg_set *as, struct priority_group *pg,
+ struct dm_target *ti)
+{
+ int r;
+ struct path_selector_type *pst;
+ unsigned ps_argc;
+
+ static struct dm_arg _args[] = {
+ {0, 1024, "invalid number of path selector args"},
+ };
+
+ pst = dm_get_path_selector(dm_shift_arg(as));
+ if (!pst) {
+ ti->error = "unknown path selector type";
+ return -EINVAL;
+ }
+
+ r = dm_read_arg_group(_args, as, &ps_argc, &ti->error);
+ if (r) {
+ dm_put_path_selector(pst);
+ return -EINVAL;
+ }
+
+ r = pst->create(&pg->ps, ps_argc, as->argv);
+ if (r) {
+ dm_put_path_selector(pst);
+ ti->error = "path selector constructor failed";
+ return r;
+ }
+
+ pg->ps.type = pst;
+ dm_consume_args(as, ps_argc);
+
+ return 0;
+}
+
+static struct pgpath *parse_path(struct dm_arg_set *as, struct path_selector *ps,
+ struct dm_target *ti)
+{
+ int r;
+ struct pgpath *p;
+ struct multipath *m = ti->private;
+ struct request_queue *q = NULL;
+ const char *attached_handler_name;
+
+ /* we need at least a path arg */
+ if (as->argc < 1) {
+ ti->error = "no device given";
+ return ERR_PTR(-EINVAL);
+ }
+
+ p = alloc_pgpath();
+ if (!p)
+ return ERR_PTR(-ENOMEM);
+
+ r = dm_get_device(ti, dm_shift_arg(as), dm_table_get_mode(ti->table),
+ &p->path.dev);
+ if (r) {
+ ti->error = "error getting device";
+ goto bad;
+ }
+
+ if (m->retain_attached_hw_handler || m->hw_handler_name)
+ q = bdev_get_queue(p->path.dev->bdev);
+
+ if (m->retain_attached_hw_handler) {
+ attached_handler_name = scsi_dh_attached_handler_name(q, GFP_KERNEL);
+ if (attached_handler_name) {
+ /*
+ * Reset hw_handler_name to match the attached handler
+ * and clear any hw_handler_params associated with the
+ * ignored handler.
+ *
+ * NB. This modifies the table line to show the actual
+ * handler instead of the original table passed in.
+ */
+ kfree(m->hw_handler_name);
+ m->hw_handler_name = attached_handler_name;
+
+ kfree(m->hw_handler_params);
+ m->hw_handler_params = NULL;
+ }
+ }
+
+ if (m->hw_handler_name) {
+ /*
+ * Increments scsi_dh reference, even when using an
+ * already-attached handler.
+ */
+ r = scsi_dh_attach(q, m->hw_handler_name);
+ if (r == -EBUSY) {
+ /*
+ * Already attached to different hw_handler:
+ * try to reattach with correct one.
+ */
+ scsi_dh_detach(q);
+ r = scsi_dh_attach(q, m->hw_handler_name);
+ }
+
+ if (r < 0) {
+ ti->error = "error attaching hardware handler";
+ dm_put_device(ti, p->path.dev);
+ goto bad;
+ }
+
+ if (m->hw_handler_params) {
+ r = scsi_dh_set_params(q, m->hw_handler_params);
+ if (r < 0) {
+ ti->error = "unable to set hardware "
+ "handler parameters";
+ scsi_dh_detach(q);
+ dm_put_device(ti, p->path.dev);
+ goto bad;
+ }
+ }
+ }
+
+ r = ps->type->add_path(ps, &p->path, as->argc, as->argv, &ti->error);
+ if (r) {
+ dm_put_device(ti, p->path.dev);
+ goto bad;
+ }
+
+ return p;
+
+ bad:
+ free_pgpath(p);
+ return ERR_PTR(r);
+}
+
+static struct priority_group *parse_priority_group(struct dm_arg_set *as,
+ struct multipath *m)
+{
+ static struct dm_arg _args[] = {
+ {1, 1024, "invalid number of paths"},
+ {0, 1024, "invalid number of selector args"}
+ };
+
+ int r;
+ unsigned i, nr_selector_args, nr_args;
+ struct priority_group *pg;
+ struct dm_target *ti = m->ti;
+
+ if (as->argc < 2) {
+ as->argc = 0;
+ ti->error = "not enough priority group arguments";
+ return ERR_PTR(-EINVAL);
+ }
+
+ pg = alloc_priority_group();
+ if (!pg) {
+ ti->error = "couldn't allocate priority group";
+ return ERR_PTR(-ENOMEM);
+ }
+ pg->m = m;
+
+ r = parse_path_selector(as, pg, ti);
+ if (r)
+ goto bad;
+
+ /*
+ * read the paths
+ */
+ r = dm_read_arg(_args, as, &pg->nr_pgpaths, &ti->error);
+ if (r)
+ goto bad;
+
+ r = dm_read_arg(_args + 1, as, &nr_selector_args, &ti->error);
+ if (r)
+ goto bad;
+
+ nr_args = 1 + nr_selector_args;
+ for (i = 0; i < pg->nr_pgpaths; i++) {
+ struct pgpath *pgpath;
+ struct dm_arg_set path_args;
+
+ if (as->argc < nr_args) {
+ ti->error = "not enough path parameters";
+ r = -EINVAL;
+ goto bad;
+ }
+
+ path_args.argc = nr_args;
+ path_args.argv = as->argv;
+
+ pgpath = parse_path(&path_args, &pg->ps, ti);
+ if (IS_ERR(pgpath)) {
+ r = PTR_ERR(pgpath);
+ goto bad;
+ }
+
+ pgpath->pg = pg;
+ list_add_tail(&pgpath->list, &pg->pgpaths);
+ dm_consume_args(as, nr_args);
+ }
+
+ return pg;
+
+ bad:
+ free_priority_group(pg, ti);
+ return ERR_PTR(r);
+}
+
+static int parse_hw_handler(struct dm_arg_set *as, struct multipath *m)
+{
+ unsigned hw_argc;
+ int ret;
+ struct dm_target *ti = m->ti;
+
+ static struct dm_arg _args[] = {
+ {0, 1024, "invalid number of hardware handler args"},
+ };
+
+ if (dm_read_arg_group(_args, as, &hw_argc, &ti->error))
+ return -EINVAL;
+
+ if (!hw_argc)
+ return 0;
+
+ m->hw_handler_name = kstrdup(dm_shift_arg(as), GFP_KERNEL);
+ if (!try_then_request_module(scsi_dh_handler_exist(m->hw_handler_name),
+ "scsi_dh_%s", m->hw_handler_name)) {
+ ti->error = "unknown hardware handler type";
+ ret = -EINVAL;
+ goto fail;
+ }
+
+ if (hw_argc > 1) {
+ char *p;
+ int i, j, len = 4;
+
+ for (i = 0; i <= hw_argc - 2; i++)
+ len += strlen(as->argv[i]) + 1;
+ p = m->hw_handler_params = kzalloc(len, GFP_KERNEL);
+ if (!p) {
+ ti->error = "memory allocation failed";
+ ret = -ENOMEM;
+ goto fail;
+ }
+ j = sprintf(p, "%d", hw_argc - 1);
+ for (i = 0, p+=j+1; i <= hw_argc - 2; i++, p+=j+1)
+ j = sprintf(p, "%s", as->argv[i]);
+ }
+ dm_consume_args(as, hw_argc - 1);
+
+ return 0;
+fail:
+ kfree(m->hw_handler_name);
+ m->hw_handler_name = NULL;
+ return ret;
+}
+
+static int parse_features(struct dm_arg_set *as, struct multipath *m)
+{
+ int r;
+ unsigned argc;
+ struct dm_target *ti = m->ti;
+ const char *arg_name;
+
+ static struct dm_arg _args[] = {
+ {0, 6, "invalid number of feature args"},
+ {1, 50, "pg_init_retries must be between 1 and 50"},
+ {0, 60000, "pg_init_delay_msecs must be between 0 and 60000"},
+ };
+
+ r = dm_read_arg_group(_args, as, &argc, &ti->error);
+ if (r)
+ return -EINVAL;
+
+ if (!argc)
+ return 0;
+
+ do {
+ arg_name = dm_shift_arg(as);
+ argc--;
+
+ if (!strcasecmp(arg_name, "queue_if_no_path")) {
+ r = queue_if_no_path(m, 1, 0);
+ continue;
+ }
+
+ if (!strcasecmp(arg_name, "retain_attached_hw_handler")) {
+ m->retain_attached_hw_handler = 1;
+ continue;
+ }
+
+ if (!strcasecmp(arg_name, "pg_init_retries") &&
+ (argc >= 1)) {
+ r = dm_read_arg(_args + 1, as, &m->pg_init_retries, &ti->error);
+ argc--;
+ continue;
+ }
+
+ if (!strcasecmp(arg_name, "pg_init_delay_msecs") &&
+ (argc >= 1)) {
+ r = dm_read_arg(_args + 2, as, &m->pg_init_delay_msecs, &ti->error);
+ argc--;
+ continue;
+ }
+
+ ti->error = "Unrecognised multipath feature request";
+ r = -EINVAL;
+ } while (argc && !r);
+
+ return r;
+}
+
+static int multipath_ctr(struct dm_target *ti, unsigned int argc,
+ char **argv)
+{
+ /* target arguments */
+ static struct dm_arg _args[] = {
+ {0, 1024, "invalid number of priority groups"},
+ {0, 1024, "invalid initial priority group number"},
+ };
+
+ int r;
+ struct multipath *m;
+ struct dm_arg_set as;
+ unsigned pg_count = 0;
+ unsigned next_pg_num;
+
+ as.argc = argc;
+ as.argv = argv;
+
+ m = alloc_multipath(ti);
+ if (!m) {
+ ti->error = "can't allocate multipath";
+ return -EINVAL;
+ }
+
+ r = parse_features(&as, m);
+ if (r)
+ goto bad;
+
+ r = parse_hw_handler(&as, m);
+ if (r)
+ goto bad;
+
+ r = dm_read_arg(_args, &as, &m->nr_priority_groups, &ti->error);
+ if (r)
+ goto bad;
+
+ r = dm_read_arg(_args + 1, &as, &next_pg_num, &ti->error);
+ if (r)
+ goto bad;
+
+ if ((!m->nr_priority_groups && next_pg_num) ||
+ (m->nr_priority_groups && !next_pg_num)) {
+ ti->error = "invalid initial priority group";
+ r = -EINVAL;
+ goto bad;
+ }
+
+ /* parse the priority groups */
+ while (as.argc) {
+ struct priority_group *pg;
+
+ pg = parse_priority_group(&as, m);
+ if (IS_ERR(pg)) {
+ r = PTR_ERR(pg);
+ goto bad;
+ }
+
+ m->nr_valid_paths += pg->nr_pgpaths;
+ list_add_tail(&pg->list, &m->priority_groups);
+ pg_count++;
+ pg->pg_num = pg_count;
+ if (!--next_pg_num)
+ m->next_pg = pg;
+ }
+
+ if (pg_count != m->nr_priority_groups) {
+ ti->error = "priority group count mismatch";
+ r = -EINVAL;
+ goto bad;
+ }
+
+ ti->num_flush_bios = 1;
+ ti->num_discard_bios = 1;
+ ti->num_write_same_bios = 1;
+
+ return 0;
+
+ bad:
+ free_multipath(m);
+ return r;
+}
+
+static void multipath_wait_for_pg_init_completion(struct multipath *m)
+{
+ DECLARE_WAITQUEUE(wait, current);
+ unsigned long flags;
+
+ add_wait_queue(&m->pg_init_wait, &wait);
+
+ while (1) {
+ set_current_state(TASK_UNINTERRUPTIBLE);
+
+ spin_lock_irqsave(&m->lock, flags);
+ if (!m->pg_init_in_progress) {
+ spin_unlock_irqrestore(&m->lock, flags);
+ break;
+ }
+ spin_unlock_irqrestore(&m->lock, flags);
+
+ io_schedule();
+ }
+ set_current_state(TASK_RUNNING);
+
+ remove_wait_queue(&m->pg_init_wait, &wait);
+}
+
+static void flush_multipath_work(struct multipath *m)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&m->lock, flags);
+ m->pg_init_disabled = 1;
+ spin_unlock_irqrestore(&m->lock, flags);
+
+ flush_workqueue(kmpath_handlerd);
+ multipath_wait_for_pg_init_completion(m);
+ flush_workqueue(kmultipathd);
+ flush_work(&m->trigger_event);
+
+ spin_lock_irqsave(&m->lock, flags);
+ m->pg_init_disabled = 0;
+ spin_unlock_irqrestore(&m->lock, flags);
+}
+
+static void multipath_dtr(struct dm_target *ti)
+{
+ struct multipath *m = ti->private;
+
+ flush_multipath_work(m);
+ free_multipath(m);
+}
+
+/*
+ * Take a path out of use.
+ */
+static int fail_path(struct pgpath *pgpath)
+{
+ unsigned long flags;
+ struct multipath *m = pgpath->pg->m;
+
+ spin_lock_irqsave(&m->lock, flags);
+
+ if (!pgpath->is_active)
+ goto out;
+
+ DMWARN("Failing path %s.", pgpath->path.dev->name);
+
+ pgpath->pg->ps.type->fail_path(&pgpath->pg->ps, &pgpath->path);
+ pgpath->is_active = 0;
+ pgpath->fail_count++;
+
+ m->nr_valid_paths--;
+
+ if (pgpath == m->current_pgpath)
+ m->current_pgpath = NULL;
+
+ dm_path_uevent(DM_UEVENT_PATH_FAILED, m->ti,
+ pgpath->path.dev->name, m->nr_valid_paths);
+
+ schedule_work(&m->trigger_event);
+
+out:
+ spin_unlock_irqrestore(&m->lock, flags);
+
+ return 0;
+}
+
+/*
+ * Reinstate a previously-failed path
+ */
+static int reinstate_path(struct pgpath *pgpath)
+{
+ int r = 0, run_queue = 0;
+ unsigned long flags;
+ struct multipath *m = pgpath->pg->m;
+
+ spin_lock_irqsave(&m->lock, flags);
+
+ if (pgpath->is_active)
+ goto out;
+
+ if (!pgpath->pg->ps.type->reinstate_path) {
+ DMWARN("Reinstate path not supported by path selector %s",
+ pgpath->pg->ps.type->name);
+ r = -EINVAL;
+ goto out;
+ }
+
+ r = pgpath->pg->ps.type->reinstate_path(&pgpath->pg->ps, &pgpath->path);
+ if (r)
+ goto out;
+
+ pgpath->is_active = 1;
+
+ if (!m->nr_valid_paths++) {
+ m->current_pgpath = NULL;
+ run_queue = 1;
+ } else if (m->hw_handler_name && (m->current_pg == pgpath->pg)) {
+ if (queue_work(kmpath_handlerd, &pgpath->activate_path.work))
+ m->pg_init_in_progress++;
+ }
+
+ dm_path_uevent(DM_UEVENT_PATH_REINSTATED, m->ti,
+ pgpath->path.dev->name, m->nr_valid_paths);
+
+ schedule_work(&m->trigger_event);
+
+out:
+ spin_unlock_irqrestore(&m->lock, flags);
+ if (run_queue)
+ dm_table_run_md_queue_async(m->ti->table);
+
+ return r;
+}
+
+/*
+ * Fail or reinstate all paths that match the provided struct dm_dev.
+ */
+static int action_dev(struct multipath *m, struct dm_dev *dev,
+ action_fn action)
+{
+ int r = -EINVAL;
+ struct pgpath *pgpath;
+ struct priority_group *pg;
+
+ list_for_each_entry(pg, &m->priority_groups, list) {
+ list_for_each_entry(pgpath, &pg->pgpaths, list) {
+ if (pgpath->path.dev == dev)
+ r = action(pgpath);
+ }
+ }
+
+ return r;
+}
+
+/*
+ * Temporarily try to avoid having to use the specified PG
+ */
+static void bypass_pg(struct multipath *m, struct priority_group *pg,
+ int bypassed)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&m->lock, flags);
+
+ pg->bypassed = bypassed;
+ m->current_pgpath = NULL;
+ m->current_pg = NULL;
+
+ spin_unlock_irqrestore(&m->lock, flags);
+
+ schedule_work(&m->trigger_event);
+}
+
+/*
+ * Switch to using the specified PG from the next I/O that gets mapped
+ */
+static int switch_pg_num(struct multipath *m, const char *pgstr)
+{
+ struct priority_group *pg;
+ unsigned pgnum;
+ unsigned long flags;
+ char dummy;
+
+ if (!pgstr || (sscanf(pgstr, "%u%c", &pgnum, &dummy) != 1) || !pgnum ||
+ (pgnum > m->nr_priority_groups)) {
+ DMWARN("invalid PG number supplied to switch_pg_num");
+ return -EINVAL;
+ }
+
+ spin_lock_irqsave(&m->lock, flags);
+ list_for_each_entry(pg, &m->priority_groups, list) {
+ pg->bypassed = 0;
+ if (--pgnum)
+ continue;
+
+ m->current_pgpath = NULL;
+ m->current_pg = NULL;
+ m->next_pg = pg;
+ }
+ spin_unlock_irqrestore(&m->lock, flags);
+
+ schedule_work(&m->trigger_event);
+ return 0;
+}
+
+/*
+ * Set/clear bypassed status of a PG.
+ * PGs are numbered upwards from 1 in the order they were declared.
+ */
+static int bypass_pg_num(struct multipath *m, const char *pgstr, int bypassed)
+{
+ struct priority_group *pg;
+ unsigned pgnum;
+ char dummy;
+
+ if (!pgstr || (sscanf(pgstr, "%u%c", &pgnum, &dummy) != 1) || !pgnum ||
+ (pgnum > m->nr_priority_groups)) {
+ DMWARN("invalid PG number supplied to bypass_pg");
+ return -EINVAL;
+ }
+
+ list_for_each_entry(pg, &m->priority_groups, list) {
+ if (!--pgnum)
+ break;
+ }
+
+ bypass_pg(m, pg, bypassed);
+ return 0;
+}
+
+/*
+ * Should we retry pg_init immediately?
+ */
+static int pg_init_limit_reached(struct multipath *m, struct pgpath *pgpath)
+{
+ unsigned long flags;
+ int limit_reached = 0;
+
+ spin_lock_irqsave(&m->lock, flags);
+
+ if (m->pg_init_count <= m->pg_init_retries && !m->pg_init_disabled)
+ m->pg_init_required = 1;
+ else
+ limit_reached = 1;
+
+ spin_unlock_irqrestore(&m->lock, flags);
+
+ return limit_reached;
+}
+
+static void pg_init_done(void *data, int errors)
+{
+ struct pgpath *pgpath = data;
+ struct priority_group *pg = pgpath->pg;
+ struct multipath *m = pg->m;
+ unsigned long flags;
+ unsigned delay_retry = 0;
+
+ /* device or driver problems */
+ switch (errors) {
+ case SCSI_DH_OK:
+ break;
+ case SCSI_DH_NOSYS:
+ if (!m->hw_handler_name) {
+ errors = 0;
+ break;
+ }
+ DMERR("Could not failover the device: Handler scsi_dh_%s "
+ "Error %d.", m->hw_handler_name, errors);
+ /*
+ * Fail path for now, so we do not ping pong
+ */
+ fail_path(pgpath);
+ break;
+ case SCSI_DH_DEV_TEMP_BUSY:
+ /*
+ * Probably doing something like FW upgrade on the
+ * controller so try the other pg.
+ */
+ bypass_pg(m, pg, 1);
+ break;
+ case SCSI_DH_RETRY:
+ /* Wait before retrying. */
+ delay_retry = 1;
+ case SCSI_DH_IMM_RETRY:
+ case SCSI_DH_RES_TEMP_UNAVAIL:
+ if (pg_init_limit_reached(m, pgpath))
+ fail_path(pgpath);
+ errors = 0;
+ break;
+ default:
+ /*
+ * We probably do not want to fail the path for a device
+ * error, but this is what the old dm did. In future
+ * patches we can do more advanced handling.
+ */
+ fail_path(pgpath);
+ }
+
+ spin_lock_irqsave(&m->lock, flags);
+ if (errors) {
+ if (pgpath == m->current_pgpath) {
+ DMERR("Could not failover device. Error %d.", errors);
+ m->current_pgpath = NULL;
+ m->current_pg = NULL;
+ }
+ } else if (!m->pg_init_required)
+ pg->bypassed = 0;
+
+ if (--m->pg_init_in_progress)
+ /* Activations of other paths are still on going */
+ goto out;
+
+ if (m->pg_init_required) {
+ m->pg_init_delay_retry = delay_retry;
+ if (__pg_init_all_paths(m))
+ goto out;
+ }
+ m->queue_io = 0;
+
+ /*
+ * Wake up any thread waiting to suspend.
+ */
+ wake_up(&m->pg_init_wait);
+
+out:
+ spin_unlock_irqrestore(&m->lock, flags);
+}
+
+static void activate_path(struct work_struct *work)
+{
+ struct pgpath *pgpath =
+ container_of(work, struct pgpath, activate_path.work);
+
+ if (pgpath->is_active)
+ scsi_dh_activate(bdev_get_queue(pgpath->path.dev->bdev),
+ pg_init_done, pgpath);
+ else
+ pg_init_done(pgpath, SCSI_DH_DEV_OFFLINED);
+}
+
+static int noretry_error(int error)
+{
+ switch (error) {
+ case -EOPNOTSUPP:
+ case -EREMOTEIO:
+ case -EILSEQ:
+ case -ENODATA:
+ case -ENOSPC:
+ return 1;
+ }
+
+ /* Anything else could be a path failure, so should be retried */
+ return 0;
+}
+
+/*
+ * end_io handling
+ */
+static int do_end_io(struct multipath *m, struct request *clone,
+ int error, struct dm_mpath_io *mpio)
+{
+ /*
+ * We don't queue any clone request inside the multipath target
+ * during end I/O handling, since those clone requests don't have
+ * bio clones. If we queue them inside the multipath target,
+ * we need to make bio clones, that requires memory allocation.
+ * (See drivers/md/dm.c:end_clone_bio() about why the clone requests
+ * don't have bio clones.)
+ * Instead of queueing the clone request here, we queue the original
+ * request into dm core, which will remake a clone request and
+ * clone bios for it and resubmit it later.
+ */
+ int r = DM_ENDIO_REQUEUE;
+ unsigned long flags;
+
+ if (!error && !clone->errors)
+ return 0; /* I/O complete */
+
+ if (noretry_error(error))
+ return error;
+
+ if (mpio->pgpath)
+ fail_path(mpio->pgpath);
+
+ spin_lock_irqsave(&m->lock, flags);
+ if (!m->nr_valid_paths) {
+ if (!m->queue_if_no_path) {
+ if (!__must_push_back(m))
+ r = -EIO;
+ } else {
+ if (error == -EBADE)
+ r = error;
+ }
+ }
+ spin_unlock_irqrestore(&m->lock, flags);
+
+ return r;
+}
+
+static int multipath_end_io(struct dm_target *ti, struct request *clone,
+ int error, union map_info *map_context)
+{
+ struct multipath *m = ti->private;
+ struct dm_mpath_io *mpio = map_context->ptr;
+ struct pgpath *pgpath;
+ struct path_selector *ps;
+ int r;
+
+ BUG_ON(!mpio);
+
+ r = do_end_io(m, clone, error, mpio);
+ pgpath = mpio->pgpath;
+ if (pgpath) {
+ ps = &pgpath->pg->ps;
+ if (ps->type->end_io)
+ ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes);
+ }
+ clear_mapinfo(m, map_context);
+
+ return r;
+}
+
+/*
+ * Suspend can't complete until all the I/O is processed so if
+ * the last path fails we must error any remaining I/O.
+ * Note that if the freeze_bdev fails while suspending, the
+ * queue_if_no_path state is lost - userspace should reset it.
+ */
+static void multipath_presuspend(struct dm_target *ti)
+{
+ struct multipath *m = (struct multipath *) ti->private;
+
+ queue_if_no_path(m, 0, 1);
+}
+
+static void multipath_postsuspend(struct dm_target *ti)
+{
+ struct multipath *m = ti->private;
+
+ mutex_lock(&m->work_mutex);
+ flush_multipath_work(m);
+ mutex_unlock(&m->work_mutex);
+}
+
+/*
+ * Restore the queue_if_no_path setting.
+ */
+static void multipath_resume(struct dm_target *ti)
+{
+ struct multipath *m = (struct multipath *) ti->private;
+ unsigned long flags;
+
+ spin_lock_irqsave(&m->lock, flags);
+ m->queue_if_no_path = m->saved_queue_if_no_path;
+ spin_unlock_irqrestore(&m->lock, flags);
+}
+
+/*
+ * Info output has the following format:
+ * num_multipath_feature_args [multipath_feature_args]*
+ * num_handler_status_args [handler_status_args]*
+ * num_groups init_group_number
+ * [A|D|E num_ps_status_args [ps_status_args]*
+ * num_paths num_selector_args
+ * [path_dev A|F fail_count [selector_args]* ]+ ]+
+ *
+ * Table output has the following format (identical to the constructor string):
+ * num_feature_args [features_args]*
+ * num_handler_args hw_handler [hw_handler_args]*
+ * num_groups init_group_number
+ * [priority selector-name num_ps_args [ps_args]*
+ * num_paths num_selector_args [path_dev [selector_args]* ]+ ]+
+ */
+static void multipath_status(struct dm_target *ti, status_type_t type,
+ unsigned status_flags, char *result, unsigned maxlen)
+{
+ int sz = 0;
+ unsigned long flags;
+ struct multipath *m = (struct multipath *) ti->private;
+ struct priority_group *pg;
+ struct pgpath *p;
+ unsigned pg_num;
+ char state;
+
+ spin_lock_irqsave(&m->lock, flags);
+
+ /* Features */
+ if (type == STATUSTYPE_INFO)
+ DMEMIT("2 %u %u ", m->queue_io, m->pg_init_count);
+ else {
+ DMEMIT("%u ", m->queue_if_no_path +
+ (m->pg_init_retries > 0) * 2 +
+ (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT) * 2 +
+ m->retain_attached_hw_handler);
+ if (m->queue_if_no_path)
+ DMEMIT("queue_if_no_path ");
+ if (m->pg_init_retries)
+ DMEMIT("pg_init_retries %u ", m->pg_init_retries);
+ if (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT)
+ DMEMIT("pg_init_delay_msecs %u ", m->pg_init_delay_msecs);
+ if (m->retain_attached_hw_handler)
+ DMEMIT("retain_attached_hw_handler ");
+ }
+
+ if (!m->hw_handler_name || type == STATUSTYPE_INFO)
+ DMEMIT("0 ");
+ else
+ DMEMIT("1 %s ", m->hw_handler_name);
+
+ DMEMIT("%u ", m->nr_priority_groups);
+
+ if (m->next_pg)
+ pg_num = m->next_pg->pg_num;
+ else if (m->current_pg)
+ pg_num = m->current_pg->pg_num;
+ else
+ pg_num = (m->nr_priority_groups ? 1 : 0);
+
+ DMEMIT("%u ", pg_num);
+
+ switch (type) {
+ case STATUSTYPE_INFO:
+ list_for_each_entry(pg, &m->priority_groups, list) {
+ if (pg->bypassed)
+ state = 'D'; /* Disabled */
+ else if (pg == m->current_pg)
+ state = 'A'; /* Currently Active */
+ else
+ state = 'E'; /* Enabled */
+
+ DMEMIT("%c ", state);
+
+ if (pg->ps.type->status)
+ sz += pg->ps.type->status(&pg->ps, NULL, type,
+ result + sz,
+ maxlen - sz);
+ else
+ DMEMIT("0 ");
+
+ DMEMIT("%u %u ", pg->nr_pgpaths,
+ pg->ps.type->info_args);
+
+ list_for_each_entry(p, &pg->pgpaths, list) {
+ DMEMIT("%s %s %u ", p->path.dev->name,
+ p->is_active ? "A" : "F",
+ p->fail_count);
+ if (pg->ps.type->status)
+ sz += pg->ps.type->status(&pg->ps,
+ &p->path, type, result + sz,
+ maxlen - sz);
+ }
+ }
+ break;
+
+ case STATUSTYPE_TABLE:
+ list_for_each_entry(pg, &m->priority_groups, list) {
+ DMEMIT("%s ", pg->ps.type->name);
+
+ if (pg->ps.type->status)
+ sz += pg->ps.type->status(&pg->ps, NULL, type,
+ result + sz,
+ maxlen - sz);
+ else
+ DMEMIT("0 ");
+
+ DMEMIT("%u %u ", pg->nr_pgpaths,
+ pg->ps.type->table_args);
+
+ list_for_each_entry(p, &pg->pgpaths, list) {
+ DMEMIT("%s ", p->path.dev->name);
+ if (pg->ps.type->status)
+ sz += pg->ps.type->status(&pg->ps,
+ &p->path, type, result + sz,
+ maxlen - sz);
+ }
+ }
+ break;
+ }
+
+ spin_unlock_irqrestore(&m->lock, flags);
+}
+
+static int multipath_message(struct dm_target *ti, unsigned argc, char **argv)
+{
+ int r = -EINVAL;
+ struct dm_dev *dev;
+ struct multipath *m = (struct multipath *) ti->private;
+ action_fn action;
+
+ mutex_lock(&m->work_mutex);
+
+ if (dm_suspended(ti)) {
+ r = -EBUSY;
+ goto out;
+ }
+
+ if (argc == 1) {
+ if (!strcasecmp(argv[0], "queue_if_no_path")) {
+ r = queue_if_no_path(m, 1, 0);
+ goto out;
+ } else if (!strcasecmp(argv[0], "fail_if_no_path")) {
+ r = queue_if_no_path(m, 0, 0);
+ goto out;
+ }
+ }
+
+ if (argc != 2) {
+ DMWARN("Invalid multipath message arguments. Expected 2 arguments, got %d.", argc);
+ goto out;
+ }
+
+ if (!strcasecmp(argv[0], "disable_group")) {
+ r = bypass_pg_num(m, argv[1], 1);
+ goto out;
+ } else if (!strcasecmp(argv[0], "enable_group")) {
+ r = bypass_pg_num(m, argv[1], 0);
+ goto out;
+ } else if (!strcasecmp(argv[0], "switch_group")) {
+ r = switch_pg_num(m, argv[1]);
+ goto out;
+ } else if (!strcasecmp(argv[0], "reinstate_path"))
+ action = reinstate_path;
+ else if (!strcasecmp(argv[0], "fail_path"))
+ action = fail_path;
+ else {
+ DMWARN("Unrecognised multipath message received: %s", argv[0]);
+ goto out;
+ }
+
+ r = dm_get_device(ti, argv[1], dm_table_get_mode(ti->table), &dev);
+ if (r) {
+ DMWARN("message: error getting device %s",
+ argv[1]);
+ goto out;
+ }
+
+ r = action_dev(m, dev, action);
+
+ dm_put_device(ti, dev);
+
+out:
+ mutex_unlock(&m->work_mutex);
+ return r;
+}
+
+static int multipath_ioctl(struct dm_target *ti, unsigned int cmd,
+ unsigned long arg)
+{
+ struct multipath *m = ti->private;
+ struct pgpath *pgpath;
+ struct block_device *bdev;
+ fmode_t mode;
+ unsigned long flags;
+ int r;
+
+ bdev = NULL;
+ mode = 0;
+ r = 0;
+
+ spin_lock_irqsave(&m->lock, flags);
+
+ if (!m->current_pgpath)
+ __choose_pgpath(m, 0);
+
+ pgpath = m->current_pgpath;
+
+ if (pgpath) {
+ bdev = pgpath->path.dev->bdev;
+ mode = pgpath->path.dev->mode;
+ }
+
+ if ((pgpath && m->queue_io) || (!pgpath && m->queue_if_no_path))
+ r = -ENOTCONN;
+ else if (!bdev)
+ r = -EIO;
+
+ spin_unlock_irqrestore(&m->lock, flags);
+
+ /*
+ * Only pass ioctls through if the device sizes match exactly.
+ */
+ if (!bdev || ti->len != i_size_read(bdev->bd_inode) >> SECTOR_SHIFT) {
+ int err = scsi_verify_blk_ioctl(NULL, cmd);
+ if (err)
+ r = err;
+ }
+
+ if (r == -ENOTCONN && !fatal_signal_pending(current)) {
+ spin_lock_irqsave(&m->lock, flags);
+ if (!m->current_pg) {
+ /* Path status changed, redo selection */
+ __choose_pgpath(m, 0);
+ }
+ if (m->pg_init_required)
+ __pg_init_all_paths(m);
+ spin_unlock_irqrestore(&m->lock, flags);
+ dm_table_run_md_queue_async(m->ti->table);
+ }
+
+ return r ? : __blkdev_driver_ioctl(bdev, mode, cmd, arg);
+}
+
+static int multipath_iterate_devices(struct dm_target *ti,
+ iterate_devices_callout_fn fn, void *data)
+{
+ struct multipath *m = ti->private;
+ struct priority_group *pg;
+ struct pgpath *p;
+ int ret = 0;
+
+ list_for_each_entry(pg, &m->priority_groups, list) {
+ list_for_each_entry(p, &pg->pgpaths, list) {
+ ret = fn(ti, p->path.dev, ti->begin, ti->len, data);
+ if (ret)
+ goto out;
+ }
+ }
+
+out:
+ return ret;
+}
+
+static int __pgpath_busy(struct pgpath *pgpath)
+{
+ struct request_queue *q = bdev_get_queue(pgpath->path.dev->bdev);
+
+ return blk_lld_busy(q);
+}
+
+/*
+ * We return "busy", only when we can map I/Os but underlying devices
+ * are busy (so even if we map I/Os now, the I/Os will wait on
+ * the underlying queue).
+ * In other words, if we want to kill I/Os or queue them inside us
+ * due to map unavailability, we don't return "busy". Otherwise,
+ * dm core won't give us the I/Os and we can't do what we want.
+ */
+static int multipath_busy(struct dm_target *ti)
+{
+ int busy = 0, has_active = 0;
+ struct multipath *m = ti->private;
+ struct priority_group *pg;
+ struct pgpath *pgpath;
+ unsigned long flags;
+
+ spin_lock_irqsave(&m->lock, flags);
+
+ /* pg_init in progress or no paths available */
+ if (m->pg_init_in_progress ||
+ (!m->nr_valid_paths && m->queue_if_no_path)) {
+ busy = 1;
+ goto out;
+ }
+ /* Guess which priority_group will be used at next mapping time */
+ if (unlikely(!m->current_pgpath && m->next_pg))
+ pg = m->next_pg;
+ else if (likely(m->current_pg))
+ pg = m->current_pg;
+ else
+ /*
+ * We don't know which pg will be used at next mapping time.
+ * We don't call __choose_pgpath() here to avoid to trigger
+ * pg_init just by busy checking.
+ * So we don't know whether underlying devices we will be using
+ * at next mapping time are busy or not. Just try mapping.
+ */
+ goto out;
+
+ /*
+ * If there is one non-busy active path at least, the path selector
+ * will be able to select it. So we consider such a pg as not busy.
+ */
+ busy = 1;
+ list_for_each_entry(pgpath, &pg->pgpaths, list)
+ if (pgpath->is_active) {
+ has_active = 1;
+
+ if (!__pgpath_busy(pgpath)) {
+ busy = 0;
+ break;
+ }
+ }
+
+ if (!has_active)
+ /*
+ * No active path in this pg, so this pg won't be used and
+ * the current_pg will be changed at next mapping time.
+ * We need to try mapping to determine it.
+ */
+ busy = 0;
+
+out:
+ spin_unlock_irqrestore(&m->lock, flags);
+
+ return busy;
+}
+
+/*-----------------------------------------------------------------
+ * Module setup
+ *---------------------------------------------------------------*/
+static struct target_type multipath_target = {
+ .name = "multipath",
+ .version = {1, 9, 0},
+ .module = THIS_MODULE,
+ .ctr = multipath_ctr,
+ .dtr = multipath_dtr,
+ .map_rq = multipath_map,
+ .clone_and_map_rq = multipath_clone_and_map,
+ .release_clone_rq = multipath_release_clone,
+ .rq_end_io = multipath_end_io,
+ .presuspend = multipath_presuspend,
+ .postsuspend = multipath_postsuspend,
+ .resume = multipath_resume,
+ .status = multipath_status,
+ .message = multipath_message,
+ .ioctl = multipath_ioctl,
+ .iterate_devices = multipath_iterate_devices,
+ .busy = multipath_busy,
+};
+
+static int __init dm_multipath_init(void)
+{
+ int r;
+
+ /* allocate a slab for the dm_ios */
+ _mpio_cache = KMEM_CACHE(dm_mpath_io, 0);
+ if (!_mpio_cache)
+ return -ENOMEM;
+
+ r = dm_register_target(&multipath_target);
+ if (r < 0) {
+ DMERR("register failed %d", r);
+ r = -EINVAL;
+ goto bad_register_target;
+ }
+
+ kmultipathd = alloc_workqueue("kmpathd", WQ_MEM_RECLAIM, 0);
+ if (!kmultipathd) {
+ DMERR("failed to create workqueue kmpathd");
+ r = -ENOMEM;
+ goto bad_alloc_kmultipathd;
+ }
+
+ /*
+ * A separate workqueue is used to handle the device handlers
+ * to avoid overloading existing workqueue. Overloading the
+ * old workqueue would also create a bottleneck in the
+ * path of the storage hardware device activation.
+ */
+ kmpath_handlerd = alloc_ordered_workqueue("kmpath_handlerd",
+ WQ_MEM_RECLAIM);
+ if (!kmpath_handlerd) {
+ DMERR("failed to create workqueue kmpath_handlerd");
+ r = -ENOMEM;
+ goto bad_alloc_kmpath_handlerd;
+ }
+
+ DMINFO("version %u.%u.%u loaded",
+ multipath_target.version[0], multipath_target.version[1],
+ multipath_target.version[2]);
+
+ return 0;
+
+bad_alloc_kmpath_handlerd:
+ destroy_workqueue(kmultipathd);
+bad_alloc_kmultipathd:
+ dm_unregister_target(&multipath_target);
+bad_register_target:
+ kmem_cache_destroy(_mpio_cache);
+
+ return r;
+}
+
+static void __exit dm_multipath_exit(void)
+{
+ destroy_workqueue(kmpath_handlerd);
+ destroy_workqueue(kmultipathd);
+
+ dm_unregister_target(&multipath_target);
+ kmem_cache_destroy(_mpio_cache);
+}
+
+module_init(dm_multipath_init);
+module_exit(dm_multipath_exit);
+
+MODULE_DESCRIPTION(DM_NAME " multipath target");
+MODULE_AUTHOR("Sistina Software <dm-devel@redhat.com>");
+MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm-mpath.h b/drivers/md/dm-mpath.h
new file mode 100644
index 000000000..e230f7196
--- /dev/null
+++ b/drivers/md/dm-mpath.h
@@ -0,0 +1,22 @@
+/*
+ * Copyright (C) 2004 Red Hat, Inc. All rights reserved.
+ *
+ * This file is released under the GPL.
+ *
+ * Multipath.
+ */
+
+#ifndef DM_MPATH_H
+#define DM_MPATH_H
+
+struct dm_dev;
+
+struct dm_path {
+ struct dm_dev *dev; /* Read-only */
+ void *pscontext; /* For path-selector use */
+};
+
+/* Callback for hwh_pg_init_fn to use when complete */
+void dm_pg_init_complete(struct dm_path *path, unsigned err_flags);
+
+#endif
diff --git a/drivers/md/dm-path-selector.c b/drivers/md/dm-path-selector.c
new file mode 100644
index 000000000..fa0ccc585
--- /dev/null
+++ b/drivers/md/dm-path-selector.c
@@ -0,0 +1,140 @@
+/*
+ * Copyright (C) 2003 Sistina Software.
+ * Copyright (C) 2004 Red Hat, Inc. All rights reserved.
+ *
+ * Module Author: Heinz Mauelshagen
+ *
+ * This file is released under the GPL.
+ *
+ * Path selector registration.
+ */
+
+#include <linux/device-mapper.h>
+#include <linux/module.h>
+
+#include "dm-path-selector.h"
+
+#include <linux/slab.h>
+
+struct ps_internal {
+ struct path_selector_type pst;
+ struct list_head list;
+};
+
+#define pst_to_psi(__pst) container_of((__pst), struct ps_internal, pst)
+
+static LIST_HEAD(_path_selectors);
+static DECLARE_RWSEM(_ps_lock);
+
+static struct ps_internal *__find_path_selector_type(const char *name)
+{
+ struct ps_internal *psi;
+
+ list_for_each_entry(psi, &_path_selectors, list) {
+ if (!strcmp(name, psi->pst.name))
+ return psi;
+ }
+
+ return NULL;
+}
+
+static struct ps_internal *get_path_selector(const char *name)
+{
+ struct ps_internal *psi;
+
+ down_read(&_ps_lock);
+ psi = __find_path_selector_type(name);
+ if (psi && !try_module_get(psi->pst.module))
+ psi = NULL;
+ up_read(&_ps_lock);
+
+ return psi;
+}
+
+struct path_selector_type *dm_get_path_selector(const char *name)
+{
+ struct ps_internal *psi;
+
+ if (!name)
+ return NULL;
+
+ psi = get_path_selector(name);
+ if (!psi) {
+ request_module("dm-%s", name);
+ psi = get_path_selector(name);
+ }
+
+ return psi ? &psi->pst : NULL;
+}
+
+void dm_put_path_selector(struct path_selector_type *pst)
+{
+ struct ps_internal *psi;
+
+ if (!pst)
+ return;
+
+ down_read(&_ps_lock);
+ psi = __find_path_selector_type(pst->name);
+ if (!psi)
+ goto out;
+
+ module_put(psi->pst.module);
+out:
+ up_read(&_ps_lock);
+}
+
+static struct ps_internal *_alloc_path_selector(struct path_selector_type *pst)
+{
+ struct ps_internal *psi = kzalloc(sizeof(*psi), GFP_KERNEL);
+
+ if (psi)
+ psi->pst = *pst;
+
+ return psi;
+}
+
+int dm_register_path_selector(struct path_selector_type *pst)
+{
+ int r = 0;
+ struct ps_internal *psi = _alloc_path_selector(pst);
+
+ if (!psi)
+ return -ENOMEM;
+
+ down_write(&_ps_lock);
+
+ if (__find_path_selector_type(pst->name)) {
+ kfree(psi);
+ r = -EEXIST;
+ } else
+ list_add(&psi->list, &_path_selectors);
+
+ up_write(&_ps_lock);
+
+ return r;
+}
+
+int dm_unregister_path_selector(struct path_selector_type *pst)
+{
+ struct ps_internal *psi;
+
+ down_write(&_ps_lock);
+
+ psi = __find_path_selector_type(pst->name);
+ if (!psi) {
+ up_write(&_ps_lock);
+ return -EINVAL;
+ }
+
+ list_del(&psi->list);
+
+ up_write(&_ps_lock);
+
+ kfree(psi);
+
+ return 0;
+}
+
+EXPORT_SYMBOL_GPL(dm_register_path_selector);
+EXPORT_SYMBOL_GPL(dm_unregister_path_selector);
diff --git a/drivers/md/dm-path-selector.h b/drivers/md/dm-path-selector.h
new file mode 100644
index 000000000..e7d1fa8b0
--- /dev/null
+++ b/drivers/md/dm-path-selector.h
@@ -0,0 +1,97 @@
+/*
+ * Copyright (C) 2003 Sistina Software.
+ * Copyright (C) 2004 Red Hat, Inc. All rights reserved.
+ *
+ * Module Author: Heinz Mauelshagen
+ *
+ * This file is released under the GPL.
+ *
+ * Path-Selector registration.
+ */
+
+#ifndef DM_PATH_SELECTOR_H
+#define DM_PATH_SELECTOR_H
+
+#include <linux/device-mapper.h>
+
+#include "dm-mpath.h"
+
+/*
+ * We provide an abstraction for the code that chooses which path
+ * to send some io down.
+ */
+struct path_selector_type;
+struct path_selector {
+ struct path_selector_type *type;
+ void *context;
+};
+
+/* Information about a path selector type */
+struct path_selector_type {
+ char *name;
+ struct module *module;
+
+ unsigned int table_args;
+ unsigned int info_args;
+
+ /*
+ * Constructs a path selector object, takes custom arguments
+ */
+ int (*create) (struct path_selector *ps, unsigned argc, char **argv);
+ void (*destroy) (struct path_selector *ps);
+
+ /*
+ * Add an opaque path object, along with some selector specific
+ * path args (eg, path priority).
+ */
+ int (*add_path) (struct path_selector *ps, struct dm_path *path,
+ int argc, char **argv, char **error);
+
+ /*
+ * Chooses a path for this io, if no paths are available then
+ * NULL will be returned.
+ *
+ * repeat_count is the number of times to use the path before
+ * calling the function again. 0 means don't call it again unless
+ * the path fails.
+ */
+ struct dm_path *(*select_path) (struct path_selector *ps,
+ unsigned *repeat_count,
+ size_t nr_bytes);
+
+ /*
+ * Notify the selector that a path has failed.
+ */
+ void (*fail_path) (struct path_selector *ps, struct dm_path *p);
+
+ /*
+ * Ask selector to reinstate a path.
+ */
+ int (*reinstate_path) (struct path_selector *ps, struct dm_path *p);
+
+ /*
+ * Table content based on parameters added in ps_add_path_fn
+ * or path selector status
+ */
+ int (*status) (struct path_selector *ps, struct dm_path *path,
+ status_type_t type, char *result, unsigned int maxlen);
+
+ int (*start_io) (struct path_selector *ps, struct dm_path *path,
+ size_t nr_bytes);
+ int (*end_io) (struct path_selector *ps, struct dm_path *path,
+ size_t nr_bytes);
+};
+
+/* Register a path selector */
+int dm_register_path_selector(struct path_selector_type *type);
+
+/* Unregister a path selector */
+int dm_unregister_path_selector(struct path_selector_type *type);
+
+/* Returns a registered path selector type */
+struct path_selector_type *dm_get_path_selector(const char *name);
+
+/* Releases a path selector */
+void dm_put_path_selector(struct path_selector_type *pst);
+
+#endif
diff --git a/drivers/md/dm-queue-length.c b/drivers/md/dm-queue-length.c
new file mode 100644
index 000000000..3941fae0d
--- /dev/null
+++ b/drivers/md/dm-queue-length.c
@@ -0,0 +1,264 @@
+/*
+ * Copyright (C) 2004-2005 IBM Corp. All Rights Reserved.
+ * Copyright (C) 2006-2009 NEC Corporation.
+ *
+ * dm-queue-length.c
+ *
+ * Module Author: Stefan Bader, IBM
+ * Modified by: Kiyoshi Ueda, NEC
+ *
+ * This file is released under the GPL.
+ *
+ * queue-length path selector - choose a path with the least number of
+ * in-flight I/Os.
+ */
+
+#include "dm.h"
+#include "dm-path-selector.h"
+
+#include <linux/slab.h>
+#include <linux/ctype.h>
+#include <linux/errno.h>
+#include <linux/module.h>
+#include <linux/atomic.h>
+
+#define DM_MSG_PREFIX "multipath queue-length"
+#define QL_MIN_IO 128
+#define QL_VERSION "0.1.0"
+
+struct selector {
+ struct list_head valid_paths;
+ struct list_head failed_paths;
+};
+
+struct path_info {
+ struct list_head list;
+ struct dm_path *path;
+ unsigned repeat_count;
+ atomic_t qlen; /* the number of in-flight I/Os */
+};
+
+static struct selector *alloc_selector(void)
+{
+ struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL);
+
+ if (s) {
+ INIT_LIST_HEAD(&s->valid_paths);
+ INIT_LIST_HEAD(&s->failed_paths);
+ }
+
+ return s;
+}
+
+static int ql_create(struct path_selector *ps, unsigned argc, char **argv)
+{
+ struct selector *s = alloc_selector();
+
+ if (!s)
+ return -ENOMEM;
+
+ ps->context = s;
+ return 0;
+}
+
+static void ql_free_paths(struct list_head *paths)
+{
+ struct path_info *pi, *next;
+
+ list_for_each_entry_safe(pi, next, paths, list) {
+ list_del(&pi->list);
+ kfree(pi);
+ }
+}
+
+static void ql_destroy(struct path_selector *ps)
+{
+ struct selector *s = ps->context;
+
+ ql_free_paths(&s->valid_paths);
+ ql_free_paths(&s->failed_paths);
+ kfree(s);
+ ps->context = NULL;
+}
+
+static int ql_status(struct path_selector *ps, struct dm_path *path,
+ status_type_t type, char *result, unsigned maxlen)
+{
+ unsigned sz = 0;
+ struct path_info *pi;
+
+ /* When called with NULL path, return selector status/args. */
+ if (!path)
+ DMEMIT("0 ");
+ else {
+ pi = path->pscontext;
+
+ switch (type) {
+ case STATUSTYPE_INFO:
+ DMEMIT("%d ", atomic_read(&pi->qlen));
+ break;
+ case STATUSTYPE_TABLE:
+ DMEMIT("%u ", pi->repeat_count);
+ break;
+ }
+ }
+
+ return sz;
+}
+
+static int ql_add_path(struct path_selector *ps, struct dm_path *path,
+ int argc, char **argv, char **error)
+{
+ struct selector *s = ps->context;
+ struct path_info *pi;
+ unsigned repeat_count = QL_MIN_IO;
+ char dummy;
+
+ /*
+ * Arguments: [<repeat_count>]
+ * <repeat_count>: The number of I/Os before switching path.
+ * If not given, default (QL_MIN_IO) is used.
+ */
+ if (argc > 1) {
+ *error = "queue-length ps: incorrect number of arguments";
+ return -EINVAL;
+ }
+
+ if ((argc == 1) && (sscanf(argv[0], "%u%c", &repeat_count, &dummy) != 1)) {
+ *error = "queue-length ps: invalid repeat count";
+ return -EINVAL;
+ }
+
+ /* Allocate the path information structure */
+ pi = kmalloc(sizeof(*pi), GFP_KERNEL);
+ if (!pi) {
+ *error = "queue-length ps: Error allocating path information";
+ return -ENOMEM;
+ }
+
+ pi->path = path;
+ pi->repeat_count = repeat_count;
+ atomic_set(&pi->qlen, 0);
+
+ path->pscontext = pi;
+
+ list_add_tail(&pi->list, &s->valid_paths);
+
+ return 0;
+}
+
+static void ql_fail_path(struct path_selector *ps, struct dm_path *path)
+{
+ struct selector *s = ps->context;
+ struct path_info *pi = path->pscontext;
+
+ list_move(&pi->list, &s->failed_paths);
+}
+
+static int ql_reinstate_path(struct path_selector *ps, struct dm_path *path)
+{
+ struct selector *s = ps->context;
+ struct path_info *pi = path->pscontext;
+
+ list_move_tail(&pi->list, &s->valid_paths);
+
+ return 0;
+}
+
+/*
+ * Select a path having the minimum number of in-flight I/Os
+ */
+static struct dm_path *ql_select_path(struct path_selector *ps,
+ unsigned *repeat_count, size_t nr_bytes)
+{
+ struct selector *s = ps->context;
+ struct path_info *pi = NULL, *best = NULL;
+
+ if (list_empty(&s->valid_paths))
+ return NULL;
+
+ /* Change preferred (first in list) path to evenly balance. */
+ list_move_tail(s->valid_paths.next, &s->valid_paths);
+
+ list_for_each_entry(pi, &s->valid_paths, list) {
+ if (!best ||
+ (atomic_read(&pi->qlen) < atomic_read(&best->qlen)))
+ best = pi;
+
+ if (!atomic_read(&best->qlen))
+ break;
+ }
+
+ if (!best)
+ return NULL;
+
+ *repeat_count = best->repeat_count;
+
+ return best->path;
+}
+
+static int ql_start_io(struct path_selector *ps, struct dm_path *path,
+ size_t nr_bytes)
+{
+ struct path_info *pi = path->pscontext;
+
+ atomic_inc(&pi->qlen);
+
+ return 0;
+}
+
+static int ql_end_io(struct path_selector *ps, struct dm_path *path,
+ size_t nr_bytes)
+{
+ struct path_info *pi = path->pscontext;
+
+ atomic_dec(&pi->qlen);
+
+ return 0;
+}
+
+static struct path_selector_type ql_ps = {
+ .name = "queue-length",
+ .module = THIS_MODULE,
+ .table_args = 1,
+ .info_args = 1,
+ .create = ql_create,
+ .destroy = ql_destroy,
+ .status = ql_status,
+ .add_path = ql_add_path,
+ .fail_path = ql_fail_path,
+ .reinstate_path = ql_reinstate_path,
+ .select_path = ql_select_path,
+ .start_io = ql_start_io,
+ .end_io = ql_end_io,
+};
+
+static int __init dm_ql_init(void)
+{
+ int r = dm_register_path_selector(&ql_ps);
+
+ if (r < 0)
+ DMERR("register failed %d", r);
+
+ DMINFO("version " QL_VERSION " loaded");
+
+ return r;
+}
+
+static void __exit dm_ql_exit(void)
+{
+ int r = dm_unregister_path_selector(&ql_ps);
+
+ if (r < 0)
+ DMERR("unregister failed %d", r);
+}
+
+module_init(dm_ql_init);
+module_exit(dm_ql_exit);
+
+MODULE_AUTHOR("Stefan Bader <Stefan.Bader at de.ibm.com>");
+MODULE_DESCRIPTION(
+ "(C) Copyright IBM Corp. 2004,2005 All Rights Reserved.\n"
+ DM_NAME " path selector to balance the number of in-flight I/Os"
+);
+MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
new file mode 100644
index 000000000..88e4c7f24
--- /dev/null
+++ b/drivers/md/dm-raid.c
@@ -0,0 +1,1748 @@
+/*
+ * Copyright (C) 2010-2011 Neil Brown
+ * Copyright (C) 2010-2014 Red Hat, Inc. All rights reserved.
+ *
+ * This file is released under the GPL.
+ */
+
+#include <linux/slab.h>
+#include <linux/module.h>
+
+#include "md.h"
+#include "raid1.h"
+#include "raid5.h"
+#include "raid10.h"
+#include "bitmap.h"
+
+#include <linux/device-mapper.h>
+
+#define DM_MSG_PREFIX "raid"
+
+static bool devices_handle_discard_safely = false;
+
+/*
+ * The following flags are used by dm-raid.c to set up the array state.
+ * They must be cleared before md_run is called.
+ */
+#define FirstUse 10 /* rdev flag */
+
+struct raid_dev {
+ /*
+ * Two DM devices, one to hold metadata and one to hold the
+ * actual data/parity. The reason for this is to not confuse
+ * ti->len and give more flexibility in altering size and
+ * characteristics.
+ *
+ * While it is possible for this device to be associated
+ * with a different physical device than the data_dev, it
+ * is intended for it to be the same.
+ * |--------- Physical Device ---------|
+ * |- meta_dev -|------ data_dev ------|
+ */
+ struct dm_dev *meta_dev;
+ struct dm_dev *data_dev;
+ struct md_rdev rdev;
+};
+
+/*
+ * Flags for rs->print_flags field.
+ */
+#define DMPF_SYNC 0x1
+#define DMPF_NOSYNC 0x2
+#define DMPF_REBUILD 0x4
+#define DMPF_DAEMON_SLEEP 0x8
+#define DMPF_MIN_RECOVERY_RATE 0x10
+#define DMPF_MAX_RECOVERY_RATE 0x20
+#define DMPF_MAX_WRITE_BEHIND 0x40
+#define DMPF_STRIPE_CACHE 0x80
+#define DMPF_REGION_SIZE 0x100
+#define DMPF_RAID10_COPIES 0x200
+#define DMPF_RAID10_FORMAT 0x400
+
+struct raid_set {
+ struct dm_target *ti;
+
+ uint32_t bitmap_loaded;
+ uint32_t print_flags;
+
+ struct mddev md;
+ struct raid_type *raid_type;
+ struct dm_target_callbacks callbacks;
+
+ struct raid_dev dev[0];
+};
+
+/* Supported raid types and properties. */
+static struct raid_type {
+ const char *name; /* RAID algorithm. */
+ const char *descr; /* Descriptor text for logging. */
+ const unsigned parity_devs; /* # of parity devices. */
+ const unsigned minimal_devs; /* minimal # of devices in set. */
+ const unsigned level; /* RAID level. */
+ const unsigned algorithm; /* RAID algorithm. */
+} raid_types[] = {
+ {"raid1", "RAID1 (mirroring)", 0, 2, 1, 0 /* NONE */},
+ {"raid10", "RAID10 (striped mirrors)", 0, 2, 10, UINT_MAX /* Varies */},
+ {"raid4", "RAID4 (dedicated parity disk)", 1, 2, 5, ALGORITHM_PARITY_0},
+ {"raid5_la", "RAID5 (left asymmetric)", 1, 2, 5, ALGORITHM_LEFT_ASYMMETRIC},
+ {"raid5_ra", "RAID5 (right asymmetric)", 1, 2, 5, ALGORITHM_RIGHT_ASYMMETRIC},
+ {"raid5_ls", "RAID5 (left symmetric)", 1, 2, 5, ALGORITHM_LEFT_SYMMETRIC},
+ {"raid5_rs", "RAID5 (right symmetric)", 1, 2, 5, ALGORITHM_RIGHT_SYMMETRIC},
+ {"raid6_zr", "RAID6 (zero restart)", 2, 4, 6, ALGORITHM_ROTATING_ZERO_RESTART},
+ {"raid6_nr", "RAID6 (N restart)", 2, 4, 6, ALGORITHM_ROTATING_N_RESTART},
+ {"raid6_nc", "RAID6 (N continue)", 2, 4, 6, ALGORITHM_ROTATING_N_CONTINUE}
+};
+
+static char *raid10_md_layout_to_format(int layout)
+{
+ /*
+ * Bit 16 and 17 stand for "offset" and "use_far_sets"
+ * Refer to MD's raid10.c for details
+ */
+ if ((layout & 0x10000) && (layout & 0x20000))
+ return "offset";
+
+ if ((layout & 0xFF) > 1)
+ return "near";
+
+ return "far";
+}
+
+static unsigned raid10_md_layout_to_copies(int layout)
+{
+ if ((layout & 0xFF) > 1)
+ return layout & 0xFF;
+ return (layout >> 8) & 0xFF;
+}
+
+static int raid10_format_to_md_layout(char *format, unsigned copies)
+{
+ unsigned n = 1, f = 1;
+
+ if (!strcmp("near", format))
+ n = copies;
+ else
+ f = copies;
+
+ if (!strcmp("offset", format))
+ return 0x30000 | (f << 8) | n;
+
+ if (!strcmp("far", format))
+ return 0x20000 | (f << 8) | n;
+
+ return (f << 8) | n;
+}
+
+static struct raid_type *get_raid_type(char *name)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(raid_types); i++)
+ if (!strcmp(raid_types[i].name, name))
+ return &raid_types[i];
+
+ return NULL;
+}
+
+static struct raid_set *context_alloc(struct dm_target *ti, struct raid_type *raid_type, unsigned raid_devs)
+{
+ unsigned i;
+ struct raid_set *rs;
+
+ if (raid_devs <= raid_type->parity_devs) {
+ ti->error = "Insufficient number of devices";
+ return ERR_PTR(-EINVAL);
+ }
+
+ rs = kzalloc(sizeof(*rs) + raid_devs * sizeof(rs->dev[0]), GFP_KERNEL);
+ if (!rs) {
+ ti->error = "Cannot allocate raid context";
+ return ERR_PTR(-ENOMEM);
+ }
+
+ mddev_init(&rs->md);
+
+ rs->ti = ti;
+ rs->raid_type = raid_type;
+ rs->md.raid_disks = raid_devs;
+ rs->md.level = raid_type->level;
+ rs->md.new_level = rs->md.level;
+ rs->md.layout = raid_type->algorithm;
+ rs->md.new_layout = rs->md.layout;
+ rs->md.delta_disks = 0;
+ rs->md.recovery_cp = 0;
+
+ for (i = 0; i < raid_devs; i++)
+ md_rdev_init(&rs->dev[i].rdev);
+
+ /*
+ * Remaining items to be initialized by further RAID params:
+ * rs->md.persistent
+ * rs->md.external
+ * rs->md.chunk_sectors
+ * rs->md.new_chunk_sectors
+ * rs->md.dev_sectors
+ */
+
+ return rs;
+}
+
+static void context_free(struct raid_set *rs)
+{
+ int i;
+
+ for (i = 0; i < rs->md.raid_disks; i++) {
+ if (rs->dev[i].meta_dev)
+ dm_put_device(rs->ti, rs->dev[i].meta_dev);
+ md_rdev_clear(&rs->dev[i].rdev);
+ if (rs->dev[i].data_dev)
+ dm_put_device(rs->ti, rs->dev[i].data_dev);
+ }
+
+ kfree(rs);
+}
+
+/*
+ * For every device we have two words
+ * <meta_dev>: meta device name or '-' if missing
+ * <data_dev>: data device name or '-' if missing
+ *
+ * The following are permitted:
+ * - -
+ * - <data_dev>
+ * <meta_dev> <data_dev>
+ *
+ * The following is not allowed:
+ * <meta_dev> -
+ *
+ * This code parses those words. If there is a failure,
+ * the caller must use context_free to unwind the operations.
+ */
+static int dev_parms(struct raid_set *rs, char **argv)
+{
+ int i;
+ int rebuild = 0;
+ int metadata_available = 0;
+ int ret = 0;
+
+ for (i = 0; i < rs->md.raid_disks; i++, argv += 2) {
+ rs->dev[i].rdev.raid_disk = i;
+
+ rs->dev[i].meta_dev = NULL;
+ rs->dev[i].data_dev = NULL;
+
+ /*
+ * There are no offsets, since there is a separate device
+ * for data and metadata.
+ */
+ rs->dev[i].rdev.data_offset = 0;
+ rs->dev[i].rdev.mddev = &rs->md;
+
+ if (strcmp(argv[0], "-")) {
+ ret = dm_get_device(rs->ti, argv[0],
+ dm_table_get_mode(rs->ti->table),
+ &rs->dev[i].meta_dev);
+ rs->ti->error = "RAID metadata device lookup failure";
+ if (ret)
+ return ret;
+
+ rs->dev[i].rdev.sb_page = alloc_page(GFP_KERNEL);
+ if (!rs->dev[i].rdev.sb_page)
+ return -ENOMEM;
+ }
+
+ if (!strcmp(argv[1], "-")) {
+ if (!test_bit(In_sync, &rs->dev[i].rdev.flags) &&
+ (!rs->dev[i].rdev.recovery_offset)) {
+ rs->ti->error = "Drive designated for rebuild not specified";
+ return -EINVAL;
+ }
+
+ rs->ti->error = "No data device supplied with metadata device";
+ if (rs->dev[i].meta_dev)
+ return -EINVAL;
+
+ continue;
+ }
+
+ ret = dm_get_device(rs->ti, argv[1],
+ dm_table_get_mode(rs->ti->table),
+ &rs->dev[i].data_dev);
+ if (ret) {
+ rs->ti->error = "RAID device lookup failure";
+ return ret;
+ }
+
+ if (rs->dev[i].meta_dev) {
+ metadata_available = 1;
+ rs->dev[i].rdev.meta_bdev = rs->dev[i].meta_dev->bdev;
+ }
+ rs->dev[i].rdev.bdev = rs->dev[i].data_dev->bdev;
+ list_add(&rs->dev[i].rdev.same_set, &rs->md.disks);
+ if (!test_bit(In_sync, &rs->dev[i].rdev.flags))
+ rebuild++;
+ }
+
+ if (metadata_available) {
+ rs->md.external = 0;
+ rs->md.persistent = 1;
+ rs->md.major_version = 2;
+ } else if (rebuild && !rs->md.recovery_cp) {
+ /*
+ * Without metadata, we will not be able to tell if the array
+ * is in-sync or not - we must assume it is not. Therefore,
+ * it is impossible to rebuild a drive.
+ *
+ * Even if there is metadata, the on-disk information may
+ * indicate that the array is not in-sync and it will then
+ * fail at that time.
+ *
+ * User could specify 'nosync' option if desperate.
+ */
+ DMERR("Unable to rebuild drive while array is not in-sync");
+ rs->ti->error = "RAID device lookup failure";
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/*
+ * validate_region_size
+ * @rs
+ * @region_size: region size in sectors. If 0, pick a size (4MiB default).
+ *
+ * Set rs->md.bitmap_info.chunksize (which really refers to 'region size').
+ * Ensure that (ti->len/region_size < 2^21) - required by MD bitmap.
+ *
+ * Returns: 0 on success, -EINVAL on failure.
+ */
+static int validate_region_size(struct raid_set *rs, unsigned long region_size)
+{
+ unsigned long min_region_size = rs->ti->len / (1 << 21);
+
+ if (!region_size) {
+ /*
+ * Choose a reasonable default. All figures in sectors.
+ */
+ if (min_region_size > (1 << 13)) {
+ /* If not a power of 2, make it the next power of 2 */
+ if (min_region_size & (min_region_size - 1))
+ region_size = 1 << fls(region_size);
+ DMINFO("Choosing default region size of %lu sectors",
+ region_size);
+ } else {
+ DMINFO("Choosing default region size of 4MiB");
+ region_size = 1 << 13; /* sectors */
+ }
+ } else {
+ /*
+ * Validate user-supplied value.
+ */
+ if (region_size > rs->ti->len) {
+ rs->ti->error = "Supplied region size is too large";
+ return -EINVAL;
+ }
+
+ if (region_size < min_region_size) {
+ DMERR("Supplied region_size (%lu sectors) below minimum (%lu)",
+ region_size, min_region_size);
+ rs->ti->error = "Supplied region size is too small";
+ return -EINVAL;
+ }
+
+ if (!is_power_of_2(region_size)) {
+ rs->ti->error = "Region size is not a power of 2";
+ return -EINVAL;
+ }
+
+ if (region_size < rs->md.chunk_sectors) {
+ rs->ti->error = "Region size is smaller than the chunk size";
+ return -EINVAL;
+ }
+ }
+
+ /*
+ * Convert sectors to bytes.
+ */
+ rs->md.bitmap_info.chunksize = (region_size << 9);
+
+ return 0;
+}
+
+/*
+ * validate_raid_redundancy
+ * @rs
+ *
+ * Determine if there are enough devices in the array that haven't
+ * failed (or are being rebuilt) to form a usable array.
+ *
+ * Returns: 0 on success, -EINVAL on failure.
+ */
+static int validate_raid_redundancy(struct raid_set *rs)
+{
+ unsigned i, rebuild_cnt = 0;
+ unsigned rebuilds_per_group = 0, copies, d;
+ unsigned group_size, last_group_start;
+
+ for (i = 0; i < rs->md.raid_disks; i++)
+ if (!test_bit(In_sync, &rs->dev[i].rdev.flags) ||
+ !rs->dev[i].rdev.sb_page)
+ rebuild_cnt++;
+
+ switch (rs->raid_type->level) {
+ case 1:
+ if (rebuild_cnt >= rs->md.raid_disks)
+ goto too_many;
+ break;
+ case 4:
+ case 5:
+ case 6:
+ if (rebuild_cnt > rs->raid_type->parity_devs)
+ goto too_many;
+ break;
+ case 10:
+ copies = raid10_md_layout_to_copies(rs->md.layout);
+ if (rebuild_cnt < copies)
+ break;
+
+ /*
+ * It is possible to have a higher rebuild count for RAID10,
+ * as long as the failed devices occur in different mirror
+ * groups (i.e. different stripes).
+ *
+ * When checking "near" format, make sure no adjacent devices
+ * have failed beyond what can be handled. In addition to the
+ * simple case where the number of devices is a multiple of the
+ * number of copies, we must also handle cases where the number
+ * of devices is not a multiple of the number of copies.
+ * E.g. dev1 dev2 dev3 dev4 dev5
+ * A A B B C
+ * C D D E E
+ */
+ if (!strcmp("near", raid10_md_layout_to_format(rs->md.layout))) {
+ for (i = 0; i < rs->md.raid_disks * copies; i++) {
+ if (!(i % copies))
+ rebuilds_per_group = 0;
+ d = i % rs->md.raid_disks;
+ if ((!rs->dev[d].rdev.sb_page ||
+ !test_bit(In_sync, &rs->dev[d].rdev.flags)) &&
+ (++rebuilds_per_group >= copies))
+ goto too_many;
+ }
+ break;
+ }
+
+ /*
+ * When checking "far" and "offset" formats, we need to ensure
+ * that the device that holds its copy is not also dead or
+ * being rebuilt. (Note that "far" and "offset" formats only
+ * support two copies right now. These formats also only ever
+ * use the 'use_far_sets' variant.)
+ *
+ * This check is somewhat complicated by the need to account
+ * for arrays that are not a multiple of (far) copies. This
+ * results in the need to treat the last (potentially larger)
+ * set differently.
+ */
+ group_size = (rs->md.raid_disks / copies);
+ last_group_start = (rs->md.raid_disks / group_size) - 1;
+ last_group_start *= group_size;
+ for (i = 0; i < rs->md.raid_disks; i++) {
+ if (!(i % copies) && !(i > last_group_start))
+ rebuilds_per_group = 0;
+ if ((!rs->dev[i].rdev.sb_page ||
+ !test_bit(In_sync, &rs->dev[i].rdev.flags)) &&
+ (++rebuilds_per_group >= copies))
+ goto too_many;
+ }
+ break;
+ default:
+ if (rebuild_cnt)
+ return -EINVAL;
+ }
+
+ return 0;
+
+too_many:
+ return -EINVAL;
+}
+
+/*
+ * Possible arguments are...
+ * <chunk_size> [optional_args]
+ *
+ * Argument definitions
+ * <chunk_size> The number of sectors per disk that
+ * will form the "stripe"
+ * [[no]sync] Force or prevent recovery of the
+ * entire array
+ * [devices_handle_discard_safely] Allow discards on RAID4/5/6; useful if RAID
+ * member device(s) properly support TRIM/UNMAP
+ * [rebuild <idx>] Rebuild the drive indicated by the index
+ * [daemon_sleep <ms>] Time between bitmap daemon work to
+ * clear bits
+ * [min_recovery_rate <kB/sec/disk>] Throttle RAID initialization
+ * [max_recovery_rate <kB/sec/disk>] Throttle RAID initialization
+ * [write_mostly <idx>] Indicate a write mostly drive via index
+ * [max_write_behind <sectors>] See '-write-behind=' (man mdadm)
+ * [stripe_cache <sectors>] Stripe cache size for higher RAIDs
+ * [region_size <sectors>] Defines granularity of bitmap
+ *
+ * RAID10-only options:
+ * [raid10_copies <# copies>] Number of copies. (Default: 2)
+ * [raid10_format <near|far|offset>] Layout algorithm. (Default: near)
+ */
+static int parse_raid_params(struct raid_set *rs, char **argv,
+ unsigned num_raid_params)
+{
+ char *raid10_format = "near";
+ unsigned raid10_copies = 2;
+ unsigned i;
+ unsigned long value, region_size = 0;
+ sector_t sectors_per_dev = rs->ti->len;
+ sector_t max_io_len;
+ char *key;
+
+ /*
+ * First, parse the in-order required arguments
+ * "chunk_size" is the only argument of this type.
+ */
+ if ((kstrtoul(argv[0], 10, &value) < 0)) {
+ rs->ti->error = "Bad chunk size";
+ return -EINVAL;
+ } else if (rs->raid_type->level == 1) {
+ if (value)
+ DMERR("Ignoring chunk size parameter for RAID 1");
+ value = 0;
+ } else if (!is_power_of_2(value)) {
+ rs->ti->error = "Chunk size must be a power of 2";
+ return -EINVAL;
+ } else if (value < 8) {
+ rs->ti->error = "Chunk size value is too small";
+ return -EINVAL;
+ }
+
+ rs->md.new_chunk_sectors = rs->md.chunk_sectors = value;
+ argv++;
+ num_raid_params--;
+
+ /*
+ * We set each individual device as In_sync with a completed
+ * 'recovery_offset'. If there has been a device failure or
+ * replacement then one of the following cases applies:
+ *
+ * 1) User specifies 'rebuild'.
+ * - Device is reset when param is read.
+ * 2) A new device is supplied.
+ * - No matching superblock found, resets device.
+ * 3) Device failure was transient and returns on reload.
+ * - Failure noticed, resets device for bitmap replay.
+ * 4) Device hadn't completed recovery after previous failure.
+ * - Superblock is read and overrides recovery_offset.
+ *
+ * What is found in the superblocks of the devices is always
+ * authoritative, unless 'rebuild' or '[no]sync' was specified.
+ */
+ for (i = 0; i < rs->md.raid_disks; i++) {
+ set_bit(In_sync, &rs->dev[i].rdev.flags);
+ rs->dev[i].rdev.recovery_offset = MaxSector;
+ }
+
+ /*
+ * Second, parse the unordered optional arguments
+ */
+ for (i = 0; i < num_raid_params; i++) {
+ if (!strcasecmp(argv[i], "nosync")) {
+ rs->md.recovery_cp = MaxSector;
+ rs->print_flags |= DMPF_NOSYNC;
+ continue;
+ }
+ if (!strcasecmp(argv[i], "sync")) {
+ rs->md.recovery_cp = 0;
+ rs->print_flags |= DMPF_SYNC;
+ continue;
+ }
+
+ /* The rest of the optional arguments come in key/value pairs */
+ if ((i + 1) >= num_raid_params) {
+ rs->ti->error = "Wrong number of raid parameters given";
+ return -EINVAL;
+ }
+
+ key = argv[i++];
+
+ /* Parameters that take a string value are checked here. */
+ if (!strcasecmp(key, "raid10_format")) {
+ if (rs->raid_type->level != 10) {
+ rs->ti->error = "'raid10_format' is an invalid parameter for this RAID type";
+ return -EINVAL;
+ }
+ if (strcmp("near", argv[i]) &&
+ strcmp("far", argv[i]) &&
+ strcmp("offset", argv[i])) {
+ rs->ti->error = "Invalid 'raid10_format' value given";
+ return -EINVAL;
+ }
+ raid10_format = argv[i];
+ rs->print_flags |= DMPF_RAID10_FORMAT;
+ continue;
+ }
+
+ if (kstrtoul(argv[i], 10, &value) < 0) {
+ rs->ti->error = "Bad numerical argument given in raid params";
+ return -EINVAL;
+ }
+
+ /* Parameters that take a numeric value are checked here */
+ if (!strcasecmp(key, "rebuild")) {
+ if (value >= rs->md.raid_disks) {
+ rs->ti->error = "Invalid rebuild index given";
+ return -EINVAL;
+ }
+ clear_bit(In_sync, &rs->dev[value].rdev.flags);
+ rs->dev[value].rdev.recovery_offset = 0;
+ rs->print_flags |= DMPF_REBUILD;
+ } else if (!strcasecmp(key, "write_mostly")) {
+ if (rs->raid_type->level != 1) {
+ rs->ti->error = "write_mostly option is only valid for RAID1";
+ return -EINVAL;
+ }
+ if (value >= rs->md.raid_disks) {
+ rs->ti->error = "Invalid write_mostly drive index given";
+ return -EINVAL;
+ }
+ set_bit(WriteMostly, &rs->dev[value].rdev.flags);
+ } else if (!strcasecmp(key, "max_write_behind")) {
+ if (rs->raid_type->level != 1) {
+ rs->ti->error = "max_write_behind option is only valid for RAID1";
+ return -EINVAL;
+ }
+ rs->print_flags |= DMPF_MAX_WRITE_BEHIND;
+
+ /*
+ * In device-mapper, we specify things in sectors, but
+ * MD records this value in kB
+ */
+ value /= 2;
+ if (value > COUNTER_MAX) {
+ rs->ti->error = "Max write-behind limit out of range";
+ return -EINVAL;
+ }
+ rs->md.bitmap_info.max_write_behind = value;
+ } else if (!strcasecmp(key, "daemon_sleep")) {
+ rs->print_flags |= DMPF_DAEMON_SLEEP;
+ if (!value || (value > MAX_SCHEDULE_TIMEOUT)) {
+ rs->ti->error = "daemon sleep period out of range";
+ return -EINVAL;
+ }
+ rs->md.bitmap_info.daemon_sleep = value;
+ } else if (!strcasecmp(key, "stripe_cache")) {
+ rs->print_flags |= DMPF_STRIPE_CACHE;
+
+ /*
+ * In device-mapper, we specify things in sectors, but
+ * MD records this value in kB
+ */
+ value /= 2;
+
+ if ((rs->raid_type->level != 5) &&
+ (rs->raid_type->level != 6)) {
+ rs->ti->error = "Inappropriate argument: stripe_cache";
+ return -EINVAL;
+ }
+ if (raid5_set_cache_size(&rs->md, (int)value)) {
+ rs->ti->error = "Bad stripe_cache size";
+ return -EINVAL;
+ }
+ } else if (!strcasecmp(key, "min_recovery_rate")) {
+ rs->print_flags |= DMPF_MIN_RECOVERY_RATE;
+ if (value > INT_MAX) {
+ rs->ti->error = "min_recovery_rate out of range";
+ return -EINVAL;
+ }
+ rs->md.sync_speed_min = (int)value;
+ } else if (!strcasecmp(key, "max_recovery_rate")) {
+ rs->print_flags |= DMPF_MAX_RECOVERY_RATE;
+ if (value > INT_MAX) {
+ rs->ti->error = "max_recovery_rate out of range";
+ return -EINVAL;
+ }
+ rs->md.sync_speed_max = (int)value;
+ } else if (!strcasecmp(key, "region_size")) {
+ rs->print_flags |= DMPF_REGION_SIZE;
+ region_size = value;
+ } else if (!strcasecmp(key, "raid10_copies") &&
+ (rs->raid_type->level == 10)) {
+ if ((value < 2) || (value > 0xFF)) {
+ rs->ti->error = "Bad value for 'raid10_copies'";
+ return -EINVAL;
+ }
+ rs->print_flags |= DMPF_RAID10_COPIES;
+ raid10_copies = value;
+ } else {
+ DMERR("Unable to parse RAID parameter: %s", key);
+ rs->ti->error = "Unable to parse RAID parameters";
+ return -EINVAL;
+ }
+ }
+
+ if (validate_region_size(rs, region_size))
+ return -EINVAL;
+
+ if (rs->md.chunk_sectors)
+ max_io_len = rs->md.chunk_sectors;
+ else
+ max_io_len = region_size;
+
+ if (dm_set_target_max_io_len(rs->ti, max_io_len))
+ return -EINVAL;
+
+ if (rs->raid_type->level == 10) {
+ if (raid10_copies > rs->md.raid_disks) {
+ rs->ti->error = "Not enough devices to satisfy specification";
+ return -EINVAL;
+ }
+
+ /*
+ * If the format is not "near", we only support
+ * two copies at the moment.
+ */
+ if (strcmp("near", raid10_format) && (raid10_copies > 2)) {
+ rs->ti->error = "Too many copies for given RAID10 format.";
+ return -EINVAL;
+ }
+
+ /* (Len * #mirrors) / #devices */
+ sectors_per_dev = rs->ti->len * raid10_copies;
+ sector_div(sectors_per_dev, rs->md.raid_disks);
+
+ rs->md.layout = raid10_format_to_md_layout(raid10_format,
+ raid10_copies);
+ rs->md.new_layout = rs->md.layout;
+ } else if ((rs->raid_type->level > 1) &&
+ sector_div(sectors_per_dev,
+ (rs->md.raid_disks - rs->raid_type->parity_devs))) {
+ rs->ti->error = "Target length not divisible by number of data devices";
+ return -EINVAL;
+ }
+ rs->md.dev_sectors = sectors_per_dev;
+
+ /* Assume there are no metadata devices until the drives are parsed */
+ rs->md.persistent = 0;
+ rs->md.external = 1;
+
+ return 0;
+}
+
+static void do_table_event(struct work_struct *ws)
+{
+ struct raid_set *rs = container_of(ws, struct raid_set, md.event_work);
+
+ dm_table_event(rs->ti->table);
+}
+
+static int raid_is_congested(struct dm_target_callbacks *cb, int bits)
+{
+ struct raid_set *rs = container_of(cb, struct raid_set, callbacks);
+
+ return mddev_congested(&rs->md, bits);
+}
+
+/*
+ * This structure is never routinely used by userspace, unlike md superblocks.
+ * Devices with this superblock should only ever be accessed via device-mapper.
+ */
+#define DM_RAID_MAGIC 0x64526D44
+struct dm_raid_superblock {
+ __le32 magic; /* "DmRd" */
+ __le32 features; /* Used to indicate possible future changes */
+
+ __le32 num_devices; /* Number of devices in this array. (Max 64) */
+ __le32 array_position; /* The position of this drive in the array */
+
+ __le64 events; /* Incremented by md when superblock updated */
+ __le64 failed_devices; /* Bit field of devices to indicate failures */
+
+ /*
+ * This offset tracks the progress of the repair or replacement of
+ * an individual drive.
+ */
+ __le64 disk_recovery_offset;
+
+ /*
+ * This offset tracks the progress of the initial array
+ * synchronisation/parity calculation.
+ */
+ __le64 array_resync_offset;
+
+ /*
+ * RAID characteristics
+ */
+ __le32 level;
+ __le32 layout;
+ __le32 stripe_sectors;
+
+ /* Remainder of a logical block is zero-filled when writing (see super_sync()). */
+} __packed;
+
+static int read_disk_sb(struct md_rdev *rdev, int size)
+{
+ BUG_ON(!rdev->sb_page);
+
+ if (rdev->sb_loaded)
+ return 0;
+
+ if (!sync_page_io(rdev, 0, size, rdev->sb_page, READ, 1)) {
+ DMERR("Failed to read superblock of device at position %d",
+ rdev->raid_disk);
+ md_error(rdev->mddev, rdev);
+ return -EINVAL;
+ }
+
+ rdev->sb_loaded = 1;
+
+ return 0;
+}
+
+static void super_sync(struct mddev *mddev, struct md_rdev *rdev)
+{
+ int i;
+ uint64_t failed_devices;
+ struct dm_raid_superblock *sb;
+ struct raid_set *rs = container_of(mddev, struct raid_set, md);
+
+ sb = page_address(rdev->sb_page);
+ failed_devices = le64_to_cpu(sb->failed_devices);
+
+ for (i = 0; i < mddev->raid_disks; i++)
+ if (!rs->dev[i].data_dev ||
+ test_bit(Faulty, &(rs->dev[i].rdev.flags)))
+ failed_devices |= (1ULL << i);
+
+ memset(sb + 1, 0, rdev->sb_size - sizeof(*sb));
+
+ sb->magic = cpu_to_le32(DM_RAID_MAGIC);
+ sb->features = cpu_to_le32(0); /* No features yet */
+
+ sb->num_devices = cpu_to_le32(mddev->raid_disks);
+ sb->array_position = cpu_to_le32(rdev->raid_disk);
+
+ sb->events = cpu_to_le64(mddev->events);
+ sb->failed_devices = cpu_to_le64(failed_devices);
+
+ sb->disk_recovery_offset = cpu_to_le64(rdev->recovery_offset);
+ sb->array_resync_offset = cpu_to_le64(mddev->recovery_cp);
+
+ sb->level = cpu_to_le32(mddev->level);
+ sb->layout = cpu_to_le32(mddev->layout);
+ sb->stripe_sectors = cpu_to_le32(mddev->chunk_sectors);
+}
+
+/*
+ * super_load
+ *
+ * This function creates a superblock if one is not found on the device
+ * and will decide which superblock to use if there's a choice.
+ *
+ * Return: 1 if use rdev, 0 if use refdev, -Exxx otherwise
+ */
+static int super_load(struct md_rdev *rdev, struct md_rdev *refdev)
+{
+ int ret;
+ struct dm_raid_superblock *sb;
+ struct dm_raid_superblock *refsb;
+ uint64_t events_sb, events_refsb;
+
+ rdev->sb_start = 0;
+ rdev->sb_size = bdev_logical_block_size(rdev->meta_bdev);
+ if (rdev->sb_size < sizeof(*sb) || rdev->sb_size > PAGE_SIZE) {
+ DMERR("superblock size of a logical block is no longer valid");
+ return -EINVAL;
+ }
+
+ ret = read_disk_sb(rdev, rdev->sb_size);
+ if (ret)
+ return ret;
+
+ sb = page_address(rdev->sb_page);
+
+ /*
+ * Two cases that we want to write new superblocks and rebuild:
+ * 1) New device (no matching magic number)
+ * 2) Device specified for rebuild (!In_sync w/ offset == 0)
+ */
+ if ((sb->magic != cpu_to_le32(DM_RAID_MAGIC)) ||
+ (!test_bit(In_sync, &rdev->flags) && !rdev->recovery_offset)) {
+ super_sync(rdev->mddev, rdev);
+
+ set_bit(FirstUse, &rdev->flags);
+
+ /* Force writing of superblocks to disk */
+ set_bit(MD_CHANGE_DEVS, &rdev->mddev->flags);
+
+ /* Any superblock is better than none, choose that if given */
+ return refdev ? 0 : 1;
+ }
+
+ if (!refdev)
+ return 1;
+
+ events_sb = le64_to_cpu(sb->events);
+
+ refsb = page_address(refdev->sb_page);
+ events_refsb = le64_to_cpu(refsb->events);
+
+ return (events_sb > events_refsb) ? 1 : 0;
+}
+
+static int super_init_validation(struct mddev *mddev, struct md_rdev *rdev)
+{
+ int role;
+ struct raid_set *rs = container_of(mddev, struct raid_set, md);
+ uint64_t events_sb;
+ uint64_t failed_devices;
+ struct dm_raid_superblock *sb;
+ uint32_t new_devs = 0;
+ uint32_t rebuilds = 0;
+ struct md_rdev *r;
+ struct dm_raid_superblock *sb2;
+
+ sb = page_address(rdev->sb_page);
+ events_sb = le64_to_cpu(sb->events);
+ failed_devices = le64_to_cpu(sb->failed_devices);
+
+ /*
+ * Initialise to 1 if this is a new superblock.
+ */
+ mddev->events = events_sb ? : 1;
+
+ /*
+ * Reshaping is not currently allowed
+ */
+ if (le32_to_cpu(sb->level) != mddev->level) {
+ DMERR("Reshaping arrays not yet supported. (RAID level change)");
+ return -EINVAL;
+ }
+ if (le32_to_cpu(sb->layout) != mddev->layout) {
+ DMERR("Reshaping arrays not yet supported. (RAID layout change)");
+ DMERR(" 0x%X vs 0x%X", le32_to_cpu(sb->layout), mddev->layout);
+ DMERR(" Old layout: %s w/ %d copies",
+ raid10_md_layout_to_format(le32_to_cpu(sb->layout)),
+ raid10_md_layout_to_copies(le32_to_cpu(sb->layout)));
+ DMERR(" New layout: %s w/ %d copies",
+ raid10_md_layout_to_format(mddev->layout),
+ raid10_md_layout_to_copies(mddev->layout));
+ return -EINVAL;
+ }
+ if (le32_to_cpu(sb->stripe_sectors) != mddev->chunk_sectors) {
+ DMERR("Reshaping arrays not yet supported. (stripe sectors change)");
+ return -EINVAL;
+ }
+
+ /* We can only change the number of devices in RAID1 right now */
+ if ((rs->raid_type->level != 1) &&
+ (le32_to_cpu(sb->num_devices) != mddev->raid_disks)) {
+ DMERR("Reshaping arrays not yet supported. (device count change)");
+ return -EINVAL;
+ }
+
+ if (!(rs->print_flags & (DMPF_SYNC | DMPF_NOSYNC)))
+ mddev->recovery_cp = le64_to_cpu(sb->array_resync_offset);
+
+ /*
+ * During load, we set FirstUse if a new superblock was written.
+ * There are two reasons we might not have a superblock:
+ * 1) The array is brand new - in which case, all of the
+ * devices must have their In_sync bit set. Also,
+ * recovery_cp must be 0, unless forced.
+ * 2) This is a new device being added to an old array
+ * and the new device needs to be rebuilt - in which
+ * case the In_sync bit will /not/ be set and
+ * recovery_cp must be MaxSector.
+ */
+ rdev_for_each(r, mddev) {
+ if (!test_bit(In_sync, &r->flags)) {
+ DMINFO("Device %d specified for rebuild: "
+ "Clearing superblock", r->raid_disk);
+ rebuilds++;
+ } else if (test_bit(FirstUse, &r->flags))
+ new_devs++;
+ }
+
+ if (!rebuilds) {
+ if (new_devs == mddev->raid_disks) {
+ DMINFO("Superblocks created for new array");
+ set_bit(MD_ARRAY_FIRST_USE, &mddev->flags);
+ } else if (new_devs) {
+ DMERR("New device injected "
+ "into existing array without 'rebuild' "
+ "parameter specified");
+ return -EINVAL;
+ }
+ } else if (new_devs) {
+ DMERR("'rebuild' devices cannot be "
+ "injected into an array with other first-time devices");
+ return -EINVAL;
+ } else if (mddev->recovery_cp != MaxSector) {
+ DMERR("'rebuild' specified while array is not in-sync");
+ return -EINVAL;
+ }
+
+ /*
+ * Now we set the Faulty bit for those devices that are
+ * recorded in the superblock as failed.
+ */
+ rdev_for_each(r, mddev) {
+ if (!r->sb_page)
+ continue;
+ sb2 = page_address(r->sb_page);
+ sb2->failed_devices = 0;
+
+ /*
+ * Check for any device re-ordering.
+ */
+ if (!test_bit(FirstUse, &r->flags) && (r->raid_disk >= 0)) {
+ role = le32_to_cpu(sb2->array_position);
+ if (role != r->raid_disk) {
+ if (rs->raid_type->level != 1) {
+ rs->ti->error = "Cannot change device "
+ "positions in RAID array";
+ return -EINVAL;
+ }
+ DMINFO("RAID1 device #%d now at position #%d",
+ role, r->raid_disk);
+ }
+
+ /*
+ * Partial recovery is performed on
+ * returning failed devices.
+ */
+ if (failed_devices & (1 << role))
+ set_bit(Faulty, &r->flags);
+ }
+ }
+
+ return 0;
+}
+
+static int super_validate(struct mddev *mddev, struct md_rdev *rdev)
+{
+ struct dm_raid_superblock *sb = page_address(rdev->sb_page);
+
+ /*
+ * If mddev->events is not set, we know we have not yet initialized
+ * the array.
+ */
+ if (!mddev->events && super_init_validation(mddev, rdev))
+ return -EINVAL;
+
+ mddev->bitmap_info.offset = 4096 >> 9; /* Enable bitmap creation */
+ rdev->mddev->bitmap_info.default_offset = 4096 >> 9;
+ if (!test_bit(FirstUse, &rdev->flags)) {
+ rdev->recovery_offset = le64_to_cpu(sb->disk_recovery_offset);
+ if (rdev->recovery_offset != MaxSector)
+ clear_bit(In_sync, &rdev->flags);
+ }
+
+ /*
+ * If a device comes back, set it as not In_sync and no longer faulty.
+ */
+ if (test_bit(Faulty, &rdev->flags)) {
+ clear_bit(Faulty, &rdev->flags);
+ clear_bit(In_sync, &rdev->flags);
+ rdev->saved_raid_disk = rdev->raid_disk;
+ rdev->recovery_offset = 0;
+ }
+
+ clear_bit(FirstUse, &rdev->flags);
+
+ return 0;
+}
+
+/*
+ * Analyse superblocks and select the freshest.
+ */
+static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
+{
+ int ret;
+ struct raid_dev *dev;
+ struct md_rdev *rdev, *tmp, *freshest;
+ struct mddev *mddev = &rs->md;
+
+ freshest = NULL;
+ rdev_for_each_safe(rdev, tmp, mddev) {
+ /*
+ * Skipping super_load due to DMPF_SYNC will cause
+ * the array to undergo initialization again as
+ * though it were new. This is the intended effect
+ * of the "sync" directive.
+ *
+ * When reshaping capability is added, we must ensure
+ * that the "sync" directive is disallowed during the
+ * reshape.
+ */
+ if (rs->print_flags & DMPF_SYNC)
+ continue;
+
+ if (!rdev->meta_bdev)
+ continue;
+
+ ret = super_load(rdev, freshest);
+
+ switch (ret) {
+ case 1:
+ freshest = rdev;
+ break;
+ case 0:
+ break;
+ default:
+ dev = container_of(rdev, struct raid_dev, rdev);
+ if (dev->meta_dev)
+ dm_put_device(ti, dev->meta_dev);
+
+ dev->meta_dev = NULL;
+ rdev->meta_bdev = NULL;
+
+ if (rdev->sb_page)
+ put_page(rdev->sb_page);
+
+ rdev->sb_page = NULL;
+
+ rdev->sb_loaded = 0;
+
+ /*
+ * We might be able to salvage the data device
+ * even though the meta device has failed. For
+ * now, we behave as though '- -' had been
+ * set for this device in the table.
+ */
+ if (dev->data_dev)
+ dm_put_device(ti, dev->data_dev);
+
+ dev->data_dev = NULL;
+ rdev->bdev = NULL;
+
+ list_del(&rdev->same_set);
+ }
+ }
+
+ if (!freshest)
+ return 0;
+
+ if (validate_raid_redundancy(rs)) {
+ rs->ti->error = "Insufficient redundancy to activate array";
+ return -EINVAL;
+ }
+
+ /*
+ * Validation of the freshest device provides the source of
+ * validation for the remaining devices.
+ */
+ ti->error = "Unable to assemble array: Invalid superblocks";
+ if (super_validate(mddev, freshest))
+ return -EINVAL;
+
+ rdev_for_each(rdev, mddev)
+ if ((rdev != freshest) && super_validate(mddev, rdev))
+ return -EINVAL;
+
+ return 0;
+}
+
+/*
+ * Enable/disable discard support on RAID set depending on
+ * RAID level and discard properties of underlying RAID members.
+ */
+static void configure_discard_support(struct dm_target *ti, struct raid_set *rs)
+{
+ int i;
+ bool raid456;
+
+ /* Assume discards not supported until after checks below. */
+ ti->discards_supported = false;
+
+ /* RAID level 4,5,6 require discard_zeroes_data for data integrity! */
+ raid456 = (rs->md.level == 4 || rs->md.level == 5 || rs->md.level == 6);
+
+ for (i = 0; i < rs->md.raid_disks; i++) {
+ struct request_queue *q;
+
+ if (!rs->dev[i].rdev.bdev)
+ continue;
+
+ q = bdev_get_queue(rs->dev[i].rdev.bdev);
+ if (!q || !blk_queue_discard(q))
+ return;
+
+ if (raid456) {
+ if (!q->limits.discard_zeroes_data)
+ return;
+ if (!devices_handle_discard_safely) {
+ DMERR("raid456 discard support disabled due to discard_zeroes_data uncertainty.");
+ DMERR("Set dm-raid.devices_handle_discard_safely=Y to override.");
+ return;
+ }
+ }
+ }
+
+ /* All RAID members properly support discards */
+ ti->discards_supported = true;
+
+ /*
+ * RAID1 and RAID10 personalities require bio splitting,
+ * RAID0/4/5/6 don't and process large discard bios properly.
+ */
+ ti->split_discard_bios = !!(rs->md.level == 1 || rs->md.level == 10);
+ ti->num_discard_bios = 1;
+}
+
+/*
+ * Construct a RAID4/5/6 mapping:
+ * Args:
+ * <raid_type> <#raid_params> <raid_params> \
+ * <#raid_devs> { <meta_dev1> <dev1> .. <meta_devN> <devN> }
+ *
+ * <raid_params> varies by <raid_type>. See 'parse_raid_params' for
+ * details on possible <raid_params>.
+ */
+static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
+{
+ int ret;
+ struct raid_type *rt;
+ unsigned long num_raid_params, num_raid_devs;
+ struct raid_set *rs = NULL;
+
+ /* Must have at least <raid_type> <#raid_params> */
+ if (argc < 2) {
+ ti->error = "Too few arguments";
+ return -EINVAL;
+ }
+
+ /* raid type */
+ rt = get_raid_type(argv[0]);
+ if (!rt) {
+ ti->error = "Unrecognised raid_type";
+ return -EINVAL;
+ }
+ argc--;
+ argv++;
+
+ /* number of RAID parameters */
+ if (kstrtoul(argv[0], 10, &num_raid_params) < 0) {
+ ti->error = "Cannot understand number of RAID parameters";
+ return -EINVAL;
+ }
+ argc--;
+ argv++;
+
+ /* Skip over RAID params for now and find out # of devices */
+ if (num_raid_params >= argc) {
+ ti->error = "Arguments do not agree with counts given";
+ return -EINVAL;
+ }
+
+ if ((kstrtoul(argv[num_raid_params], 10, &num_raid_devs) < 0) ||
+ (num_raid_devs >= INT_MAX)) {
+ ti->error = "Cannot understand number of raid devices";
+ return -EINVAL;
+ }
+
+ argc -= num_raid_params + 1; /* +1: we already have num_raid_devs */
+ if (argc != (num_raid_devs * 2)) {
+ ti->error = "Supplied RAID devices does not match the count given";
+ return -EINVAL;
+ }
+
+ rs = context_alloc(ti, rt, (unsigned)num_raid_devs);
+ if (IS_ERR(rs))
+ return PTR_ERR(rs);
+
+ ret = parse_raid_params(rs, argv, (unsigned)num_raid_params);
+ if (ret)
+ goto bad;
+
+ argv += num_raid_params + 1;
+
+ ret = dev_parms(rs, argv);
+ if (ret)
+ goto bad;
+
+ rs->md.sync_super = super_sync;
+ ret = analyse_superblocks(ti, rs);
+ if (ret)
+ goto bad;
+
+ INIT_WORK(&rs->md.event_work, do_table_event);
+ ti->private = rs;
+ ti->num_flush_bios = 1;
+
+ /*
+ * Disable/enable discard support on RAID set.
+ */
+ configure_discard_support(ti, rs);
+
+ mutex_lock(&rs->md.reconfig_mutex);
+ ret = md_run(&rs->md);
+ rs->md.in_sync = 0; /* Assume already marked dirty */
+ mutex_unlock(&rs->md.reconfig_mutex);
+
+ if (ret) {
+ ti->error = "Fail to run raid array";
+ goto bad;
+ }
+
+ if (ti->len != rs->md.array_sectors) {
+ ti->error = "Array size does not match requested target length";
+ ret = -EINVAL;
+ goto size_mismatch;
+ }
+ rs->callbacks.congested_fn = raid_is_congested;
+ dm_table_add_target_callbacks(ti->table, &rs->callbacks);
+
+ mddev_suspend(&rs->md);
+ return 0;
+
+size_mismatch:
+ md_stop(&rs->md);
+bad:
+ context_free(rs);
+
+ return ret;
+}
+
+static void raid_dtr(struct dm_target *ti)
+{
+ struct raid_set *rs = ti->private;
+
+ list_del_init(&rs->callbacks.list);
+ md_stop(&rs->md);
+ context_free(rs);
+}
+
+static int raid_map(struct dm_target *ti, struct bio *bio)
+{
+ struct raid_set *rs = ti->private;
+ struct mddev *mddev = &rs->md;
+
+ mddev->pers->make_request(mddev, bio);
+
+ return DM_MAPIO_SUBMITTED;
+}
+
+static const char *decipher_sync_action(struct mddev *mddev)
+{
+ if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
+ return "frozen";
+
+ if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
+ (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))) {
+ if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
+ return "reshape";
+
+ if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
+ if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
+ return "resync";
+ else if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
+ return "check";
+ return "repair";
+ }
+
+ if (test_bit(MD_RECOVERY_RECOVER, &mddev->recovery))
+ return "recover";
+ }
+
+ return "idle";
+}
+
+static void raid_status(struct dm_target *ti, status_type_t type,
+ unsigned status_flags, char *result, unsigned maxlen)
+{
+ struct raid_set *rs = ti->private;
+ unsigned raid_param_cnt = 1; /* at least 1 for chunksize */
+ unsigned sz = 0;
+ int i, array_in_sync = 0;
+ sector_t sync;
+
+ switch (type) {
+ case STATUSTYPE_INFO:
+ DMEMIT("%s %d ", rs->raid_type->name, rs->md.raid_disks);
+
+ if (test_bit(MD_RECOVERY_RUNNING, &rs->md.recovery))
+ sync = rs->md.curr_resync_completed;
+ else
+ sync = rs->md.recovery_cp;
+
+ if (sync >= rs->md.resync_max_sectors) {
+ /*
+ * Sync complete.
+ */
+ array_in_sync = 1;
+ sync = rs->md.resync_max_sectors;
+ } else if (test_bit(MD_RECOVERY_REQUESTED, &rs->md.recovery)) {
+ /*
+ * If "check" or "repair" is occurring, the array has
+ * undergone and initial sync and the health characters
+ * should not be 'a' anymore.
+ */
+ array_in_sync = 1;
+ } else {
+ /*
+ * The array may be doing an initial sync, or it may
+ * be rebuilding individual components. If all the
+ * devices are In_sync, then it is the array that is
+ * being initialized.
+ */
+ for (i = 0; i < rs->md.raid_disks; i++)
+ if (!test_bit(In_sync, &rs->dev[i].rdev.flags))
+ array_in_sync = 1;
+ }
+
+ /*
+ * Status characters:
+ * 'D' = Dead/Failed device
+ * 'a' = Alive but not in-sync
+ * 'A' = Alive and in-sync
+ */
+ for (i = 0; i < rs->md.raid_disks; i++) {
+ if (test_bit(Faulty, &rs->dev[i].rdev.flags))
+ DMEMIT("D");
+ else if (!array_in_sync ||
+ !test_bit(In_sync, &rs->dev[i].rdev.flags))
+ DMEMIT("a");
+ else
+ DMEMIT("A");
+ }
+
+ /*
+ * In-sync ratio:
+ * The in-sync ratio shows the progress of:
+ * - Initializing the array
+ * - Rebuilding a subset of devices of the array
+ * The user can distinguish between the two by referring
+ * to the status characters.
+ */
+ DMEMIT(" %llu/%llu",
+ (unsigned long long) sync,
+ (unsigned long long) rs->md.resync_max_sectors);
+
+ /*
+ * Sync action:
+ * See Documentation/device-mapper/dm-raid.c for
+ * information on each of these states.
+ */
+ DMEMIT(" %s", decipher_sync_action(&rs->md));
+
+ /*
+ * resync_mismatches/mismatch_cnt
+ * This field shows the number of discrepancies found when
+ * performing a "check" of the array.
+ */
+ DMEMIT(" %llu",
+ (strcmp(rs->md.last_sync_action, "check")) ? 0 :
+ (unsigned long long)
+ atomic64_read(&rs->md.resync_mismatches));
+ break;
+ case STATUSTYPE_TABLE:
+ /* The string you would use to construct this array */
+ for (i = 0; i < rs->md.raid_disks; i++) {
+ if ((rs->print_flags & DMPF_REBUILD) &&
+ rs->dev[i].data_dev &&
+ !test_bit(In_sync, &rs->dev[i].rdev.flags))
+ raid_param_cnt += 2; /* for rebuilds */
+ if (rs->dev[i].data_dev &&
+ test_bit(WriteMostly, &rs->dev[i].rdev.flags))
+ raid_param_cnt += 2;
+ }
+
+ raid_param_cnt += (hweight32(rs->print_flags & ~DMPF_REBUILD) * 2);
+ if (rs->print_flags & (DMPF_SYNC | DMPF_NOSYNC))
+ raid_param_cnt--;
+
+ DMEMIT("%s %u %u", rs->raid_type->name,
+ raid_param_cnt, rs->md.chunk_sectors);
+
+ if ((rs->print_flags & DMPF_SYNC) &&
+ (rs->md.recovery_cp == MaxSector))
+ DMEMIT(" sync");
+ if (rs->print_flags & DMPF_NOSYNC)
+ DMEMIT(" nosync");
+
+ for (i = 0; i < rs->md.raid_disks; i++)
+ if ((rs->print_flags & DMPF_REBUILD) &&
+ rs->dev[i].data_dev &&
+ !test_bit(In_sync, &rs->dev[i].rdev.flags))
+ DMEMIT(" rebuild %u", i);
+
+ if (rs->print_flags & DMPF_DAEMON_SLEEP)
+ DMEMIT(" daemon_sleep %lu",
+ rs->md.bitmap_info.daemon_sleep);
+
+ if (rs->print_flags & DMPF_MIN_RECOVERY_RATE)
+ DMEMIT(" min_recovery_rate %d", rs->md.sync_speed_min);
+
+ if (rs->print_flags & DMPF_MAX_RECOVERY_RATE)
+ DMEMIT(" max_recovery_rate %d", rs->md.sync_speed_max);
+
+ for (i = 0; i < rs->md.raid_disks; i++)
+ if (rs->dev[i].data_dev &&
+ test_bit(WriteMostly, &rs->dev[i].rdev.flags))
+ DMEMIT(" write_mostly %u", i);
+
+ if (rs->print_flags & DMPF_MAX_WRITE_BEHIND)
+ DMEMIT(" max_write_behind %lu",
+ rs->md.bitmap_info.max_write_behind);
+
+ if (rs->print_flags & DMPF_STRIPE_CACHE) {
+ struct r5conf *conf = rs->md.private;
+
+ /* convert from kiB to sectors */
+ DMEMIT(" stripe_cache %d",
+ conf ? conf->max_nr_stripes * 2 : 0);
+ }
+
+ if (rs->print_flags & DMPF_REGION_SIZE)
+ DMEMIT(" region_size %lu",
+ rs->md.bitmap_info.chunksize >> 9);
+
+ if (rs->print_flags & DMPF_RAID10_COPIES)
+ DMEMIT(" raid10_copies %u",
+ raid10_md_layout_to_copies(rs->md.layout));
+
+ if (rs->print_flags & DMPF_RAID10_FORMAT)
+ DMEMIT(" raid10_format %s",
+ raid10_md_layout_to_format(rs->md.layout));
+
+ DMEMIT(" %d", rs->md.raid_disks);
+ for (i = 0; i < rs->md.raid_disks; i++) {
+ if (rs->dev[i].meta_dev)
+ DMEMIT(" %s", rs->dev[i].meta_dev->name);
+ else
+ DMEMIT(" -");
+
+ if (rs->dev[i].data_dev)
+ DMEMIT(" %s", rs->dev[i].data_dev->name);
+ else
+ DMEMIT(" -");
+ }
+ }
+}
+
+static int raid_message(struct dm_target *ti, unsigned argc, char **argv)
+{
+ struct raid_set *rs = ti->private;
+ struct mddev *mddev = &rs->md;
+
+ if (!strcasecmp(argv[0], "reshape")) {
+ DMERR("Reshape not supported.");
+ return -EINVAL;
+ }
+
+ if (!mddev->pers || !mddev->pers->sync_request)
+ return -EINVAL;
+
+ if (!strcasecmp(argv[0], "frozen"))
+ set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+ else
+ clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+
+ if (!strcasecmp(argv[0], "idle") || !strcasecmp(argv[0], "frozen")) {
+ if (mddev->sync_thread) {
+ set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+ md_reap_sync_thread(mddev);
+ }
+ } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
+ test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
+ return -EBUSY;
+ else if (!strcasecmp(argv[0], "resync"))
+ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+ else if (!strcasecmp(argv[0], "recover")) {
+ set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
+ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+ } else {
+ if (!strcasecmp(argv[0], "check"))
+ set_bit(MD_RECOVERY_CHECK, &mddev->recovery);
+ else if (!!strcasecmp(argv[0], "repair"))
+ return -EINVAL;
+ set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
+ set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
+ }
+ if (mddev->ro == 2) {
+ /* A write to sync_action is enough to justify
+ * canceling read-auto mode
+ */
+ mddev->ro = 0;
+ if (!mddev->suspended)
+ md_wakeup_thread(mddev->sync_thread);
+ }
+ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+ if (!mddev->suspended)
+ md_wakeup_thread(mddev->thread);
+
+ return 0;
+}
+
+static int raid_iterate_devices(struct dm_target *ti,
+ iterate_devices_callout_fn fn, void *data)
+{
+ struct raid_set *rs = ti->private;
+ unsigned i;
+ int ret = 0;
+
+ for (i = 0; !ret && i < rs->md.raid_disks; i++)
+ if (rs->dev[i].data_dev)
+ ret = fn(ti,
+ rs->dev[i].data_dev,
+ 0, /* No offset on data devs */
+ rs->md.dev_sectors,
+ data);
+
+ return ret;
+}
+
+static void raid_io_hints(struct dm_target *ti, struct queue_limits *limits)
+{
+ struct raid_set *rs = ti->private;
+ unsigned chunk_size = rs->md.chunk_sectors << 9;
+ struct r5conf *conf = rs->md.private;
+
+ blk_limits_io_min(limits, chunk_size);
+ blk_limits_io_opt(limits, chunk_size * (conf->raid_disks - conf->max_degraded));
+}
+
+static void raid_presuspend(struct dm_target *ti)
+{
+ struct raid_set *rs = ti->private;
+
+ md_stop_writes(&rs->md);
+}
+
+static void raid_postsuspend(struct dm_target *ti)
+{
+ struct raid_set *rs = ti->private;
+
+ mddev_suspend(&rs->md);
+}
+
+static void attempt_restore_of_faulty_devices(struct raid_set *rs)
+{
+ int i;
+ uint64_t failed_devices, cleared_failed_devices = 0;
+ unsigned long flags;
+ struct dm_raid_superblock *sb;
+ struct md_rdev *r;
+
+ for (i = 0; i < rs->md.raid_disks; i++) {
+ r = &rs->dev[i].rdev;
+ if (test_bit(Faulty, &r->flags) && r->sb_page &&
+ sync_page_io(r, 0, r->sb_size, r->sb_page, READ, 1)) {
+ DMINFO("Faulty %s device #%d has readable super block."
+ " Attempting to revive it.",
+ rs->raid_type->name, i);
+
+ /*
+ * Faulty bit may be set, but sometimes the array can
+ * be suspended before the personalities can respond
+ * by removing the device from the array (i.e. calling
+ * 'hot_remove_disk'). If they haven't yet removed
+ * the failed device, its 'raid_disk' number will be
+ * '>= 0' - meaning we must call this function
+ * ourselves.
+ */
+ if ((r->raid_disk >= 0) &&
+ (r->mddev->pers->hot_remove_disk(r->mddev, r) != 0))
+ /* Failed to revive this device, try next */
+ continue;
+
+ r->raid_disk = i;
+ r->saved_raid_disk = i;
+ flags = r->flags;
+ clear_bit(Faulty, &r->flags);
+ clear_bit(WriteErrorSeen, &r->flags);
+ clear_bit(In_sync, &r->flags);
+ if (r->mddev->pers->hot_add_disk(r->mddev, r)) {
+ r->raid_disk = -1;
+ r->saved_raid_disk = -1;
+ r->flags = flags;
+ } else {
+ r->recovery_offset = 0;
+ cleared_failed_devices |= 1 << i;
+ }
+ }
+ }
+ if (cleared_failed_devices) {
+ rdev_for_each(r, &rs->md) {
+ sb = page_address(r->sb_page);
+ failed_devices = le64_to_cpu(sb->failed_devices);
+ failed_devices &= ~cleared_failed_devices;
+ sb->failed_devices = cpu_to_le64(failed_devices);
+ }
+ }
+}
+
+static void raid_resume(struct dm_target *ti)
+{
+ struct raid_set *rs = ti->private;
+
+ set_bit(MD_CHANGE_DEVS, &rs->md.flags);
+ if (!rs->bitmap_loaded) {
+ bitmap_load(&rs->md);
+ rs->bitmap_loaded = 1;
+ } else {
+ /*
+ * A secondary resume while the device is active.
+ * Take this opportunity to check whether any failed
+ * devices are reachable again.
+ */
+ attempt_restore_of_faulty_devices(rs);
+ }
+
+ clear_bit(MD_RECOVERY_FROZEN, &rs->md.recovery);
+ mddev_resume(&rs->md);
+}
+
+static struct target_type raid_target = {
+ .name = "raid",
+ .version = {1, 6, 0},
+ .module = THIS_MODULE,
+ .ctr = raid_ctr,
+ .dtr = raid_dtr,
+ .map = raid_map,
+ .status = raid_status,
+ .message = raid_message,
+ .iterate_devices = raid_iterate_devices,
+ .io_hints = raid_io_hints,
+ .presuspend = raid_presuspend,
+ .postsuspend = raid_postsuspend,
+ .resume = raid_resume,
+};
+
+static int __init dm_raid_init(void)
+{
+ DMINFO("Loading target version %u.%u.%u",
+ raid_target.version[0],
+ raid_target.version[1],
+ raid_target.version[2]);
+ return dm_register_target(&raid_target);
+}
+
+static void __exit dm_raid_exit(void)
+{
+ dm_unregister_target(&raid_target);
+}
+
+module_init(dm_raid_init);
+module_exit(dm_raid_exit);
+
+module_param(devices_handle_discard_safely, bool, 0644);
+MODULE_PARM_DESC(devices_handle_discard_safely,
+ "Set to Y if all devices in each array reliably return zeroes on reads from discarded regions");
+
+MODULE_DESCRIPTION(DM_NAME " raid4/5/6 target");
+MODULE_ALIAS("dm-raid1");
+MODULE_ALIAS("dm-raid10");
+MODULE_ALIAS("dm-raid4");
+MODULE_ALIAS("dm-raid5");
+MODULE_ALIAS("dm-raid6");
+MODULE_AUTHOR("Neil Brown <dm-devel@redhat.com>");
+MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
new file mode 100644
index 000000000..089d62751
--- /dev/null
+++ b/drivers/md/dm-raid1.c
@@ -0,0 +1,1458 @@
+/*
+ * Copyright (C) 2003 Sistina Software Limited.
+ * Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved.
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm-bio-record.h"
+
+#include <linux/init.h>
+#include <linux/mempool.h>
+#include <linux/module.h>
+#include <linux/pagemap.h>
+#include <linux/slab.h>
+#include <linux/workqueue.h>
+#include <linux/device-mapper.h>
+#include <linux/dm-io.h>
+#include <linux/dm-dirty-log.h>
+#include <linux/dm-kcopyd.h>
+#include <linux/dm-region-hash.h>
+
+#define DM_MSG_PREFIX "raid1"
+
+#define MAX_RECOVERY 1 /* Maximum number of regions recovered in parallel. */
+
+#define DM_RAID1_HANDLE_ERRORS 0x01
+#define errors_handled(p) ((p)->features & DM_RAID1_HANDLE_ERRORS)
+
+static DECLARE_WAIT_QUEUE_HEAD(_kmirrord_recovery_stopped);
+
+/*-----------------------------------------------------------------
+ * Mirror set structures.
+ *---------------------------------------------------------------*/
+enum dm_raid1_error {
+ DM_RAID1_WRITE_ERROR,
+ DM_RAID1_FLUSH_ERROR,
+ DM_RAID1_SYNC_ERROR,
+ DM_RAID1_READ_ERROR
+};
+
+struct mirror {
+ struct mirror_set *ms;
+ atomic_t error_count;
+ unsigned long error_type;
+ struct dm_dev *dev;
+ sector_t offset;
+};
+
+struct mirror_set {
+ struct dm_target *ti;
+ struct list_head list;
+
+ uint64_t features;
+
+ spinlock_t lock; /* protects the lists */
+ struct bio_list reads;
+ struct bio_list writes;
+ struct bio_list failures;
+ struct bio_list holds; /* bios are waiting until suspend */
+
+ struct dm_region_hash *rh;
+ struct dm_kcopyd_client *kcopyd_client;
+ struct dm_io_client *io_client;
+
+ /* recovery */
+ region_t nr_regions;
+ int in_sync;
+ int log_failure;
+ int leg_failure;
+ atomic_t suspend;
+
+ atomic_t default_mirror; /* Default mirror */
+
+ struct workqueue_struct *kmirrord_wq;
+ struct work_struct kmirrord_work;
+ struct timer_list timer;
+ unsigned long timer_pending;
+
+ struct work_struct trigger_event;
+
+ unsigned nr_mirrors;
+ struct mirror mirror[0];
+};
+
+DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(raid1_resync_throttle,
+ "A percentage of time allocated for raid resynchronization");
+
+static void wakeup_mirrord(void *context)
+{
+ struct mirror_set *ms = context;
+
+ queue_work(ms->kmirrord_wq, &ms->kmirrord_work);
+}
+
+static void delayed_wake_fn(unsigned long data)
+{
+ struct mirror_set *ms = (struct mirror_set *) data;
+
+ clear_bit(0, &ms->timer_pending);
+ wakeup_mirrord(ms);
+}
+
+static void delayed_wake(struct mirror_set *ms)
+{
+ if (test_and_set_bit(0, &ms->timer_pending))
+ return;
+
+ ms->timer.expires = jiffies + HZ / 5;
+ ms->timer.data = (unsigned long) ms;
+ ms->timer.function = delayed_wake_fn;
+ add_timer(&ms->timer);
+}
+
+static void wakeup_all_recovery_waiters(void *context)
+{
+ wake_up_all(&_kmirrord_recovery_stopped);
+}
+
+static void queue_bio(struct mirror_set *ms, struct bio *bio, int rw)
+{
+ unsigned long flags;
+ int should_wake = 0;
+ struct bio_list *bl;
+
+ bl = (rw == WRITE) ? &ms->writes : &ms->reads;
+ spin_lock_irqsave(&ms->lock, flags);
+ should_wake = !(bl->head);
+ bio_list_add(bl, bio);
+ spin_unlock_irqrestore(&ms->lock, flags);
+
+ if (should_wake)
+ wakeup_mirrord(ms);
+}
+
+static void dispatch_bios(void *context, struct bio_list *bio_list)
+{
+ struct mirror_set *ms = context;
+ struct bio *bio;
+
+ while ((bio = bio_list_pop(bio_list)))
+ queue_bio(ms, bio, WRITE);
+}
+
+struct dm_raid1_bio_record {
+ struct mirror *m;
+ /* if details->bi_bdev == NULL, details were not saved */
+ struct dm_bio_details details;
+ region_t write_region;
+};
+
+/*
+ * Every mirror should look like this one.
+ */
+#define DEFAULT_MIRROR 0
+
+/*
+ * This is yucky. We squirrel the mirror struct away inside
+ * bi_next for read/write buffers. This is safe since the bh
+ * doesn't get submitted to the lower levels of block layer.
+ */
+static struct mirror *bio_get_m(struct bio *bio)
+{
+ return (struct mirror *) bio->bi_next;
+}
+
+static void bio_set_m(struct bio *bio, struct mirror *m)
+{
+ bio->bi_next = (struct bio *) m;
+}
+
+static struct mirror *get_default_mirror(struct mirror_set *ms)
+{
+ return &ms->mirror[atomic_read(&ms->default_mirror)];
+}
+
+static void set_default_mirror(struct mirror *m)
+{
+ struct mirror_set *ms = m->ms;
+ struct mirror *m0 = &(ms->mirror[0]);
+
+ atomic_set(&ms->default_mirror, m - m0);
+}
+
+static struct mirror *get_valid_mirror(struct mirror_set *ms)
+{
+ struct mirror *m;
+
+ for (m = ms->mirror; m < ms->mirror + ms->nr_mirrors; m++)
+ if (!atomic_read(&m->error_count))
+ return m;
+
+ return NULL;
+}
+
+/* fail_mirror
+ * @m: mirror device to fail
+ * @error_type: one of the enum's, DM_RAID1_*_ERROR
+ *
+ * If errors are being handled, record the type of
+ * error encountered for this device. If this type
+ * of error has already been recorded, we can return;
+ * otherwise, we must signal userspace by triggering
+ * an event. Additionally, if the device is the
+ * primary device, we must choose a new primary, but
+ * only if the mirror is in-sync.
+ *
+ * This function must not block.
+ */
+static void fail_mirror(struct mirror *m, enum dm_raid1_error error_type)
+{
+ struct mirror_set *ms = m->ms;
+ struct mirror *new;
+
+ ms->leg_failure = 1;
+
+ /*
+ * error_count is used for nothing more than a
+ * simple way to tell if a device has encountered
+ * errors.
+ */
+ atomic_inc(&m->error_count);
+
+ if (test_and_set_bit(error_type, &m->error_type))
+ return;
+
+ if (!errors_handled(ms))
+ return;
+
+ if (m != get_default_mirror(ms))
+ goto out;
+
+ if (!ms->in_sync) {
+ /*
+ * Better to issue requests to same failing device
+ * than to risk returning corrupt data.
+ */
+ DMERR("Primary mirror (%s) failed while out-of-sync: "
+ "Reads may fail.", m->dev->name);
+ goto out;
+ }
+
+ new = get_valid_mirror(ms);
+ if (new)
+ set_default_mirror(new);
+ else
+ DMWARN("All sides of mirror have failed.");
+
+out:
+ schedule_work(&ms->trigger_event);
+}
+
+static int mirror_flush(struct dm_target *ti)
+{
+ struct mirror_set *ms = ti->private;
+ unsigned long error_bits;
+
+ unsigned int i;
+ struct dm_io_region io[ms->nr_mirrors];
+ struct mirror *m;
+ struct dm_io_request io_req = {
+ .bi_rw = WRITE_FLUSH,
+ .mem.type = DM_IO_KMEM,
+ .mem.ptr.addr = NULL,
+ .client = ms->io_client,
+ };
+
+ for (i = 0, m = ms->mirror; i < ms->nr_mirrors; i++, m++) {
+ io[i].bdev = m->dev->bdev;
+ io[i].sector = 0;
+ io[i].count = 0;
+ }
+
+ error_bits = -1;
+ dm_io(&io_req, ms->nr_mirrors, io, &error_bits);
+ if (unlikely(error_bits != 0)) {
+ for (i = 0; i < ms->nr_mirrors; i++)
+ if (test_bit(i, &error_bits))
+ fail_mirror(ms->mirror + i,
+ DM_RAID1_FLUSH_ERROR);
+ return -EIO;
+ }
+
+ return 0;
+}
+
+/*-----------------------------------------------------------------
+ * Recovery.
+ *
+ * When a mirror is first activated we may find that some regions
+ * are in the no-sync state. We have to recover these by
+ * recopying from the default mirror to all the others.
+ *---------------------------------------------------------------*/
+static void recovery_complete(int read_err, unsigned long write_err,
+ void *context)
+{
+ struct dm_region *reg = context;
+ struct mirror_set *ms = dm_rh_region_context(reg);
+ int m, bit = 0;
+
+ if (read_err) {
+ /* Read error means the failure of default mirror. */
+ DMERR_LIMIT("Unable to read primary mirror during recovery");
+ fail_mirror(get_default_mirror(ms), DM_RAID1_SYNC_ERROR);
+ }
+
+ if (write_err) {
+ DMERR_LIMIT("Write error during recovery (error = 0x%lx)",
+ write_err);
+ /*
+ * Bits correspond to devices (excluding default mirror).
+ * The default mirror cannot change during recovery.
+ */
+ for (m = 0; m < ms->nr_mirrors; m++) {
+ if (&ms->mirror[m] == get_default_mirror(ms))
+ continue;
+ if (test_bit(bit, &write_err))
+ fail_mirror(ms->mirror + m,
+ DM_RAID1_SYNC_ERROR);
+ bit++;
+ }
+ }
+
+ dm_rh_recovery_end(reg, !(read_err || write_err));
+}
+
+static int recover(struct mirror_set *ms, struct dm_region *reg)
+{
+ int r;
+ unsigned i;
+ struct dm_io_region from, to[DM_KCOPYD_MAX_REGIONS], *dest;
+ struct mirror *m;
+ unsigned long flags = 0;
+ region_t key = dm_rh_get_region_key(reg);
+ sector_t region_size = dm_rh_get_region_size(ms->rh);
+
+ /* fill in the source */
+ m = get_default_mirror(ms);
+ from.bdev = m->dev->bdev;
+ from.sector = m->offset + dm_rh_region_to_sector(ms->rh, key);
+ if (key == (ms->nr_regions - 1)) {
+ /*
+ * The final region may be smaller than
+ * region_size.
+ */
+ from.count = ms->ti->len & (region_size - 1);
+ if (!from.count)
+ from.count = region_size;
+ } else
+ from.count = region_size;
+
+ /* fill in the destinations */
+ for (i = 0, dest = to; i < ms->nr_mirrors; i++) {
+ if (&ms->mirror[i] == get_default_mirror(ms))
+ continue;
+
+ m = ms->mirror + i;
+ dest->bdev = m->dev->bdev;
+ dest->sector = m->offset + dm_rh_region_to_sector(ms->rh, key);
+ dest->count = from.count;
+ dest++;
+ }
+
+ /* hand to kcopyd */
+ if (!errors_handled(ms))
+ set_bit(DM_KCOPYD_IGNORE_ERROR, &flags);
+
+ r = dm_kcopyd_copy(ms->kcopyd_client, &from, ms->nr_mirrors - 1, to,
+ flags, recovery_complete, reg);
+
+ return r;
+}
+
+static void do_recovery(struct mirror_set *ms)
+{
+ struct dm_region *reg;
+ struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh);
+ int r;
+
+ /*
+ * Start quiescing some regions.
+ */
+ dm_rh_recovery_prepare(ms->rh);
+
+ /*
+ * Copy any already quiesced regions.
+ */
+ while ((reg = dm_rh_recovery_start(ms->rh))) {
+ r = recover(ms, reg);
+ if (r)
+ dm_rh_recovery_end(reg, 0);
+ }
+
+ /*
+ * Update the in sync flag.
+ */
+ if (!ms->in_sync &&
+ (log->type->get_sync_count(log) == ms->nr_regions)) {
+ /* the sync is complete */
+ dm_table_event(ms->ti->table);
+ ms->in_sync = 1;
+ }
+}
+
+/*-----------------------------------------------------------------
+ * Reads
+ *---------------------------------------------------------------*/
+static struct mirror *choose_mirror(struct mirror_set *ms, sector_t sector)
+{
+ struct mirror *m = get_default_mirror(ms);
+
+ do {
+ if (likely(!atomic_read(&m->error_count)))
+ return m;
+
+ if (m-- == ms->mirror)
+ m += ms->nr_mirrors;
+ } while (m != get_default_mirror(ms));
+
+ return NULL;
+}
+
+static int default_ok(struct mirror *m)
+{
+ struct mirror *default_mirror = get_default_mirror(m->ms);
+
+ return !atomic_read(&default_mirror->error_count);
+}
+
+static int mirror_available(struct mirror_set *ms, struct bio *bio)
+{
+ struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh);
+ region_t region = dm_rh_bio_to_region(ms->rh, bio);
+
+ if (log->type->in_sync(log, region, 0))
+ return choose_mirror(ms, bio->bi_iter.bi_sector) ? 1 : 0;
+
+ return 0;
+}
+
+/*
+ * remap a buffer to a particular mirror.
+ */
+static sector_t map_sector(struct mirror *m, struct bio *bio)
+{
+ if (unlikely(!bio->bi_iter.bi_size))
+ return 0;
+ return m->offset + dm_target_offset(m->ms->ti, bio->bi_iter.bi_sector);
+}
+
+static void map_bio(struct mirror *m, struct bio *bio)
+{
+ bio->bi_bdev = m->dev->bdev;
+ bio->bi_iter.bi_sector = map_sector(m, bio);
+}
+
+static void map_region(struct dm_io_region *io, struct mirror *m,
+ struct bio *bio)
+{
+ io->bdev = m->dev->bdev;
+ io->sector = map_sector(m, bio);
+ io->count = bio_sectors(bio);
+}
+
+static void hold_bio(struct mirror_set *ms, struct bio *bio)
+{
+ /*
+ * Lock is required to avoid race condition during suspend
+ * process.
+ */
+ spin_lock_irq(&ms->lock);
+
+ if (atomic_read(&ms->suspend)) {
+ spin_unlock_irq(&ms->lock);
+
+ /*
+ * If device is suspended, complete the bio.
+ */
+ if (dm_noflush_suspending(ms->ti))
+ bio_endio(bio, DM_ENDIO_REQUEUE);
+ else
+ bio_endio(bio, -EIO);
+ return;
+ }
+
+ /*
+ * Hold bio until the suspend is complete.
+ */
+ bio_list_add(&ms->holds, bio);
+ spin_unlock_irq(&ms->lock);
+}
+
+/*-----------------------------------------------------------------
+ * Reads
+ *---------------------------------------------------------------*/
+static void read_callback(unsigned long error, void *context)
+{
+ struct bio *bio = context;
+ struct mirror *m;
+
+ m = bio_get_m(bio);
+ bio_set_m(bio, NULL);
+
+ if (likely(!error)) {
+ bio_endio(bio, 0);
+ return;
+ }
+
+ fail_mirror(m, DM_RAID1_READ_ERROR);
+
+ if (likely(default_ok(m)) || mirror_available(m->ms, bio)) {
+ DMWARN_LIMIT("Read failure on mirror device %s. "
+ "Trying alternative device.",
+ m->dev->name);
+ queue_bio(m->ms, bio, bio_rw(bio));
+ return;
+ }
+
+ DMERR_LIMIT("Read failure on mirror device %s. Failing I/O.",
+ m->dev->name);
+ bio_endio(bio, -EIO);
+}
+
+/* Asynchronous read. */
+static void read_async_bio(struct mirror *m, struct bio *bio)
+{
+ struct dm_io_region io;
+ struct dm_io_request io_req = {
+ .bi_rw = READ,
+ .mem.type = DM_IO_BIO,
+ .mem.ptr.bio = bio,
+ .notify.fn = read_callback,
+ .notify.context = bio,
+ .client = m->ms->io_client,
+ };
+
+ map_region(&io, m, bio);
+ bio_set_m(bio, m);
+ BUG_ON(dm_io(&io_req, 1, &io, NULL));
+}
+
+static inline int region_in_sync(struct mirror_set *ms, region_t region,
+ int may_block)
+{
+ int state = dm_rh_get_state(ms->rh, region, may_block);
+ return state == DM_RH_CLEAN || state == DM_RH_DIRTY;
+}
+
+static void do_reads(struct mirror_set *ms, struct bio_list *reads)
+{
+ region_t region;
+ struct bio *bio;
+ struct mirror *m;
+
+ while ((bio = bio_list_pop(reads))) {
+ region = dm_rh_bio_to_region(ms->rh, bio);
+ m = get_default_mirror(ms);
+
+ /*
+ * We can only read balance if the region is in sync.
+ */
+ if (likely(region_in_sync(ms, region, 1)))
+ m = choose_mirror(ms, bio->bi_iter.bi_sector);
+ else if (m && atomic_read(&m->error_count))
+ m = NULL;
+
+ if (likely(m))
+ read_async_bio(m, bio);
+ else
+ bio_endio(bio, -EIO);
+ }
+}
+
+/*-----------------------------------------------------------------
+ * Writes.
+ *
+ * We do different things with the write io depending on the
+ * state of the region that it's in:
+ *
+ * SYNC: increment pending, use kcopyd to write to *all* mirrors
+ * RECOVERING: delay the io until recovery completes
+ * NOSYNC: increment pending, just write to the default mirror
+ *---------------------------------------------------------------*/
+
+
+static void write_callback(unsigned long error, void *context)
+{
+ unsigned i, ret = 0;
+ struct bio *bio = (struct bio *) context;
+ struct mirror_set *ms;
+ int should_wake = 0;
+ unsigned long flags;
+
+ ms = bio_get_m(bio)->ms;
+ bio_set_m(bio, NULL);
+
+ /*
+ * NOTE: We don't decrement the pending count here,
+ * instead it is done by the targets endio function.
+ * This way we handle both writes to SYNC and NOSYNC
+ * regions with the same code.
+ */
+ if (likely(!error)) {
+ bio_endio(bio, ret);
+ return;
+ }
+
+ /*
+ * If the bio is discard, return an error, but do not
+ * degrade the array.
+ */
+ if (bio->bi_rw & REQ_DISCARD) {
+ bio_endio(bio, -EOPNOTSUPP);
+ return;
+ }
+
+ for (i = 0; i < ms->nr_mirrors; i++)
+ if (test_bit(i, &error))
+ fail_mirror(ms->mirror + i, DM_RAID1_WRITE_ERROR);
+
+ /*
+ * Need to raise event. Since raising
+ * events can block, we need to do it in
+ * the main thread.
+ */
+ spin_lock_irqsave(&ms->lock, flags);
+ if (!ms->failures.head)
+ should_wake = 1;
+ bio_list_add(&ms->failures, bio);
+ spin_unlock_irqrestore(&ms->lock, flags);
+ if (should_wake)
+ wakeup_mirrord(ms);
+}
+
+static void do_write(struct mirror_set *ms, struct bio *bio)
+{
+ unsigned int i;
+ struct dm_io_region io[ms->nr_mirrors], *dest = io;
+ struct mirror *m;
+ struct dm_io_request io_req = {
+ .bi_rw = WRITE | (bio->bi_rw & WRITE_FLUSH_FUA),
+ .mem.type = DM_IO_BIO,
+ .mem.ptr.bio = bio,
+ .notify.fn = write_callback,
+ .notify.context = bio,
+ .client = ms->io_client,
+ };
+
+ if (bio->bi_rw & REQ_DISCARD) {
+ io_req.bi_rw |= REQ_DISCARD;
+ io_req.mem.type = DM_IO_KMEM;
+ io_req.mem.ptr.addr = NULL;
+ }
+
+ for (i = 0, m = ms->mirror; i < ms->nr_mirrors; i++, m++)
+ map_region(dest++, m, bio);
+
+ /*
+ * Use default mirror because we only need it to retrieve the reference
+ * to the mirror set in write_callback().
+ */
+ bio_set_m(bio, get_default_mirror(ms));
+
+ BUG_ON(dm_io(&io_req, ms->nr_mirrors, io, NULL));
+}
+
+static void do_writes(struct mirror_set *ms, struct bio_list *writes)
+{
+ int state;
+ struct bio *bio;
+ struct bio_list sync, nosync, recover, *this_list = NULL;
+ struct bio_list requeue;
+ struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh);
+ region_t region;
+
+ if (!writes->head)
+ return;
+
+ /*
+ * Classify each write.
+ */
+ bio_list_init(&sync);
+ bio_list_init(&nosync);
+ bio_list_init(&recover);
+ bio_list_init(&requeue);
+
+ while ((bio = bio_list_pop(writes))) {
+ if ((bio->bi_rw & REQ_FLUSH) ||
+ (bio->bi_rw & REQ_DISCARD)) {
+ bio_list_add(&sync, bio);
+ continue;
+ }
+
+ region = dm_rh_bio_to_region(ms->rh, bio);
+
+ if (log->type->is_remote_recovering &&
+ log->type->is_remote_recovering(log, region)) {
+ bio_list_add(&requeue, bio);
+ continue;
+ }
+
+ state = dm_rh_get_state(ms->rh, region, 1);
+ switch (state) {
+ case DM_RH_CLEAN:
+ case DM_RH_DIRTY:
+ this_list = &sync;
+ break;
+
+ case DM_RH_NOSYNC:
+ this_list = &nosync;
+ break;
+
+ case DM_RH_RECOVERING:
+ this_list = &recover;
+ break;
+ }
+
+ bio_list_add(this_list, bio);
+ }
+
+ /*
+ * Add bios that are delayed due to remote recovery
+ * back on to the write queue
+ */
+ if (unlikely(requeue.head)) {
+ spin_lock_irq(&ms->lock);
+ bio_list_merge(&ms->writes, &requeue);
+ spin_unlock_irq(&ms->lock);
+ delayed_wake(ms);
+ }
+
+ /*
+ * Increment the pending counts for any regions that will
+ * be written to (writes to recover regions are going to
+ * be delayed).
+ */
+ dm_rh_inc_pending(ms->rh, &sync);
+ dm_rh_inc_pending(ms->rh, &nosync);
+
+ /*
+ * If the flush fails on a previous call and succeeds here,
+ * we must not reset the log_failure variable. We need
+ * userspace interaction to do that.
+ */
+ ms->log_failure = dm_rh_flush(ms->rh) ? 1 : ms->log_failure;
+
+ /*
+ * Dispatch io.
+ */
+ if (unlikely(ms->log_failure) && errors_handled(ms)) {
+ spin_lock_irq(&ms->lock);
+ bio_list_merge(&ms->failures, &sync);
+ spin_unlock_irq(&ms->lock);
+ wakeup_mirrord(ms);
+ } else
+ while ((bio = bio_list_pop(&sync)))
+ do_write(ms, bio);
+
+ while ((bio = bio_list_pop(&recover)))
+ dm_rh_delay(ms->rh, bio);
+
+ while ((bio = bio_list_pop(&nosync))) {
+ if (unlikely(ms->leg_failure) && errors_handled(ms)) {
+ spin_lock_irq(&ms->lock);
+ bio_list_add(&ms->failures, bio);
+ spin_unlock_irq(&ms->lock);
+ wakeup_mirrord(ms);
+ } else {
+ map_bio(get_default_mirror(ms), bio);
+ generic_make_request(bio);
+ }
+ }
+}
+
+static void do_failures(struct mirror_set *ms, struct bio_list *failures)
+{
+ struct bio *bio;
+
+ if (likely(!failures->head))
+ return;
+
+ /*
+ * If the log has failed, unattempted writes are being
+ * put on the holds list. We can't issue those writes
+ * until a log has been marked, so we must store them.
+ *
+ * If a 'noflush' suspend is in progress, we can requeue
+ * the I/O's to the core. This give userspace a chance
+ * to reconfigure the mirror, at which point the core
+ * will reissue the writes. If the 'noflush' flag is
+ * not set, we have no choice but to return errors.
+ *
+ * Some writes on the failures list may have been
+ * submitted before the log failure and represent a
+ * failure to write to one of the devices. It is ok
+ * for us to treat them the same and requeue them
+ * as well.
+ */
+ while ((bio = bio_list_pop(failures))) {
+ if (!ms->log_failure) {
+ ms->in_sync = 0;
+ dm_rh_mark_nosync(ms->rh, bio);
+ }
+
+ /*
+ * If all the legs are dead, fail the I/O.
+ * If we have been told to handle errors, hold the bio
+ * and wait for userspace to deal with the problem.
+ * Otherwise pretend that the I/O succeeded. (This would
+ * be wrong if the failed leg returned after reboot and
+ * got replicated back to the good legs.)
+ */
+ if (!get_valid_mirror(ms))
+ bio_endio(bio, -EIO);
+ else if (errors_handled(ms))
+ hold_bio(ms, bio);
+ else
+ bio_endio(bio, 0);
+ }
+}
+
+static void trigger_event(struct work_struct *work)
+{
+ struct mirror_set *ms =
+ container_of(work, struct mirror_set, trigger_event);
+
+ dm_table_event(ms->ti->table);
+}
+
+/*-----------------------------------------------------------------
+ * kmirrord
+ *---------------------------------------------------------------*/
+static void do_mirror(struct work_struct *work)
+{
+ struct mirror_set *ms = container_of(work, struct mirror_set,
+ kmirrord_work);
+ struct bio_list reads, writes, failures;
+ unsigned long flags;
+
+ spin_lock_irqsave(&ms->lock, flags);
+ reads = ms->reads;
+ writes = ms->writes;
+ failures = ms->failures;
+ bio_list_init(&ms->reads);
+ bio_list_init(&ms->writes);
+ bio_list_init(&ms->failures);
+ spin_unlock_irqrestore(&ms->lock, flags);
+
+ dm_rh_update_states(ms->rh, errors_handled(ms));
+ do_recovery(ms);
+ do_reads(ms, &reads);
+ do_writes(ms, &writes);
+ do_failures(ms, &failures);
+}
+
+/*-----------------------------------------------------------------
+ * Target functions
+ *---------------------------------------------------------------*/
+static struct mirror_set *alloc_context(unsigned int nr_mirrors,
+ uint32_t region_size,
+ struct dm_target *ti,
+ struct dm_dirty_log *dl)
+{
+ size_t len;
+ struct mirror_set *ms = NULL;
+
+ len = sizeof(*ms) + (sizeof(ms->mirror[0]) * nr_mirrors);
+
+ ms = kzalloc(len, GFP_KERNEL);
+ if (!ms) {
+ ti->error = "Cannot allocate mirror context";
+ return NULL;
+ }
+
+ spin_lock_init(&ms->lock);
+ bio_list_init(&ms->reads);
+ bio_list_init(&ms->writes);
+ bio_list_init(&ms->failures);
+ bio_list_init(&ms->holds);
+
+ ms->ti = ti;
+ ms->nr_mirrors = nr_mirrors;
+ ms->nr_regions = dm_sector_div_up(ti->len, region_size);
+ ms->in_sync = 0;
+ ms->log_failure = 0;
+ ms->leg_failure = 0;
+ atomic_set(&ms->suspend, 0);
+ atomic_set(&ms->default_mirror, DEFAULT_MIRROR);
+
+ ms->io_client = dm_io_client_create();
+ if (IS_ERR(ms->io_client)) {
+ ti->error = "Error creating dm_io client";
+ kfree(ms);
+ return NULL;
+ }
+
+ ms->rh = dm_region_hash_create(ms, dispatch_bios, wakeup_mirrord,
+ wakeup_all_recovery_waiters,
+ ms->ti->begin, MAX_RECOVERY,
+ dl, region_size, ms->nr_regions);
+ if (IS_ERR(ms->rh)) {
+ ti->error = "Error creating dirty region hash";
+ dm_io_client_destroy(ms->io_client);
+ kfree(ms);
+ return NULL;
+ }
+
+ return ms;
+}
+
+static void free_context(struct mirror_set *ms, struct dm_target *ti,
+ unsigned int m)
+{
+ while (m--)
+ dm_put_device(ti, ms->mirror[m].dev);
+
+ dm_io_client_destroy(ms->io_client);
+ dm_region_hash_destroy(ms->rh);
+ kfree(ms);
+}
+
+static int get_mirror(struct mirror_set *ms, struct dm_target *ti,
+ unsigned int mirror, char **argv)
+{
+ unsigned long long offset;
+ char dummy;
+
+ if (sscanf(argv[1], "%llu%c", &offset, &dummy) != 1) {
+ ti->error = "Invalid offset";
+ return -EINVAL;
+ }
+
+ if (dm_get_device(ti, argv[0], dm_table_get_mode(ti->table),
+ &ms->mirror[mirror].dev)) {
+ ti->error = "Device lookup failure";
+ return -ENXIO;
+ }
+
+ ms->mirror[mirror].ms = ms;
+ atomic_set(&(ms->mirror[mirror].error_count), 0);
+ ms->mirror[mirror].error_type = 0;
+ ms->mirror[mirror].offset = offset;
+
+ return 0;
+}
+
+/*
+ * Create dirty log: log_type #log_params <log_params>
+ */
+static struct dm_dirty_log *create_dirty_log(struct dm_target *ti,
+ unsigned argc, char **argv,
+ unsigned *args_used)
+{
+ unsigned param_count;
+ struct dm_dirty_log *dl;
+ char dummy;
+
+ if (argc < 2) {
+ ti->error = "Insufficient mirror log arguments";
+ return NULL;
+ }
+
+ if (sscanf(argv[1], "%u%c", &param_count, &dummy) != 1) {
+ ti->error = "Invalid mirror log argument count";
+ return NULL;
+ }
+
+ *args_used = 2 + param_count;
+
+ if (argc < *args_used) {
+ ti->error = "Insufficient mirror log arguments";
+ return NULL;
+ }
+
+ dl = dm_dirty_log_create(argv[0], ti, mirror_flush, param_count,
+ argv + 2);
+ if (!dl) {
+ ti->error = "Error creating mirror dirty log";
+ return NULL;
+ }
+
+ return dl;
+}
+
+static int parse_features(struct mirror_set *ms, unsigned argc, char **argv,
+ unsigned *args_used)
+{
+ unsigned num_features;
+ struct dm_target *ti = ms->ti;
+ char dummy;
+
+ *args_used = 0;
+
+ if (!argc)
+ return 0;
+
+ if (sscanf(argv[0], "%u%c", &num_features, &dummy) != 1) {
+ ti->error = "Invalid number of features";
+ return -EINVAL;
+ }
+
+ argc--;
+ argv++;
+ (*args_used)++;
+
+ if (num_features > argc) {
+ ti->error = "Not enough arguments to support feature count";
+ return -EINVAL;
+ }
+
+ if (!strcmp("handle_errors", argv[0]))
+ ms->features |= DM_RAID1_HANDLE_ERRORS;
+ else {
+ ti->error = "Unrecognised feature requested";
+ return -EINVAL;
+ }
+
+ (*args_used)++;
+
+ return 0;
+}
+
+/*
+ * Construct a mirror mapping:
+ *
+ * log_type #log_params <log_params>
+ * #mirrors [mirror_path offset]{2,}
+ * [#features <features>]
+ *
+ * log_type is "core" or "disk"
+ * #log_params is between 1 and 3
+ *
+ * If present, features must be "handle_errors".
+ */
+static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
+{
+ int r;
+ unsigned int nr_mirrors, m, args_used;
+ struct mirror_set *ms;
+ struct dm_dirty_log *dl;
+ char dummy;
+
+ dl = create_dirty_log(ti, argc, argv, &args_used);
+ if (!dl)
+ return -EINVAL;
+
+ argv += args_used;
+ argc -= args_used;
+
+ if (!argc || sscanf(argv[0], "%u%c", &nr_mirrors, &dummy) != 1 ||
+ nr_mirrors < 2 || nr_mirrors > DM_KCOPYD_MAX_REGIONS + 1) {
+ ti->error = "Invalid number of mirrors";
+ dm_dirty_log_destroy(dl);
+ return -EINVAL;
+ }
+
+ argv++, argc--;
+
+ if (argc < nr_mirrors * 2) {
+ ti->error = "Too few mirror arguments";
+ dm_dirty_log_destroy(dl);
+ return -EINVAL;
+ }
+
+ ms = alloc_context(nr_mirrors, dl->type->get_region_size(dl), ti, dl);
+ if (!ms) {
+ dm_dirty_log_destroy(dl);
+ return -ENOMEM;
+ }
+
+ /* Get the mirror parameter sets */
+ for (m = 0; m < nr_mirrors; m++) {
+ r = get_mirror(ms, ti, m, argv);
+ if (r) {
+ free_context(ms, ti, m);
+ return r;
+ }
+ argv += 2;
+ argc -= 2;
+ }
+
+ ti->private = ms;
+
+ r = dm_set_target_max_io_len(ti, dm_rh_get_region_size(ms->rh));
+ if (r)
+ goto err_free_context;
+
+ ti->num_flush_bios = 1;
+ ti->num_discard_bios = 1;
+ ti->per_bio_data_size = sizeof(struct dm_raid1_bio_record);
+ ti->discard_zeroes_data_unsupported = true;
+
+ ms->kmirrord_wq = alloc_workqueue("kmirrord", WQ_MEM_RECLAIM, 0);
+ if (!ms->kmirrord_wq) {
+ DMERR("couldn't start kmirrord");
+ r = -ENOMEM;
+ goto err_free_context;
+ }
+ INIT_WORK(&ms->kmirrord_work, do_mirror);
+ init_timer(&ms->timer);
+ ms->timer_pending = 0;
+ INIT_WORK(&ms->trigger_event, trigger_event);
+
+ r = parse_features(ms, argc, argv, &args_used);
+ if (r)
+ goto err_destroy_wq;
+
+ argv += args_used;
+ argc -= args_used;
+
+ /*
+ * Any read-balancing addition depends on the
+ * DM_RAID1_HANDLE_ERRORS flag being present.
+ * This is because the decision to balance depends
+ * on the sync state of a region. If the above
+ * flag is not present, we ignore errors; and
+ * the sync state may be inaccurate.
+ */
+
+ if (argc) {
+ ti->error = "Too many mirror arguments";
+ r = -EINVAL;
+ goto err_destroy_wq;
+ }
+
+ ms->kcopyd_client = dm_kcopyd_client_create(&dm_kcopyd_throttle);
+ if (IS_ERR(ms->kcopyd_client)) {
+ r = PTR_ERR(ms->kcopyd_client);
+ goto err_destroy_wq;
+ }
+
+ wakeup_mirrord(ms);
+ return 0;
+
+err_destroy_wq:
+ destroy_workqueue(ms->kmirrord_wq);
+err_free_context:
+ free_context(ms, ti, ms->nr_mirrors);
+ return r;
+}
+
+static void mirror_dtr(struct dm_target *ti)
+{
+ struct mirror_set *ms = (struct mirror_set *) ti->private;
+
+ del_timer_sync(&ms->timer);
+ flush_workqueue(ms->kmirrord_wq);
+ flush_work(&ms->trigger_event);
+ dm_kcopyd_client_destroy(ms->kcopyd_client);
+ destroy_workqueue(ms->kmirrord_wq);
+ free_context(ms, ti, ms->nr_mirrors);
+}
+
+/*
+ * Mirror mapping function
+ */
+static int mirror_map(struct dm_target *ti, struct bio *bio)
+{
+ int r, rw = bio_rw(bio);
+ struct mirror *m;
+ struct mirror_set *ms = ti->private;
+ struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh);
+ struct dm_raid1_bio_record *bio_record =
+ dm_per_bio_data(bio, sizeof(struct dm_raid1_bio_record));
+
+ bio_record->details.bi_bdev = NULL;
+
+ if (rw == WRITE) {
+ /* Save region for mirror_end_io() handler */
+ bio_record->write_region = dm_rh_bio_to_region(ms->rh, bio);
+ queue_bio(ms, bio, rw);
+ return DM_MAPIO_SUBMITTED;
+ }
+
+ r = log->type->in_sync(log, dm_rh_bio_to_region(ms->rh, bio), 0);
+ if (r < 0 && r != -EWOULDBLOCK)
+ return r;
+
+ /*
+ * If region is not in-sync queue the bio.
+ */
+ if (!r || (r == -EWOULDBLOCK)) {
+ if (rw == READA)
+ return -EWOULDBLOCK;
+
+ queue_bio(ms, bio, rw);
+ return DM_MAPIO_SUBMITTED;
+ }
+
+ /*
+ * The region is in-sync and we can perform reads directly.
+ * Store enough information so we can retry if it fails.
+ */
+ m = choose_mirror(ms, bio->bi_iter.bi_sector);
+ if (unlikely(!m))
+ return -EIO;
+
+ dm_bio_record(&bio_record->details, bio);
+ bio_record->m = m;
+
+ map_bio(m, bio);
+
+ return DM_MAPIO_REMAPPED;
+}
+
+static int mirror_end_io(struct dm_target *ti, struct bio *bio, int error)
+{
+ int rw = bio_rw(bio);
+ struct mirror_set *ms = (struct mirror_set *) ti->private;
+ struct mirror *m = NULL;
+ struct dm_bio_details *bd = NULL;
+ struct dm_raid1_bio_record *bio_record =
+ dm_per_bio_data(bio, sizeof(struct dm_raid1_bio_record));
+
+ /*
+ * We need to dec pending if this was a write.
+ */
+ if (rw == WRITE) {
+ if (!(bio->bi_rw & (REQ_FLUSH | REQ_DISCARD)))
+ dm_rh_dec(ms->rh, bio_record->write_region);
+ return error;
+ }
+
+ if (error == -EOPNOTSUPP)
+ goto out;
+
+ if ((error == -EWOULDBLOCK) && (bio->bi_rw & REQ_RAHEAD))
+ goto out;
+
+ if (unlikely(error)) {
+ if (!bio_record->details.bi_bdev) {
+ /*
+ * There wasn't enough memory to record necessary
+ * information for a retry or there was no other
+ * mirror in-sync.
+ */
+ DMERR_LIMIT("Mirror read failed.");
+ return -EIO;
+ }
+
+ m = bio_record->m;
+
+ DMERR("Mirror read failed from %s. Trying alternative device.",
+ m->dev->name);
+
+ fail_mirror(m, DM_RAID1_READ_ERROR);
+
+ /*
+ * A failed read is requeued for another attempt using an intact
+ * mirror.
+ */
+ if (default_ok(m) || mirror_available(ms, bio)) {
+ bd = &bio_record->details;
+
+ dm_bio_restore(bd, bio);
+ bio_record->details.bi_bdev = NULL;
+
+ atomic_inc(&bio->bi_remaining);
+
+ queue_bio(ms, bio, rw);
+ return DM_ENDIO_INCOMPLETE;
+ }
+ DMERR("All replicated volumes dead, failing I/O");
+ }
+
+out:
+ bio_record->details.bi_bdev = NULL;
+
+ return error;
+}
+
+static void mirror_presuspend(struct dm_target *ti)
+{
+ struct mirror_set *ms = (struct mirror_set *) ti->private;
+ struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh);
+
+ struct bio_list holds;
+ struct bio *bio;
+
+ atomic_set(&ms->suspend, 1);
+
+ /*
+ * Process bios in the hold list to start recovery waiting
+ * for bios in the hold list. After the process, no bio has
+ * a chance to be added in the hold list because ms->suspend
+ * is set.
+ */
+ spin_lock_irq(&ms->lock);
+ holds = ms->holds;
+ bio_list_init(&ms->holds);
+ spin_unlock_irq(&ms->lock);
+
+ while ((bio = bio_list_pop(&holds)))
+ hold_bio(ms, bio);
+
+ /*
+ * We must finish up all the work that we've
+ * generated (i.e. recovery work).
+ */
+ dm_rh_stop_recovery(ms->rh);
+
+ wait_event(_kmirrord_recovery_stopped,
+ !dm_rh_recovery_in_flight(ms->rh));
+
+ if (log->type->presuspend && log->type->presuspend(log))
+ /* FIXME: need better error handling */
+ DMWARN("log presuspend failed");
+
+ /*
+ * Now that recovery is complete/stopped and the
+ * delayed bios are queued, we need to wait for
+ * the worker thread to complete. This way,
+ * we know that all of our I/O has been pushed.
+ */
+ flush_workqueue(ms->kmirrord_wq);
+}
+
+static void mirror_postsuspend(struct dm_target *ti)
+{
+ struct mirror_set *ms = ti->private;
+ struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh);
+
+ if (log->type->postsuspend && log->type->postsuspend(log))
+ /* FIXME: need better error handling */
+ DMWARN("log postsuspend failed");
+}
+
+static void mirror_resume(struct dm_target *ti)
+{
+ struct mirror_set *ms = ti->private;
+ struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh);
+
+ atomic_set(&ms->suspend, 0);
+ if (log->type->resume && log->type->resume(log))
+ /* FIXME: need better error handling */
+ DMWARN("log resume failed");
+ dm_rh_start_recovery(ms->rh);
+}
+
+/*
+ * device_status_char
+ * @m: mirror device/leg we want the status of
+ *
+ * We return one character representing the most severe error
+ * we have encountered.
+ * A => Alive - No failures
+ * D => Dead - A write failure occurred leaving mirror out-of-sync
+ * S => Sync - A sychronization failure occurred, mirror out-of-sync
+ * R => Read - A read failure occurred, mirror data unaffected
+ *
+ * Returns: <char>
+ */
+static char device_status_char(struct mirror *m)
+{
+ if (!atomic_read(&(m->error_count)))
+ return 'A';
+
+ return (test_bit(DM_RAID1_FLUSH_ERROR, &(m->error_type))) ? 'F' :
+ (test_bit(DM_RAID1_WRITE_ERROR, &(m->error_type))) ? 'D' :
+ (test_bit(DM_RAID1_SYNC_ERROR, &(m->error_type))) ? 'S' :
+ (test_bit(DM_RAID1_READ_ERROR, &(m->error_type))) ? 'R' : 'U';
+}
+
+
+static void mirror_status(struct dm_target *ti, status_type_t type,
+ unsigned status_flags, char *result, unsigned maxlen)
+{
+ unsigned int m, sz = 0;
+ struct mirror_set *ms = (struct mirror_set *) ti->private;
+ struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh);
+ char buffer[ms->nr_mirrors + 1];
+
+ switch (type) {
+ case STATUSTYPE_INFO:
+ DMEMIT("%d ", ms->nr_mirrors);
+ for (m = 0; m < ms->nr_mirrors; m++) {
+ DMEMIT("%s ", ms->mirror[m].dev->name);
+ buffer[m] = device_status_char(&(ms->mirror[m]));
+ }
+ buffer[m] = '\0';
+
+ DMEMIT("%llu/%llu 1 %s ",
+ (unsigned long long)log->type->get_sync_count(log),
+ (unsigned long long)ms->nr_regions, buffer);
+
+ sz += log->type->status(log, type, result+sz, maxlen-sz);
+
+ break;
+
+ case STATUSTYPE_TABLE:
+ sz = log->type->status(log, type, result, maxlen);
+
+ DMEMIT("%d", ms->nr_mirrors);
+ for (m = 0; m < ms->nr_mirrors; m++)
+ DMEMIT(" %s %llu", ms->mirror[m].dev->name,
+ (unsigned long long)ms->mirror[m].offset);
+
+ if (ms->features & DM_RAID1_HANDLE_ERRORS)
+ DMEMIT(" 1 handle_errors");
+ }
+}
+
+static int mirror_iterate_devices(struct dm_target *ti,
+ iterate_devices_callout_fn fn, void *data)
+{
+ struct mirror_set *ms = ti->private;
+ int ret = 0;
+ unsigned i;
+
+ for (i = 0; !ret && i < ms->nr_mirrors; i++)
+ ret = fn(ti, ms->mirror[i].dev,
+ ms->mirror[i].offset, ti->len, data);
+
+ return ret;
+}
+
+static struct target_type mirror_target = {
+ .name = "mirror",
+ .version = {1, 13, 2},
+ .module = THIS_MODULE,
+ .ctr = mirror_ctr,
+ .dtr = mirror_dtr,
+ .map = mirror_map,
+ .end_io = mirror_end_io,
+ .presuspend = mirror_presuspend,
+ .postsuspend = mirror_postsuspend,
+ .resume = mirror_resume,
+ .status = mirror_status,
+ .iterate_devices = mirror_iterate_devices,
+};
+
+static int __init dm_mirror_init(void)
+{
+ int r;
+
+ r = dm_register_target(&mirror_target);
+ if (r < 0) {
+ DMERR("Failed to register mirror target");
+ goto bad_target;
+ }
+
+ return 0;
+
+bad_target:
+ return r;
+}
+
+static void __exit dm_mirror_exit(void)
+{
+ dm_unregister_target(&mirror_target);
+}
+
+/* Module hooks */
+module_init(dm_mirror_init);
+module_exit(dm_mirror_exit);
+
+MODULE_DESCRIPTION(DM_NAME " mirror target");
+MODULE_AUTHOR("Joe Thornber");
+MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm-region-hash.c b/drivers/md/dm-region-hash.c
new file mode 100644
index 000000000..b929fd5f4
--- /dev/null
+++ b/drivers/md/dm-region-hash.c
@@ -0,0 +1,724 @@
+/*
+ * Copyright (C) 2003 Sistina Software Limited.
+ * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
+ *
+ * This file is released under the GPL.
+ */
+
+#include <linux/dm-dirty-log.h>
+#include <linux/dm-region-hash.h>
+
+#include <linux/ctype.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+
+#include "dm.h"
+
+#define DM_MSG_PREFIX "region hash"
+
+/*-----------------------------------------------------------------
+ * Region hash
+ *
+ * The mirror splits itself up into discrete regions. Each
+ * region can be in one of three states: clean, dirty,
+ * nosync. There is no need to put clean regions in the hash.
+ *
+ * In addition to being present in the hash table a region _may_
+ * be present on one of three lists.
+ *
+ * clean_regions: Regions on this list have no io pending to
+ * them, they are in sync, we are no longer interested in them,
+ * they are dull. dm_rh_update_states() will remove them from the
+ * hash table.
+ *
+ * quiesced_regions: These regions have been spun down, ready
+ * for recovery. rh_recovery_start() will remove regions from
+ * this list and hand them to kmirrord, which will schedule the
+ * recovery io with kcopyd.
+ *
+ * recovered_regions: Regions that kcopyd has successfully
+ * recovered. dm_rh_update_states() will now schedule any delayed
+ * io, up the recovery_count, and remove the region from the
+ * hash.
+ *
+ * There are 2 locks:
+ * A rw spin lock 'hash_lock' protects just the hash table,
+ * this is never held in write mode from interrupt context,
+ * which I believe means that we only have to disable irqs when
+ * doing a write lock.
+ *
+ * An ordinary spin lock 'region_lock' that protects the three
+ * lists in the region_hash, with the 'state', 'list' and
+ * 'delayed_bios' fields of the regions. This is used from irq
+ * context, so all other uses will have to suspend local irqs.
+ *---------------------------------------------------------------*/
+struct dm_region_hash {
+ uint32_t region_size;
+ unsigned region_shift;
+
+ /* holds persistent region state */
+ struct dm_dirty_log *log;
+
+ /* hash table */
+ rwlock_t hash_lock;
+ mempool_t *region_pool;
+ unsigned mask;
+ unsigned nr_buckets;
+ unsigned prime;
+ unsigned shift;
+ struct list_head *buckets;
+
+ unsigned max_recovery; /* Max # of regions to recover in parallel */
+
+ spinlock_t region_lock;
+ atomic_t recovery_in_flight;
+ struct semaphore recovery_count;
+ struct list_head clean_regions;
+ struct list_head quiesced_regions;
+ struct list_head recovered_regions;
+ struct list_head failed_recovered_regions;
+
+ /*
+ * If there was a flush failure no regions can be marked clean.
+ */
+ int flush_failure;
+
+ void *context;
+ sector_t target_begin;
+
+ /* Callback function to schedule bios writes */
+ void (*dispatch_bios)(void *context, struct bio_list *bios);
+
+ /* Callback function to wakeup callers worker thread. */
+ void (*wakeup_workers)(void *context);
+
+ /* Callback function to wakeup callers recovery waiters. */
+ void (*wakeup_all_recovery_waiters)(void *context);
+};
+
+struct dm_region {
+ struct dm_region_hash *rh; /* FIXME: can we get rid of this ? */
+ region_t key;
+ int state;
+
+ struct list_head hash_list;
+ struct list_head list;
+
+ atomic_t pending;
+ struct bio_list delayed_bios;
+};
+
+/*
+ * Conversion fns
+ */
+static region_t dm_rh_sector_to_region(struct dm_region_hash *rh, sector_t sector)
+{
+ return sector >> rh->region_shift;
+}
+
+sector_t dm_rh_region_to_sector(struct dm_region_hash *rh, region_t region)
+{
+ return region << rh->region_shift;
+}
+EXPORT_SYMBOL_GPL(dm_rh_region_to_sector);
+
+region_t dm_rh_bio_to_region(struct dm_region_hash *rh, struct bio *bio)
+{
+ return dm_rh_sector_to_region(rh, bio->bi_iter.bi_sector -
+ rh->target_begin);
+}
+EXPORT_SYMBOL_GPL(dm_rh_bio_to_region);
+
+void *dm_rh_region_context(struct dm_region *reg)
+{
+ return reg->rh->context;
+}
+EXPORT_SYMBOL_GPL(dm_rh_region_context);
+
+region_t dm_rh_get_region_key(struct dm_region *reg)
+{
+ return reg->key;
+}
+EXPORT_SYMBOL_GPL(dm_rh_get_region_key);
+
+sector_t dm_rh_get_region_size(struct dm_region_hash *rh)
+{
+ return rh->region_size;
+}
+EXPORT_SYMBOL_GPL(dm_rh_get_region_size);
+
+/*
+ * FIXME: shall we pass in a structure instead of all these args to
+ * dm_region_hash_create()????
+ */
+#define RH_HASH_MULT 2654435387U
+#define RH_HASH_SHIFT 12
+
+#define MIN_REGIONS 64
+struct dm_region_hash *dm_region_hash_create(
+ void *context, void (*dispatch_bios)(void *context,
+ struct bio_list *bios),
+ void (*wakeup_workers)(void *context),
+ void (*wakeup_all_recovery_waiters)(void *context),
+ sector_t target_begin, unsigned max_recovery,
+ struct dm_dirty_log *log, uint32_t region_size,
+ region_t nr_regions)
+{
+ struct dm_region_hash *rh;
+ unsigned nr_buckets, max_buckets;
+ size_t i;
+
+ /*
+ * Calculate a suitable number of buckets for our hash
+ * table.
+ */
+ max_buckets = nr_regions >> 6;
+ for (nr_buckets = 128u; nr_buckets < max_buckets; nr_buckets <<= 1)
+ ;
+ nr_buckets >>= 1;
+
+ rh = kmalloc(sizeof(*rh), GFP_KERNEL);
+ if (!rh) {
+ DMERR("unable to allocate region hash memory");
+ return ERR_PTR(-ENOMEM);
+ }
+
+ rh->context = context;
+ rh->dispatch_bios = dispatch_bios;
+ rh->wakeup_workers = wakeup_workers;
+ rh->wakeup_all_recovery_waiters = wakeup_all_recovery_waiters;
+ rh->target_begin = target_begin;
+ rh->max_recovery = max_recovery;
+ rh->log = log;
+ rh->region_size = region_size;
+ rh->region_shift = ffs(region_size) - 1;
+ rwlock_init(&rh->hash_lock);
+ rh->mask = nr_buckets - 1;
+ rh->nr_buckets = nr_buckets;
+
+ rh->shift = RH_HASH_SHIFT;
+ rh->prime = RH_HASH_MULT;
+
+ rh->buckets = vmalloc(nr_buckets * sizeof(*rh->buckets));
+ if (!rh->buckets) {
+ DMERR("unable to allocate region hash bucket memory");
+ kfree(rh);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ for (i = 0; i < nr_buckets; i++)
+ INIT_LIST_HEAD(rh->buckets + i);
+
+ spin_lock_init(&rh->region_lock);
+ sema_init(&rh->recovery_count, 0);
+ atomic_set(&rh->recovery_in_flight, 0);
+ INIT_LIST_HEAD(&rh->clean_regions);
+ INIT_LIST_HEAD(&rh->quiesced_regions);
+ INIT_LIST_HEAD(&rh->recovered_regions);
+ INIT_LIST_HEAD(&rh->failed_recovered_regions);
+ rh->flush_failure = 0;
+
+ rh->region_pool = mempool_create_kmalloc_pool(MIN_REGIONS,
+ sizeof(struct dm_region));
+ if (!rh->region_pool) {
+ vfree(rh->buckets);
+ kfree(rh);
+ rh = ERR_PTR(-ENOMEM);
+ }
+
+ return rh;
+}
+EXPORT_SYMBOL_GPL(dm_region_hash_create);
+
+void dm_region_hash_destroy(struct dm_region_hash *rh)
+{
+ unsigned h;
+ struct dm_region *reg, *nreg;
+
+ BUG_ON(!list_empty(&rh->quiesced_regions));
+ for (h = 0; h < rh->nr_buckets; h++) {
+ list_for_each_entry_safe(reg, nreg, rh->buckets + h,
+ hash_list) {
+ BUG_ON(atomic_read(&reg->pending));
+ mempool_free(reg, rh->region_pool);
+ }
+ }
+
+ if (rh->log)
+ dm_dirty_log_destroy(rh->log);
+
+ if (rh->region_pool)
+ mempool_destroy(rh->region_pool);
+
+ vfree(rh->buckets);
+ kfree(rh);
+}
+EXPORT_SYMBOL_GPL(dm_region_hash_destroy);
+
+struct dm_dirty_log *dm_rh_dirty_log(struct dm_region_hash *rh)
+{
+ return rh->log;
+}
+EXPORT_SYMBOL_GPL(dm_rh_dirty_log);
+
+static unsigned rh_hash(struct dm_region_hash *rh, region_t region)
+{
+ return (unsigned) ((region * rh->prime) >> rh->shift) & rh->mask;
+}
+
+static struct dm_region *__rh_lookup(struct dm_region_hash *rh, region_t region)
+{
+ struct dm_region *reg;
+ struct list_head *bucket = rh->buckets + rh_hash(rh, region);
+
+ list_for_each_entry(reg, bucket, hash_list)
+ if (reg->key == region)
+ return reg;
+
+ return NULL;
+}
+
+static void __rh_insert(struct dm_region_hash *rh, struct dm_region *reg)
+{
+ list_add(&reg->hash_list, rh->buckets + rh_hash(rh, reg->key));
+}
+
+static struct dm_region *__rh_alloc(struct dm_region_hash *rh, region_t region)
+{
+ struct dm_region *reg, *nreg;
+
+ nreg = mempool_alloc(rh->region_pool, GFP_ATOMIC);
+ if (unlikely(!nreg))
+ nreg = kmalloc(sizeof(*nreg), GFP_NOIO | __GFP_NOFAIL);
+
+ nreg->state = rh->log->type->in_sync(rh->log, region, 1) ?
+ DM_RH_CLEAN : DM_RH_NOSYNC;
+ nreg->rh = rh;
+ nreg->key = region;
+ INIT_LIST_HEAD(&nreg->list);
+ atomic_set(&nreg->pending, 0);
+ bio_list_init(&nreg->delayed_bios);
+
+ write_lock_irq(&rh->hash_lock);
+ reg = __rh_lookup(rh, region);
+ if (reg)
+ /* We lost the race. */
+ mempool_free(nreg, rh->region_pool);
+ else {
+ __rh_insert(rh, nreg);
+ if (nreg->state == DM_RH_CLEAN) {
+ spin_lock(&rh->region_lock);
+ list_add(&nreg->list, &rh->clean_regions);
+ spin_unlock(&rh->region_lock);
+ }
+
+ reg = nreg;
+ }
+ write_unlock_irq(&rh->hash_lock);
+
+ return reg;
+}
+
+static struct dm_region *__rh_find(struct dm_region_hash *rh, region_t region)
+{
+ struct dm_region *reg;
+
+ reg = __rh_lookup(rh, region);
+ if (!reg) {
+ read_unlock(&rh->hash_lock);
+ reg = __rh_alloc(rh, region);
+ read_lock(&rh->hash_lock);
+ }
+
+ return reg;
+}
+
+int dm_rh_get_state(struct dm_region_hash *rh, region_t region, int may_block)
+{
+ int r;
+ struct dm_region *reg;
+
+ read_lock(&rh->hash_lock);
+ reg = __rh_lookup(rh, region);
+ read_unlock(&rh->hash_lock);
+
+ if (reg)
+ return reg->state;
+
+ /*
+ * The region wasn't in the hash, so we fall back to the
+ * dirty log.
+ */
+ r = rh->log->type->in_sync(rh->log, region, may_block);
+
+ /*
+ * Any error from the dirty log (eg. -EWOULDBLOCK) gets
+ * taken as a DM_RH_NOSYNC
+ */
+ return r == 1 ? DM_RH_CLEAN : DM_RH_NOSYNC;
+}
+EXPORT_SYMBOL_GPL(dm_rh_get_state);
+
+static void complete_resync_work(struct dm_region *reg, int success)
+{
+ struct dm_region_hash *rh = reg->rh;
+
+ rh->log->type->set_region_sync(rh->log, reg->key, success);
+
+ /*
+ * Dispatch the bios before we call 'wake_up_all'.
+ * This is important because if we are suspending,
+ * we want to know that recovery is complete and
+ * the work queue is flushed. If we wake_up_all
+ * before we dispatch_bios (queue bios and call wake()),
+ * then we risk suspending before the work queue
+ * has been properly flushed.
+ */
+ rh->dispatch_bios(rh->context, &reg->delayed_bios);
+ if (atomic_dec_and_test(&rh->recovery_in_flight))
+ rh->wakeup_all_recovery_waiters(rh->context);
+ up(&rh->recovery_count);
+}
+
+/* dm_rh_mark_nosync
+ * @ms
+ * @bio
+ *
+ * The bio was written on some mirror(s) but failed on other mirror(s).
+ * We can successfully endio the bio but should avoid the region being
+ * marked clean by setting the state DM_RH_NOSYNC.
+ *
+ * This function is _not_ safe in interrupt context!
+ */
+void dm_rh_mark_nosync(struct dm_region_hash *rh, struct bio *bio)
+{
+ unsigned long flags;
+ struct dm_dirty_log *log = rh->log;
+ struct dm_region *reg;
+ region_t region = dm_rh_bio_to_region(rh, bio);
+ int recovering = 0;
+
+ if (bio->bi_rw & REQ_FLUSH) {
+ rh->flush_failure = 1;
+ return;
+ }
+
+ if (bio->bi_rw & REQ_DISCARD)
+ return;
+
+ /* We must inform the log that the sync count has changed. */
+ log->type->set_region_sync(log, region, 0);
+
+ read_lock(&rh->hash_lock);
+ reg = __rh_find(rh, region);
+ read_unlock(&rh->hash_lock);
+
+ /* region hash entry should exist because write was in-flight */
+ BUG_ON(!reg);
+ BUG_ON(!list_empty(&reg->list));
+
+ spin_lock_irqsave(&rh->region_lock, flags);
+ /*
+ * Possible cases:
+ * 1) DM_RH_DIRTY
+ * 2) DM_RH_NOSYNC: was dirty, other preceding writes failed
+ * 3) DM_RH_RECOVERING: flushing pending writes
+ * Either case, the region should have not been connected to list.
+ */
+ recovering = (reg->state == DM_RH_RECOVERING);
+ reg->state = DM_RH_NOSYNC;
+ BUG_ON(!list_empty(&reg->list));
+ spin_unlock_irqrestore(&rh->region_lock, flags);
+
+ if (recovering)
+ complete_resync_work(reg, 0);
+}
+EXPORT_SYMBOL_GPL(dm_rh_mark_nosync);
+
+void dm_rh_update_states(struct dm_region_hash *rh, int errors_handled)
+{
+ struct dm_region *reg, *next;
+
+ LIST_HEAD(clean);
+ LIST_HEAD(recovered);
+ LIST_HEAD(failed_recovered);
+
+ /*
+ * Quickly grab the lists.
+ */
+ write_lock_irq(&rh->hash_lock);
+ spin_lock(&rh->region_lock);
+ if (!list_empty(&rh->clean_regions)) {
+ list_splice_init(&rh->clean_regions, &clean);
+
+ list_for_each_entry(reg, &clean, list)
+ list_del(&reg->hash_list);
+ }
+
+ if (!list_empty(&rh->recovered_regions)) {
+ list_splice_init(&rh->recovered_regions, &recovered);
+
+ list_for_each_entry(reg, &recovered, list)
+ list_del(&reg->hash_list);
+ }
+
+ if (!list_empty(&rh->failed_recovered_regions)) {
+ list_splice_init(&rh->failed_recovered_regions,
+ &failed_recovered);
+
+ list_for_each_entry(reg, &failed_recovered, list)
+ list_del(&reg->hash_list);
+ }
+
+ spin_unlock(&rh->region_lock);
+ write_unlock_irq(&rh->hash_lock);
+
+ /*
+ * All the regions on the recovered and clean lists have
+ * now been pulled out of the system, so no need to do
+ * any more locking.
+ */
+ list_for_each_entry_safe(reg, next, &recovered, list) {
+ rh->log->type->clear_region(rh->log, reg->key);
+ complete_resync_work(reg, 1);
+ mempool_free(reg, rh->region_pool);
+ }
+
+ list_for_each_entry_safe(reg, next, &failed_recovered, list) {
+ complete_resync_work(reg, errors_handled ? 0 : 1);
+ mempool_free(reg, rh->region_pool);
+ }
+
+ list_for_each_entry_safe(reg, next, &clean, list) {
+ rh->log->type->clear_region(rh->log, reg->key);
+ mempool_free(reg, rh->region_pool);
+ }
+
+ rh->log->type->flush(rh->log);
+}
+EXPORT_SYMBOL_GPL(dm_rh_update_states);
+
+static void rh_inc(struct dm_region_hash *rh, region_t region)
+{
+ struct dm_region *reg;
+
+ read_lock(&rh->hash_lock);
+ reg = __rh_find(rh, region);
+
+ spin_lock_irq(&rh->region_lock);
+ atomic_inc(&reg->pending);
+
+ if (reg->state == DM_RH_CLEAN) {
+ reg->state = DM_RH_DIRTY;
+ list_del_init(&reg->list); /* take off the clean list */
+ spin_unlock_irq(&rh->region_lock);
+
+ rh->log->type->mark_region(rh->log, reg->key);
+ } else
+ spin_unlock_irq(&rh->region_lock);
+
+
+ read_unlock(&rh->hash_lock);
+}
+
+void dm_rh_inc_pending(struct dm_region_hash *rh, struct bio_list *bios)
+{
+ struct bio *bio;
+
+ for (bio = bios->head; bio; bio = bio->bi_next) {
+ if (bio->bi_rw & (REQ_FLUSH | REQ_DISCARD))
+ continue;
+ rh_inc(rh, dm_rh_bio_to_region(rh, bio));
+ }
+}
+EXPORT_SYMBOL_GPL(dm_rh_inc_pending);
+
+void dm_rh_dec(struct dm_region_hash *rh, region_t region)
+{
+ unsigned long flags;
+ struct dm_region *reg;
+ int should_wake = 0;
+
+ read_lock(&rh->hash_lock);
+ reg = __rh_lookup(rh, region);
+ read_unlock(&rh->hash_lock);
+
+ spin_lock_irqsave(&rh->region_lock, flags);
+ if (atomic_dec_and_test(&reg->pending)) {
+ /*
+ * There is no pending I/O for this region.
+ * We can move the region to corresponding list for next action.
+ * At this point, the region is not yet connected to any list.
+ *
+ * If the state is DM_RH_NOSYNC, the region should be kept off
+ * from clean list.
+ * The hash entry for DM_RH_NOSYNC will remain in memory
+ * until the region is recovered or the map is reloaded.
+ */
+
+ /* do nothing for DM_RH_NOSYNC */
+ if (unlikely(rh->flush_failure)) {
+ /*
+ * If a write flush failed some time ago, we
+ * don't know whether or not this write made it
+ * to the disk, so we must resync the device.
+ */
+ reg->state = DM_RH_NOSYNC;
+ } else if (reg->state == DM_RH_RECOVERING) {
+ list_add_tail(&reg->list, &rh->quiesced_regions);
+ } else if (reg->state == DM_RH_DIRTY) {
+ reg->state = DM_RH_CLEAN;
+ list_add(&reg->list, &rh->clean_regions);
+ }
+ should_wake = 1;
+ }
+ spin_unlock_irqrestore(&rh->region_lock, flags);
+
+ if (should_wake)
+ rh->wakeup_workers(rh->context);
+}
+EXPORT_SYMBOL_GPL(dm_rh_dec);
+
+/*
+ * Starts quiescing a region in preparation for recovery.
+ */
+static int __rh_recovery_prepare(struct dm_region_hash *rh)
+{
+ int r;
+ region_t region;
+ struct dm_region *reg;
+
+ /*
+ * Ask the dirty log what's next.
+ */
+ r = rh->log->type->get_resync_work(rh->log, &region);
+ if (r <= 0)
+ return r;
+
+ /*
+ * Get this region, and start it quiescing by setting the
+ * recovering flag.
+ */
+ read_lock(&rh->hash_lock);
+ reg = __rh_find(rh, region);
+ read_unlock(&rh->hash_lock);
+
+ spin_lock_irq(&rh->region_lock);
+ reg->state = DM_RH_RECOVERING;
+
+ /* Already quiesced ? */
+ if (atomic_read(&reg->pending))
+ list_del_init(&reg->list);
+ else
+ list_move(&reg->list, &rh->quiesced_regions);
+
+ spin_unlock_irq(&rh->region_lock);
+
+ return 1;
+}
+
+void dm_rh_recovery_prepare(struct dm_region_hash *rh)
+{
+ /* Extra reference to avoid race with dm_rh_stop_recovery */
+ atomic_inc(&rh->recovery_in_flight);
+
+ while (!down_trylock(&rh->recovery_count)) {
+ atomic_inc(&rh->recovery_in_flight);
+ if (__rh_recovery_prepare(rh) <= 0) {
+ atomic_dec(&rh->recovery_in_flight);
+ up(&rh->recovery_count);
+ break;
+ }
+ }
+
+ /* Drop the extra reference */
+ if (atomic_dec_and_test(&rh->recovery_in_flight))
+ rh->wakeup_all_recovery_waiters(rh->context);
+}
+EXPORT_SYMBOL_GPL(dm_rh_recovery_prepare);
+
+/*
+ * Returns any quiesced regions.
+ */
+struct dm_region *dm_rh_recovery_start(struct dm_region_hash *rh)
+{
+ struct dm_region *reg = NULL;
+
+ spin_lock_irq(&rh->region_lock);
+ if (!list_empty(&rh->quiesced_regions)) {
+ reg = list_entry(rh->quiesced_regions.next,
+ struct dm_region, list);
+ list_del_init(&reg->list); /* remove from the quiesced list */
+ }
+ spin_unlock_irq(&rh->region_lock);
+
+ return reg;
+}
+EXPORT_SYMBOL_GPL(dm_rh_recovery_start);
+
+void dm_rh_recovery_end(struct dm_region *reg, int success)
+{
+ struct dm_region_hash *rh = reg->rh;
+
+ spin_lock_irq(&rh->region_lock);
+ if (success)
+ list_add(&reg->list, &reg->rh->recovered_regions);
+ else
+ list_add(&reg->list, &reg->rh->failed_recovered_regions);
+
+ spin_unlock_irq(&rh->region_lock);
+
+ rh->wakeup_workers(rh->context);
+}
+EXPORT_SYMBOL_GPL(dm_rh_recovery_end);
+
+/* Return recovery in flight count. */
+int dm_rh_recovery_in_flight(struct dm_region_hash *rh)
+{
+ return atomic_read(&rh->recovery_in_flight);
+}
+EXPORT_SYMBOL_GPL(dm_rh_recovery_in_flight);
+
+int dm_rh_flush(struct dm_region_hash *rh)
+{
+ return rh->log->type->flush(rh->log);
+}
+EXPORT_SYMBOL_GPL(dm_rh_flush);
+
+void dm_rh_delay(struct dm_region_hash *rh, struct bio *bio)
+{
+ struct dm_region *reg;
+
+ read_lock(&rh->hash_lock);
+ reg = __rh_find(rh, dm_rh_bio_to_region(rh, bio));
+ bio_list_add(&reg->delayed_bios, bio);
+ read_unlock(&rh->hash_lock);
+}
+EXPORT_SYMBOL_GPL(dm_rh_delay);
+
+void dm_rh_stop_recovery(struct dm_region_hash *rh)
+{
+ int i;
+
+ /* wait for any recovering regions */
+ for (i = 0; i < rh->max_recovery; i++)
+ down(&rh->recovery_count);
+}
+EXPORT_SYMBOL_GPL(dm_rh_stop_recovery);
+
+void dm_rh_start_recovery(struct dm_region_hash *rh)
+{
+ int i;
+
+ for (i = 0; i < rh->max_recovery; i++)
+ up(&rh->recovery_count);
+
+ rh->wakeup_workers(rh->context);
+}
+EXPORT_SYMBOL_GPL(dm_rh_start_recovery);
+
+MODULE_DESCRIPTION(DM_NAME " region hash");
+MODULE_AUTHOR("Joe Thornber/Heinz Mauelshagen <dm-devel@redhat.com>");
+MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm-round-robin.c b/drivers/md/dm-round-robin.c
new file mode 100644
index 000000000..6ab1192cd
--- /dev/null
+++ b/drivers/md/dm-round-robin.c
@@ -0,0 +1,219 @@
+/*
+ * Copyright (C) 2003 Sistina Software.
+ * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
+ *
+ * Module Author: Heinz Mauelshagen
+ *
+ * This file is released under the GPL.
+ *
+ * Round-robin path selector.
+ */
+
+#include <linux/device-mapper.h>
+
+#include "dm-path-selector.h"
+
+#include <linux/slab.h>
+#include <linux/module.h>
+
+#define DM_MSG_PREFIX "multipath round-robin"
+
+/*-----------------------------------------------------------------
+ * Path-handling code, paths are held in lists
+ *---------------------------------------------------------------*/
+struct path_info {
+ struct list_head list;
+ struct dm_path *path;
+ unsigned repeat_count;
+};
+
+static void free_paths(struct list_head *paths)
+{
+ struct path_info *pi, *next;
+
+ list_for_each_entry_safe(pi, next, paths, list) {
+ list_del(&pi->list);
+ kfree(pi);
+ }
+}
+
+/*-----------------------------------------------------------------
+ * Round-robin selector
+ *---------------------------------------------------------------*/
+
+#define RR_MIN_IO 1000
+
+struct selector {
+ struct list_head valid_paths;
+ struct list_head invalid_paths;
+};
+
+static struct selector *alloc_selector(void)
+{
+ struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL);
+
+ if (s) {
+ INIT_LIST_HEAD(&s->valid_paths);
+ INIT_LIST_HEAD(&s->invalid_paths);
+ }
+
+ return s;
+}
+
+static int rr_create(struct path_selector *ps, unsigned argc, char **argv)
+{
+ struct selector *s;
+
+ s = alloc_selector();
+ if (!s)
+ return -ENOMEM;
+
+ ps->context = s;
+ return 0;
+}
+
+static void rr_destroy(struct path_selector *ps)
+{
+ struct selector *s = (struct selector *) ps->context;
+
+ free_paths(&s->valid_paths);
+ free_paths(&s->invalid_paths);
+ kfree(s);
+ ps->context = NULL;
+}
+
+static int rr_status(struct path_selector *ps, struct dm_path *path,
+ status_type_t type, char *result, unsigned int maxlen)
+{
+ struct path_info *pi;
+ int sz = 0;
+
+ if (!path)
+ DMEMIT("0 ");
+ else {
+ switch(type) {
+ case STATUSTYPE_INFO:
+ break;
+ case STATUSTYPE_TABLE:
+ pi = path->pscontext;
+ DMEMIT("%u ", pi->repeat_count);
+ break;
+ }
+ }
+
+ return sz;
+}
+
+/*
+ * Called during initialisation to register each path with an
+ * optional repeat_count.
+ */
+static int rr_add_path(struct path_selector *ps, struct dm_path *path,
+ int argc, char **argv, char **error)
+{
+ struct selector *s = (struct selector *) ps->context;
+ struct path_info *pi;
+ unsigned repeat_count = RR_MIN_IO;
+ char dummy;
+
+ if (argc > 1) {
+ *error = "round-robin ps: incorrect number of arguments";
+ return -EINVAL;
+ }
+
+ /* First path argument is number of I/Os before switching path */
+ if ((argc == 1) && (sscanf(argv[0], "%u%c", &repeat_count, &dummy) != 1)) {
+ *error = "round-robin ps: invalid repeat count";
+ return -EINVAL;
+ }
+
+ /* allocate the path */
+ pi = kmalloc(sizeof(*pi), GFP_KERNEL);
+ if (!pi) {
+ *error = "round-robin ps: Error allocating path context";
+ return -ENOMEM;
+ }
+
+ pi->path = path;
+ pi->repeat_count = repeat_count;
+
+ path->pscontext = pi;
+
+ list_add_tail(&pi->list, &s->valid_paths);
+
+ return 0;
+}
+
+static void rr_fail_path(struct path_selector *ps, struct dm_path *p)
+{
+ struct selector *s = (struct selector *) ps->context;
+ struct path_info *pi = p->pscontext;
+
+ list_move(&pi->list, &s->invalid_paths);
+}
+
+static int rr_reinstate_path(struct path_selector *ps, struct dm_path *p)
+{
+ struct selector *s = (struct selector *) ps->context;
+ struct path_info *pi = p->pscontext;
+
+ list_move(&pi->list, &s->valid_paths);
+
+ return 0;
+}
+
+static struct dm_path *rr_select_path(struct path_selector *ps,
+ unsigned *repeat_count, size_t nr_bytes)
+{
+ struct selector *s = (struct selector *) ps->context;
+ struct path_info *pi = NULL;
+
+ if (!list_empty(&s->valid_paths)) {
+ pi = list_entry(s->valid_paths.next, struct path_info, list);
+ list_move_tail(&pi->list, &s->valid_paths);
+ *repeat_count = pi->repeat_count;
+ }
+
+ return pi ? pi->path : NULL;
+}
+
+static struct path_selector_type rr_ps = {
+ .name = "round-robin",
+ .module = THIS_MODULE,
+ .table_args = 1,
+ .info_args = 0,
+ .create = rr_create,
+ .destroy = rr_destroy,
+ .status = rr_status,
+ .add_path = rr_add_path,
+ .fail_path = rr_fail_path,
+ .reinstate_path = rr_reinstate_path,
+ .select_path = rr_select_path,
+};
+
+static int __init dm_rr_init(void)
+{
+ int r = dm_register_path_selector(&rr_ps);
+
+ if (r < 0)
+ DMERR("register failed %d", r);
+
+ DMINFO("version 1.0.0 loaded");
+
+ return r;
+}
+
+static void __exit dm_rr_exit(void)
+{
+ int r = dm_unregister_path_selector(&rr_ps);
+
+ if (r < 0)
+ DMERR("unregister failed %d", r);
+}
+
+module_init(dm_rr_init);
+module_exit(dm_rr_exit);
+
+MODULE_DESCRIPTION(DM_NAME " round-robin multipath path selector");
+MODULE_AUTHOR("Sistina Software <dm-devel@redhat.com>");
+MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm-service-time.c b/drivers/md/dm-service-time.c
new file mode 100644
index 000000000..9df8f6bd6
--- /dev/null
+++ b/drivers/md/dm-service-time.c
@@ -0,0 +1,343 @@
+/*
+ * Copyright (C) 2007-2009 NEC Corporation. All Rights Reserved.
+ *
+ * Module Author: Kiyoshi Ueda
+ *
+ * This file is released under the GPL.
+ *
+ * Throughput oriented path selector.
+ */
+
+#include "dm.h"
+#include "dm-path-selector.h"
+
+#include <linux/slab.h>
+#include <linux/module.h>
+
+#define DM_MSG_PREFIX "multipath service-time"
+#define ST_MIN_IO 1
+#define ST_MAX_RELATIVE_THROUGHPUT 100
+#define ST_MAX_RELATIVE_THROUGHPUT_SHIFT 7
+#define ST_MAX_INFLIGHT_SIZE ((size_t)-1 >> ST_MAX_RELATIVE_THROUGHPUT_SHIFT)
+#define ST_VERSION "0.2.0"
+
+struct selector {
+ struct list_head valid_paths;
+ struct list_head failed_paths;
+};
+
+struct path_info {
+ struct list_head list;
+ struct dm_path *path;
+ unsigned repeat_count;
+ unsigned relative_throughput;
+ atomic_t in_flight_size; /* Total size of in-flight I/Os */
+};
+
+static struct selector *alloc_selector(void)
+{
+ struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL);
+
+ if (s) {
+ INIT_LIST_HEAD(&s->valid_paths);
+ INIT_LIST_HEAD(&s->failed_paths);
+ }
+
+ return s;
+}
+
+static int st_create(struct path_selector *ps, unsigned argc, char **argv)
+{
+ struct selector *s = alloc_selector();
+
+ if (!s)
+ return -ENOMEM;
+
+ ps->context = s;
+ return 0;
+}
+
+static void free_paths(struct list_head *paths)
+{
+ struct path_info *pi, *next;
+
+ list_for_each_entry_safe(pi, next, paths, list) {
+ list_del(&pi->list);
+ kfree(pi);
+ }
+}
+
+static void st_destroy(struct path_selector *ps)
+{
+ struct selector *s = ps->context;
+
+ free_paths(&s->valid_paths);
+ free_paths(&s->failed_paths);
+ kfree(s);
+ ps->context = NULL;
+}
+
+static int st_status(struct path_selector *ps, struct dm_path *path,
+ status_type_t type, char *result, unsigned maxlen)
+{
+ unsigned sz = 0;
+ struct path_info *pi;
+
+ if (!path)
+ DMEMIT("0 ");
+ else {
+ pi = path->pscontext;
+
+ switch (type) {
+ case STATUSTYPE_INFO:
+ DMEMIT("%d %u ", atomic_read(&pi->in_flight_size),
+ pi->relative_throughput);
+ break;
+ case STATUSTYPE_TABLE:
+ DMEMIT("%u %u ", pi->repeat_count,
+ pi->relative_throughput);
+ break;
+ }
+ }
+
+ return sz;
+}
+
+static int st_add_path(struct path_selector *ps, struct dm_path *path,
+ int argc, char **argv, char **error)
+{
+ struct selector *s = ps->context;
+ struct path_info *pi;
+ unsigned repeat_count = ST_MIN_IO;
+ unsigned relative_throughput = 1;
+ char dummy;
+
+ /*
+ * Arguments: [<repeat_count> [<relative_throughput>]]
+ * <repeat_count>: The number of I/Os before switching path.
+ * If not given, default (ST_MIN_IO) is used.
+ * <relative_throughput>: The relative throughput value of
+ * the path among all paths in the path-group.
+ * The valid range: 0-<ST_MAX_RELATIVE_THROUGHPUT>
+ * If not given, minimum value '1' is used.
+ * If '0' is given, the path isn't selected while
+ * other paths having a positive value are
+ * available.
+ */
+ if (argc > 2) {
+ *error = "service-time ps: incorrect number of arguments";
+ return -EINVAL;
+ }
+
+ if (argc && (sscanf(argv[0], "%u%c", &repeat_count, &dummy) != 1)) {
+ *error = "service-time ps: invalid repeat count";
+ return -EINVAL;
+ }
+
+ if ((argc == 2) &&
+ (sscanf(argv[1], "%u%c", &relative_throughput, &dummy) != 1 ||
+ relative_throughput > ST_MAX_RELATIVE_THROUGHPUT)) {
+ *error = "service-time ps: invalid relative_throughput value";
+ return -EINVAL;
+ }
+
+ /* allocate the path */
+ pi = kmalloc(sizeof(*pi), GFP_KERNEL);
+ if (!pi) {
+ *error = "service-time ps: Error allocating path context";
+ return -ENOMEM;
+ }
+
+ pi->path = path;
+ pi->repeat_count = repeat_count;
+ pi->relative_throughput = relative_throughput;
+ atomic_set(&pi->in_flight_size, 0);
+
+ path->pscontext = pi;
+
+ list_add_tail(&pi->list, &s->valid_paths);
+
+ return 0;
+}
+
+static void st_fail_path(struct path_selector *ps, struct dm_path *path)
+{
+ struct selector *s = ps->context;
+ struct path_info *pi = path->pscontext;
+
+ list_move(&pi->list, &s->failed_paths);
+}
+
+static int st_reinstate_path(struct path_selector *ps, struct dm_path *path)
+{
+ struct selector *s = ps->context;
+ struct path_info *pi = path->pscontext;
+
+ list_move_tail(&pi->list, &s->valid_paths);
+
+ return 0;
+}
+
+/*
+ * Compare the estimated service time of 2 paths, pi1 and pi2,
+ * for the incoming I/O.
+ *
+ * Returns:
+ * < 0 : pi1 is better
+ * 0 : no difference between pi1 and pi2
+ * > 0 : pi2 is better
+ *
+ * Description:
+ * Basically, the service time is estimated by:
+ * ('pi->in-flight-size' + 'incoming') / 'pi->relative_throughput'
+ * To reduce the calculation, some optimizations are made.
+ * (See comments inline)
+ */
+static int st_compare_load(struct path_info *pi1, struct path_info *pi2,
+ size_t incoming)
+{
+ size_t sz1, sz2, st1, st2;
+
+ sz1 = atomic_read(&pi1->in_flight_size);
+ sz2 = atomic_read(&pi2->in_flight_size);
+
+ /*
+ * Case 1: Both have same throughput value. Choose less loaded path.
+ */
+ if (pi1->relative_throughput == pi2->relative_throughput)
+ return sz1 - sz2;
+
+ /*
+ * Case 2a: Both have same load. Choose higher throughput path.
+ * Case 2b: One path has no throughput value. Choose the other one.
+ */
+ if (sz1 == sz2 ||
+ !pi1->relative_throughput || !pi2->relative_throughput)
+ return pi2->relative_throughput - pi1->relative_throughput;
+
+ /*
+ * Case 3: Calculate service time. Choose faster path.
+ * Service time using pi1:
+ * st1 = (sz1 + incoming) / pi1->relative_throughput
+ * Service time using pi2:
+ * st2 = (sz2 + incoming) / pi2->relative_throughput
+ *
+ * To avoid the division, transform the expression to use
+ * multiplication.
+ * Because ->relative_throughput > 0 here, if st1 < st2,
+ * the expressions below are the same meaning:
+ * (sz1 + incoming) / pi1->relative_throughput <
+ * (sz2 + incoming) / pi2->relative_throughput
+ * (sz1 + incoming) * pi2->relative_throughput <
+ * (sz2 + incoming) * pi1->relative_throughput
+ * So use the later one.
+ */
+ sz1 += incoming;
+ sz2 += incoming;
+ if (unlikely(sz1 >= ST_MAX_INFLIGHT_SIZE ||
+ sz2 >= ST_MAX_INFLIGHT_SIZE)) {
+ /*
+ * Size may be too big for multiplying pi->relative_throughput
+ * and overflow.
+ * To avoid the overflow and mis-selection, shift down both.
+ */
+ sz1 >>= ST_MAX_RELATIVE_THROUGHPUT_SHIFT;
+ sz2 >>= ST_MAX_RELATIVE_THROUGHPUT_SHIFT;
+ }
+ st1 = sz1 * pi2->relative_throughput;
+ st2 = sz2 * pi1->relative_throughput;
+ if (st1 != st2)
+ return st1 - st2;
+
+ /*
+ * Case 4: Service time is equal. Choose higher throughput path.
+ */
+ return pi2->relative_throughput - pi1->relative_throughput;
+}
+
+static struct dm_path *st_select_path(struct path_selector *ps,
+ unsigned *repeat_count, size_t nr_bytes)
+{
+ struct selector *s = ps->context;
+ struct path_info *pi = NULL, *best = NULL;
+
+ if (list_empty(&s->valid_paths))
+ return NULL;
+
+ /* Change preferred (first in list) path to evenly balance. */
+ list_move_tail(s->valid_paths.next, &s->valid_paths);
+
+ list_for_each_entry(pi, &s->valid_paths, list)
+ if (!best || (st_compare_load(pi, best, nr_bytes) < 0))
+ best = pi;
+
+ if (!best)
+ return NULL;
+
+ *repeat_count = best->repeat_count;
+
+ return best->path;
+}
+
+static int st_start_io(struct path_selector *ps, struct dm_path *path,
+ size_t nr_bytes)
+{
+ struct path_info *pi = path->pscontext;
+
+ atomic_add(nr_bytes, &pi->in_flight_size);
+
+ return 0;
+}
+
+static int st_end_io(struct path_selector *ps, struct dm_path *path,
+ size_t nr_bytes)
+{
+ struct path_info *pi = path->pscontext;
+
+ atomic_sub(nr_bytes, &pi->in_flight_size);
+
+ return 0;
+}
+
+static struct path_selector_type st_ps = {
+ .name = "service-time",
+ .module = THIS_MODULE,
+ .table_args = 2,
+ .info_args = 2,
+ .create = st_create,
+ .destroy = st_destroy,
+ .status = st_status,
+ .add_path = st_add_path,
+ .fail_path = st_fail_path,
+ .reinstate_path = st_reinstate_path,
+ .select_path = st_select_path,
+ .start_io = st_start_io,
+ .end_io = st_end_io,
+};
+
+static int __init dm_st_init(void)
+{
+ int r = dm_register_path_selector(&st_ps);
+
+ if (r < 0)
+ DMERR("register failed %d", r);
+
+ DMINFO("version " ST_VERSION " loaded");
+
+ return r;
+}
+
+static void __exit dm_st_exit(void)
+{
+ int r = dm_unregister_path_selector(&st_ps);
+
+ if (r < 0)
+ DMERR("unregister failed %d", r);
+}
+
+module_init(dm_st_init);
+module_exit(dm_st_exit);
+
+MODULE_DESCRIPTION(DM_NAME " throughput oriented path selector");
+MODULE_AUTHOR("Kiyoshi Ueda <k-ueda@ct.jp.nec.com>");
+MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c
new file mode 100644
index 000000000..808b8419b
--- /dev/null
+++ b/drivers/md/dm-snap-persistent.c
@@ -0,0 +1,952 @@
+/*
+ * Copyright (C) 2001-2002 Sistina Software (UK) Limited.
+ * Copyright (C) 2006-2008 Red Hat GmbH
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm-exception-store.h"
+
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/vmalloc.h>
+#include <linux/export.h>
+#include <linux/slab.h>
+#include <linux/dm-io.h>
+#include "dm-bufio.h"
+
+#define DM_MSG_PREFIX "persistent snapshot"
+#define DM_CHUNK_SIZE_DEFAULT_SECTORS 32 /* 16KB */
+
+#define DM_PREFETCH_CHUNKS 12
+
+/*-----------------------------------------------------------------
+ * Persistent snapshots, by persistent we mean that the snapshot
+ * will survive a reboot.
+ *---------------------------------------------------------------*/
+
+/*
+ * We need to store a record of which parts of the origin have
+ * been copied to the snapshot device. The snapshot code
+ * requires that we copy exception chunks to chunk aligned areas
+ * of the COW store. It makes sense therefore, to store the
+ * metadata in chunk size blocks.
+ *
+ * There is no backward or forward compatibility implemented,
+ * snapshots with different disk versions than the kernel will
+ * not be usable. It is expected that "lvcreate" will blank out
+ * the start of a fresh COW device before calling the snapshot
+ * constructor.
+ *
+ * The first chunk of the COW device just contains the header.
+ * After this there is a chunk filled with exception metadata,
+ * followed by as many exception chunks as can fit in the
+ * metadata areas.
+ *
+ * All on disk structures are in little-endian format. The end
+ * of the exceptions info is indicated by an exception with a
+ * new_chunk of 0, which is invalid since it would point to the
+ * header chunk.
+ */
+
+/*
+ * Magic for persistent snapshots: "SnAp" - Feeble isn't it.
+ */
+#define SNAP_MAGIC 0x70416e53
+
+/*
+ * The on-disk version of the metadata.
+ */
+#define SNAPSHOT_DISK_VERSION 1
+
+#define NUM_SNAPSHOT_HDR_CHUNKS 1
+
+struct disk_header {
+ __le32 magic;
+
+ /*
+ * Is this snapshot valid. There is no way of recovering
+ * an invalid snapshot.
+ */
+ __le32 valid;
+
+ /*
+ * Simple, incrementing version. no backward
+ * compatibility.
+ */
+ __le32 version;
+
+ /* In sectors */
+ __le32 chunk_size;
+} __packed;
+
+struct disk_exception {
+ __le64 old_chunk;
+ __le64 new_chunk;
+} __packed;
+
+struct core_exception {
+ uint64_t old_chunk;
+ uint64_t new_chunk;
+};
+
+struct commit_callback {
+ void (*callback)(void *, int success);
+ void *context;
+};
+
+/*
+ * The top level structure for a persistent exception store.
+ */
+struct pstore {
+ struct dm_exception_store *store;
+ int version;
+ int valid;
+ uint32_t exceptions_per_area;
+
+ /*
+ * Now that we have an asynchronous kcopyd there is no
+ * need for large chunk sizes, so it wont hurt to have a
+ * whole chunks worth of metadata in memory at once.
+ */
+ void *area;
+
+ /*
+ * An area of zeros used to clear the next area.
+ */
+ void *zero_area;
+
+ /*
+ * An area used for header. The header can be written
+ * concurrently with metadata (when invalidating the snapshot),
+ * so it needs a separate buffer.
+ */
+ void *header_area;
+
+ /*
+ * Used to keep track of which metadata area the data in
+ * 'chunk' refers to.
+ */
+ chunk_t current_area;
+
+ /*
+ * The next free chunk for an exception.
+ *
+ * When creating exceptions, all the chunks here and above are
+ * free. It holds the next chunk to be allocated. On rare
+ * occasions (e.g. after a system crash) holes can be left in
+ * the exception store because chunks can be committed out of
+ * order.
+ *
+ * When merging exceptions, it does not necessarily mean all the
+ * chunks here and above are free. It holds the value it would
+ * have held if all chunks had been committed in order of
+ * allocation. Consequently the value may occasionally be
+ * slightly too low, but since it's only used for 'status' and
+ * it can never reach its minimum value too early this doesn't
+ * matter.
+ */
+
+ chunk_t next_free;
+
+ /*
+ * The index of next free exception in the current
+ * metadata area.
+ */
+ uint32_t current_committed;
+
+ atomic_t pending_count;
+ uint32_t callback_count;
+ struct commit_callback *callbacks;
+ struct dm_io_client *io_client;
+
+ struct workqueue_struct *metadata_wq;
+};
+
+static int alloc_area(struct pstore *ps)
+{
+ int r = -ENOMEM;
+ size_t len;
+
+ len = ps->store->chunk_size << SECTOR_SHIFT;
+
+ /*
+ * Allocate the chunk_size block of memory that will hold
+ * a single metadata area.
+ */
+ ps->area = vmalloc(len);
+ if (!ps->area)
+ goto err_area;
+
+ ps->zero_area = vzalloc(len);
+ if (!ps->zero_area)
+ goto err_zero_area;
+
+ ps->header_area = vmalloc(len);
+ if (!ps->header_area)
+ goto err_header_area;
+
+ return 0;
+
+err_header_area:
+ vfree(ps->zero_area);
+
+err_zero_area:
+ vfree(ps->area);
+
+err_area:
+ return r;
+}
+
+static void free_area(struct pstore *ps)
+{
+ vfree(ps->area);
+ ps->area = NULL;
+ vfree(ps->zero_area);
+ ps->zero_area = NULL;
+ vfree(ps->header_area);
+ ps->header_area = NULL;
+}
+
+struct mdata_req {
+ struct dm_io_region *where;
+ struct dm_io_request *io_req;
+ struct work_struct work;
+ int result;
+};
+
+static void do_metadata(struct work_struct *work)
+{
+ struct mdata_req *req = container_of(work, struct mdata_req, work);
+
+ req->result = dm_io(req->io_req, 1, req->where, NULL);
+}
+
+/*
+ * Read or write a chunk aligned and sized block of data from a device.
+ */
+static int chunk_io(struct pstore *ps, void *area, chunk_t chunk, int rw,
+ int metadata)
+{
+ struct dm_io_region where = {
+ .bdev = dm_snap_cow(ps->store->snap)->bdev,
+ .sector = ps->store->chunk_size * chunk,
+ .count = ps->store->chunk_size,
+ };
+ struct dm_io_request io_req = {
+ .bi_rw = rw,
+ .mem.type = DM_IO_VMA,
+ .mem.ptr.vma = area,
+ .client = ps->io_client,
+ .notify.fn = NULL,
+ };
+ struct mdata_req req;
+
+ if (!metadata)
+ return dm_io(&io_req, 1, &where, NULL);
+
+ req.where = &where;
+ req.io_req = &io_req;
+
+ /*
+ * Issue the synchronous I/O from a different thread
+ * to avoid generic_make_request recursion.
+ */
+ INIT_WORK_ONSTACK(&req.work, do_metadata);
+ queue_work(ps->metadata_wq, &req.work);
+ flush_workqueue(ps->metadata_wq);
+ destroy_work_on_stack(&req.work);
+
+ return req.result;
+}
+
+/*
+ * Convert a metadata area index to a chunk index.
+ */
+static chunk_t area_location(struct pstore *ps, chunk_t area)
+{
+ return NUM_SNAPSHOT_HDR_CHUNKS + ((ps->exceptions_per_area + 1) * area);
+}
+
+static void skip_metadata(struct pstore *ps)
+{
+ uint32_t stride = ps->exceptions_per_area + 1;
+ chunk_t next_free = ps->next_free;
+ if (sector_div(next_free, stride) == NUM_SNAPSHOT_HDR_CHUNKS)
+ ps->next_free++;
+}
+
+/*
+ * Read or write a metadata area. Remembering to skip the first
+ * chunk which holds the header.
+ */
+static int area_io(struct pstore *ps, int rw)
+{
+ int r;
+ chunk_t chunk;
+
+ chunk = area_location(ps, ps->current_area);
+
+ r = chunk_io(ps, ps->area, chunk, rw, 0);
+ if (r)
+ return r;
+
+ return 0;
+}
+
+static void zero_memory_area(struct pstore *ps)
+{
+ memset(ps->area, 0, ps->store->chunk_size << SECTOR_SHIFT);
+}
+
+static int zero_disk_area(struct pstore *ps, chunk_t area)
+{
+ return chunk_io(ps, ps->zero_area, area_location(ps, area), WRITE, 0);
+}
+
+static int read_header(struct pstore *ps, int *new_snapshot)
+{
+ int r;
+ struct disk_header *dh;
+ unsigned chunk_size;
+ int chunk_size_supplied = 1;
+ char *chunk_err;
+
+ /*
+ * Use default chunk size (or logical_block_size, if larger)
+ * if none supplied
+ */
+ if (!ps->store->chunk_size) {
+ ps->store->chunk_size = max(DM_CHUNK_SIZE_DEFAULT_SECTORS,
+ bdev_logical_block_size(dm_snap_cow(ps->store->snap)->
+ bdev) >> 9);
+ ps->store->chunk_mask = ps->store->chunk_size - 1;
+ ps->store->chunk_shift = ffs(ps->store->chunk_size) - 1;
+ chunk_size_supplied = 0;
+ }
+
+ ps->io_client = dm_io_client_create();
+ if (IS_ERR(ps->io_client))
+ return PTR_ERR(ps->io_client);
+
+ r = alloc_area(ps);
+ if (r)
+ return r;
+
+ r = chunk_io(ps, ps->header_area, 0, READ, 1);
+ if (r)
+ goto bad;
+
+ dh = ps->header_area;
+
+ if (le32_to_cpu(dh->magic) == 0) {
+ *new_snapshot = 1;
+ return 0;
+ }
+
+ if (le32_to_cpu(dh->magic) != SNAP_MAGIC) {
+ DMWARN("Invalid or corrupt snapshot");
+ r = -ENXIO;
+ goto bad;
+ }
+
+ *new_snapshot = 0;
+ ps->valid = le32_to_cpu(dh->valid);
+ ps->version = le32_to_cpu(dh->version);
+ chunk_size = le32_to_cpu(dh->chunk_size);
+
+ if (ps->store->chunk_size == chunk_size)
+ return 0;
+
+ if (chunk_size_supplied)
+ DMWARN("chunk size %u in device metadata overrides "
+ "table chunk size of %u.",
+ chunk_size, ps->store->chunk_size);
+
+ /* We had a bogus chunk_size. Fix stuff up. */
+ free_area(ps);
+
+ r = dm_exception_store_set_chunk_size(ps->store, chunk_size,
+ &chunk_err);
+ if (r) {
+ DMERR("invalid on-disk chunk size %u: %s.",
+ chunk_size, chunk_err);
+ return r;
+ }
+
+ r = alloc_area(ps);
+ return r;
+
+bad:
+ free_area(ps);
+ return r;
+}
+
+static int write_header(struct pstore *ps)
+{
+ struct disk_header *dh;
+
+ memset(ps->header_area, 0, ps->store->chunk_size << SECTOR_SHIFT);
+
+ dh = ps->header_area;
+ dh->magic = cpu_to_le32(SNAP_MAGIC);
+ dh->valid = cpu_to_le32(ps->valid);
+ dh->version = cpu_to_le32(ps->version);
+ dh->chunk_size = cpu_to_le32(ps->store->chunk_size);
+
+ return chunk_io(ps, ps->header_area, 0, WRITE, 1);
+}
+
+/*
+ * Access functions for the disk exceptions, these do the endian conversions.
+ */
+static struct disk_exception *get_exception(struct pstore *ps, void *ps_area,
+ uint32_t index)
+{
+ BUG_ON(index >= ps->exceptions_per_area);
+
+ return ((struct disk_exception *) ps_area) + index;
+}
+
+static void read_exception(struct pstore *ps, void *ps_area,
+ uint32_t index, struct core_exception *result)
+{
+ struct disk_exception *de = get_exception(ps, ps_area, index);
+
+ /* copy it */
+ result->old_chunk = le64_to_cpu(de->old_chunk);
+ result->new_chunk = le64_to_cpu(de->new_chunk);
+}
+
+static void write_exception(struct pstore *ps,
+ uint32_t index, struct core_exception *e)
+{
+ struct disk_exception *de = get_exception(ps, ps->area, index);
+
+ /* copy it */
+ de->old_chunk = cpu_to_le64(e->old_chunk);
+ de->new_chunk = cpu_to_le64(e->new_chunk);
+}
+
+static void clear_exception(struct pstore *ps, uint32_t index)
+{
+ struct disk_exception *de = get_exception(ps, ps->area, index);
+
+ /* clear it */
+ de->old_chunk = 0;
+ de->new_chunk = 0;
+}
+
+/*
+ * Registers the exceptions that are present in the current area.
+ * 'full' is filled in to indicate if the area has been
+ * filled.
+ */
+static int insert_exceptions(struct pstore *ps, void *ps_area,
+ int (*callback)(void *callback_context,
+ chunk_t old, chunk_t new),
+ void *callback_context,
+ int *full)
+{
+ int r;
+ unsigned int i;
+ struct core_exception e;
+
+ /* presume the area is full */
+ *full = 1;
+
+ for (i = 0; i < ps->exceptions_per_area; i++) {
+ read_exception(ps, ps_area, i, &e);
+
+ /*
+ * If the new_chunk is pointing at the start of
+ * the COW device, where the first metadata area
+ * is we know that we've hit the end of the
+ * exceptions. Therefore the area is not full.
+ */
+ if (e.new_chunk == 0LL) {
+ ps->current_committed = i;
+ *full = 0;
+ break;
+ }
+
+ /*
+ * Keep track of the start of the free chunks.
+ */
+ if (ps->next_free <= e.new_chunk)
+ ps->next_free = e.new_chunk + 1;
+
+ /*
+ * Otherwise we add the exception to the snapshot.
+ */
+ r = callback(callback_context, e.old_chunk, e.new_chunk);
+ if (r)
+ return r;
+ }
+
+ return 0;
+}
+
+static int read_exceptions(struct pstore *ps,
+ int (*callback)(void *callback_context, chunk_t old,
+ chunk_t new),
+ void *callback_context)
+{
+ int r, full = 1;
+ struct dm_bufio_client *client;
+ chunk_t prefetch_area = 0;
+
+ client = dm_bufio_client_create(dm_snap_cow(ps->store->snap)->bdev,
+ ps->store->chunk_size << SECTOR_SHIFT,
+ 1, 0, NULL, NULL);
+
+ if (IS_ERR(client))
+ return PTR_ERR(client);
+
+ /*
+ * Setup for one current buffer + desired readahead buffers.
+ */
+ dm_bufio_set_minimum_buffers(client, 1 + DM_PREFETCH_CHUNKS);
+
+ /*
+ * Keeping reading chunks and inserting exceptions until
+ * we find a partially full area.
+ */
+ for (ps->current_area = 0; full; ps->current_area++) {
+ struct dm_buffer *bp;
+ void *area;
+ chunk_t chunk;
+
+ if (unlikely(prefetch_area < ps->current_area))
+ prefetch_area = ps->current_area;
+
+ if (DM_PREFETCH_CHUNKS) do {
+ chunk_t pf_chunk = area_location(ps, prefetch_area);
+ if (unlikely(pf_chunk >= dm_bufio_get_device_size(client)))
+ break;
+ dm_bufio_prefetch(client, pf_chunk, 1);
+ prefetch_area++;
+ if (unlikely(!prefetch_area))
+ break;
+ } while (prefetch_area <= ps->current_area + DM_PREFETCH_CHUNKS);
+
+ chunk = area_location(ps, ps->current_area);
+
+ area = dm_bufio_read(client, chunk, &bp);
+ if (unlikely(IS_ERR(area))) {
+ r = PTR_ERR(area);
+ goto ret_destroy_bufio;
+ }
+
+ r = insert_exceptions(ps, area, callback, callback_context,
+ &full);
+
+ if (!full)
+ memcpy(ps->area, area, ps->store->chunk_size << SECTOR_SHIFT);
+
+ dm_bufio_release(bp);
+
+ dm_bufio_forget(client, chunk);
+
+ if (unlikely(r))
+ goto ret_destroy_bufio;
+ }
+
+ ps->current_area--;
+
+ skip_metadata(ps);
+
+ r = 0;
+
+ret_destroy_bufio:
+ dm_bufio_client_destroy(client);
+
+ return r;
+}
+
+static struct pstore *get_info(struct dm_exception_store *store)
+{
+ return (struct pstore *) store->context;
+}
+
+static void persistent_usage(struct dm_exception_store *store,
+ sector_t *total_sectors,
+ sector_t *sectors_allocated,
+ sector_t *metadata_sectors)
+{
+ struct pstore *ps = get_info(store);
+
+ *sectors_allocated = ps->next_free * store->chunk_size;
+ *total_sectors = get_dev_size(dm_snap_cow(store->snap)->bdev);
+
+ /*
+ * First chunk is the fixed header.
+ * Then there are (ps->current_area + 1) metadata chunks, each one
+ * separated from the next by ps->exceptions_per_area data chunks.
+ */
+ *metadata_sectors = (ps->current_area + 1 + NUM_SNAPSHOT_HDR_CHUNKS) *
+ store->chunk_size;
+}
+
+static void persistent_dtr(struct dm_exception_store *store)
+{
+ struct pstore *ps = get_info(store);
+
+ destroy_workqueue(ps->metadata_wq);
+
+ /* Created in read_header */
+ if (ps->io_client)
+ dm_io_client_destroy(ps->io_client);
+ free_area(ps);
+
+ /* Allocated in persistent_read_metadata */
+ vfree(ps->callbacks);
+
+ kfree(ps);
+}
+
+static int persistent_read_metadata(struct dm_exception_store *store,
+ int (*callback)(void *callback_context,
+ chunk_t old, chunk_t new),
+ void *callback_context)
+{
+ int r, uninitialized_var(new_snapshot);
+ struct pstore *ps = get_info(store);
+
+ /*
+ * Read the snapshot header.
+ */
+ r = read_header(ps, &new_snapshot);
+ if (r)
+ return r;
+
+ /*
+ * Now we know correct chunk_size, complete the initialisation.
+ */
+ ps->exceptions_per_area = (ps->store->chunk_size << SECTOR_SHIFT) /
+ sizeof(struct disk_exception);
+ ps->callbacks = dm_vcalloc(ps->exceptions_per_area,
+ sizeof(*ps->callbacks));
+ if (!ps->callbacks)
+ return -ENOMEM;
+
+ /*
+ * Do we need to setup a new snapshot ?
+ */
+ if (new_snapshot) {
+ r = write_header(ps);
+ if (r) {
+ DMWARN("write_header failed");
+ return r;
+ }
+
+ ps->current_area = 0;
+ zero_memory_area(ps);
+ r = zero_disk_area(ps, 0);
+ if (r)
+ DMWARN("zero_disk_area(0) failed");
+ return r;
+ }
+ /*
+ * Sanity checks.
+ */
+ if (ps->version != SNAPSHOT_DISK_VERSION) {
+ DMWARN("unable to handle snapshot disk version %d",
+ ps->version);
+ return -EINVAL;
+ }
+
+ /*
+ * Metadata are valid, but snapshot is invalidated
+ */
+ if (!ps->valid)
+ return 1;
+
+ /*
+ * Read the metadata.
+ */
+ r = read_exceptions(ps, callback, callback_context);
+
+ return r;
+}
+
+static int persistent_prepare_exception(struct dm_exception_store *store,
+ struct dm_exception *e)
+{
+ struct pstore *ps = get_info(store);
+ sector_t size = get_dev_size(dm_snap_cow(store->snap)->bdev);
+
+ /* Is there enough room ? */
+ if (size < ((ps->next_free + 1) * store->chunk_size))
+ return -ENOSPC;
+
+ e->new_chunk = ps->next_free;
+
+ /*
+ * Move onto the next free pending, making sure to take
+ * into account the location of the metadata chunks.
+ */
+ ps->next_free++;
+ skip_metadata(ps);
+
+ atomic_inc(&ps->pending_count);
+ return 0;
+}
+
+static void persistent_commit_exception(struct dm_exception_store *store,
+ struct dm_exception *e,
+ void (*callback) (void *, int success),
+ void *callback_context)
+{
+ unsigned int i;
+ struct pstore *ps = get_info(store);
+ struct core_exception ce;
+ struct commit_callback *cb;
+
+ ce.old_chunk = e->old_chunk;
+ ce.new_chunk = e->new_chunk;
+ write_exception(ps, ps->current_committed++, &ce);
+
+ /*
+ * Add the callback to the back of the array. This code
+ * is the only place where the callback array is
+ * manipulated, and we know that it will never be called
+ * multiple times concurrently.
+ */
+ cb = ps->callbacks + ps->callback_count++;
+ cb->callback = callback;
+ cb->context = callback_context;
+
+ /*
+ * If there are exceptions in flight and we have not yet
+ * filled this metadata area there's nothing more to do.
+ */
+ if (!atomic_dec_and_test(&ps->pending_count) &&
+ (ps->current_committed != ps->exceptions_per_area))
+ return;
+
+ /*
+ * If we completely filled the current area, then wipe the next one.
+ */
+ if ((ps->current_committed == ps->exceptions_per_area) &&
+ zero_disk_area(ps, ps->current_area + 1))
+ ps->valid = 0;
+
+ /*
+ * Commit exceptions to disk.
+ */
+ if (ps->valid && area_io(ps, WRITE_FLUSH_FUA))
+ ps->valid = 0;
+
+ /*
+ * Advance to the next area if this one is full.
+ */
+ if (ps->current_committed == ps->exceptions_per_area) {
+ ps->current_committed = 0;
+ ps->current_area++;
+ zero_memory_area(ps);
+ }
+
+ for (i = 0; i < ps->callback_count; i++) {
+ cb = ps->callbacks + i;
+ cb->callback(cb->context, ps->valid);
+ }
+
+ ps->callback_count = 0;
+}
+
+static int persistent_prepare_merge(struct dm_exception_store *store,
+ chunk_t *last_old_chunk,
+ chunk_t *last_new_chunk)
+{
+ struct pstore *ps = get_info(store);
+ struct core_exception ce;
+ int nr_consecutive;
+ int r;
+
+ /*
+ * When current area is empty, move back to preceding area.
+ */
+ if (!ps->current_committed) {
+ /*
+ * Have we finished?
+ */
+ if (!ps->current_area)
+ return 0;
+
+ ps->current_area--;
+ r = area_io(ps, READ);
+ if (r < 0)
+ return r;
+ ps->current_committed = ps->exceptions_per_area;
+ }
+
+ read_exception(ps, ps->area, ps->current_committed - 1, &ce);
+ *last_old_chunk = ce.old_chunk;
+ *last_new_chunk = ce.new_chunk;
+
+ /*
+ * Find number of consecutive chunks within the current area,
+ * working backwards.
+ */
+ for (nr_consecutive = 1; nr_consecutive < ps->current_committed;
+ nr_consecutive++) {
+ read_exception(ps, ps->area,
+ ps->current_committed - 1 - nr_consecutive, &ce);
+ if (ce.old_chunk != *last_old_chunk - nr_consecutive ||
+ ce.new_chunk != *last_new_chunk - nr_consecutive)
+ break;
+ }
+
+ return nr_consecutive;
+}
+
+static int persistent_commit_merge(struct dm_exception_store *store,
+ int nr_merged)
+{
+ int r, i;
+ struct pstore *ps = get_info(store);
+
+ BUG_ON(nr_merged > ps->current_committed);
+
+ for (i = 0; i < nr_merged; i++)
+ clear_exception(ps, ps->current_committed - 1 - i);
+
+ r = area_io(ps, WRITE_FLUSH_FUA);
+ if (r < 0)
+ return r;
+
+ ps->current_committed -= nr_merged;
+
+ /*
+ * At this stage, only persistent_usage() uses ps->next_free, so
+ * we make no attempt to keep ps->next_free strictly accurate
+ * as exceptions may have been committed out-of-order originally.
+ * Once a snapshot has become merging, we set it to the value it
+ * would have held had all the exceptions been committed in order.
+ *
+ * ps->current_area does not get reduced by prepare_merge() until
+ * after commit_merge() has removed the nr_merged previous exceptions.
+ */
+ ps->next_free = area_location(ps, ps->current_area) +
+ ps->current_committed + 1;
+
+ return 0;
+}
+
+static void persistent_drop_snapshot(struct dm_exception_store *store)
+{
+ struct pstore *ps = get_info(store);
+
+ ps->valid = 0;
+ if (write_header(ps))
+ DMWARN("write header failed");
+}
+
+static int persistent_ctr(struct dm_exception_store *store,
+ unsigned argc, char **argv)
+{
+ struct pstore *ps;
+
+ /* allocate the pstore */
+ ps = kzalloc(sizeof(*ps), GFP_KERNEL);
+ if (!ps)
+ return -ENOMEM;
+
+ ps->store = store;
+ ps->valid = 1;
+ ps->version = SNAPSHOT_DISK_VERSION;
+ ps->area = NULL;
+ ps->zero_area = NULL;
+ ps->header_area = NULL;
+ ps->next_free = NUM_SNAPSHOT_HDR_CHUNKS + 1; /* header and 1st area */
+ ps->current_committed = 0;
+
+ ps->callback_count = 0;
+ atomic_set(&ps->pending_count, 0);
+ ps->callbacks = NULL;
+
+ ps->metadata_wq = alloc_workqueue("ksnaphd", WQ_MEM_RECLAIM, 0);
+ if (!ps->metadata_wq) {
+ kfree(ps);
+ DMERR("couldn't start header metadata update thread");
+ return -ENOMEM;
+ }
+
+ store->context = ps;
+
+ return 0;
+}
+
+static unsigned persistent_status(struct dm_exception_store *store,
+ status_type_t status, char *result,
+ unsigned maxlen)
+{
+ unsigned sz = 0;
+
+ switch (status) {
+ case STATUSTYPE_INFO:
+ break;
+ case STATUSTYPE_TABLE:
+ DMEMIT(" P %llu", (unsigned long long)store->chunk_size);
+ }
+
+ return sz;
+}
+
+static struct dm_exception_store_type _persistent_type = {
+ .name = "persistent",
+ .module = THIS_MODULE,
+ .ctr = persistent_ctr,
+ .dtr = persistent_dtr,
+ .read_metadata = persistent_read_metadata,
+ .prepare_exception = persistent_prepare_exception,
+ .commit_exception = persistent_commit_exception,
+ .prepare_merge = persistent_prepare_merge,
+ .commit_merge = persistent_commit_merge,
+ .drop_snapshot = persistent_drop_snapshot,
+ .usage = persistent_usage,
+ .status = persistent_status,
+};
+
+static struct dm_exception_store_type _persistent_compat_type = {
+ .name = "P",
+ .module = THIS_MODULE,
+ .ctr = persistent_ctr,
+ .dtr = persistent_dtr,
+ .read_metadata = persistent_read_metadata,
+ .prepare_exception = persistent_prepare_exception,
+ .commit_exception = persistent_commit_exception,
+ .prepare_merge = persistent_prepare_merge,
+ .commit_merge = persistent_commit_merge,
+ .drop_snapshot = persistent_drop_snapshot,
+ .usage = persistent_usage,
+ .status = persistent_status,
+};
+
+int dm_persistent_snapshot_init(void)
+{
+ int r;
+
+ r = dm_exception_store_type_register(&_persistent_type);
+ if (r) {
+ DMERR("Unable to register persistent exception store type");
+ return r;
+ }
+
+ r = dm_exception_store_type_register(&_persistent_compat_type);
+ if (r) {
+ DMERR("Unable to register old-style persistent exception "
+ "store type");
+ dm_exception_store_type_unregister(&_persistent_type);
+ return r;
+ }
+
+ return r;
+}
+
+void dm_persistent_snapshot_exit(void)
+{
+ dm_exception_store_type_unregister(&_persistent_type);
+ dm_exception_store_type_unregister(&_persistent_compat_type);
+}
diff --git a/drivers/md/dm-snap-transient.c b/drivers/md/dm-snap-transient.c
new file mode 100644
index 000000000..1ce9a2586
--- /dev/null
+++ b/drivers/md/dm-snap-transient.c
@@ -0,0 +1,153 @@
+/*
+ * Copyright (C) 2001-2002 Sistina Software (UK) Limited.
+ * Copyright (C) 2006-2008 Red Hat GmbH
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm-exception-store.h"
+
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/vmalloc.h>
+#include <linux/export.h>
+#include <linux/slab.h>
+#include <linux/dm-io.h>
+
+#define DM_MSG_PREFIX "transient snapshot"
+
+/*-----------------------------------------------------------------
+ * Implementation of the store for non-persistent snapshots.
+ *---------------------------------------------------------------*/
+struct transient_c {
+ sector_t next_free;
+};
+
+static void transient_dtr(struct dm_exception_store *store)
+{
+ kfree(store->context);
+}
+
+static int transient_read_metadata(struct dm_exception_store *store,
+ int (*callback)(void *callback_context,
+ chunk_t old, chunk_t new),
+ void *callback_context)
+{
+ return 0;
+}
+
+static int transient_prepare_exception(struct dm_exception_store *store,
+ struct dm_exception *e)
+{
+ struct transient_c *tc = store->context;
+ sector_t size = get_dev_size(dm_snap_cow(store->snap)->bdev);
+
+ if (size < (tc->next_free + store->chunk_size))
+ return -1;
+
+ e->new_chunk = sector_to_chunk(store, tc->next_free);
+ tc->next_free += store->chunk_size;
+
+ return 0;
+}
+
+static void transient_commit_exception(struct dm_exception_store *store,
+ struct dm_exception *e,
+ void (*callback) (void *, int success),
+ void *callback_context)
+{
+ /* Just succeed */
+ callback(callback_context, 1);
+}
+
+static void transient_usage(struct dm_exception_store *store,
+ sector_t *total_sectors,
+ sector_t *sectors_allocated,
+ sector_t *metadata_sectors)
+{
+ *sectors_allocated = ((struct transient_c *) store->context)->next_free;
+ *total_sectors = get_dev_size(dm_snap_cow(store->snap)->bdev);
+ *metadata_sectors = 0;
+}
+
+static int transient_ctr(struct dm_exception_store *store,
+ unsigned argc, char **argv)
+{
+ struct transient_c *tc;
+
+ tc = kmalloc(sizeof(struct transient_c), GFP_KERNEL);
+ if (!tc)
+ return -ENOMEM;
+
+ tc->next_free = 0;
+ store->context = tc;
+
+ return 0;
+}
+
+static unsigned transient_status(struct dm_exception_store *store,
+ status_type_t status, char *result,
+ unsigned maxlen)
+{
+ unsigned sz = 0;
+
+ switch (status) {
+ case STATUSTYPE_INFO:
+ break;
+ case STATUSTYPE_TABLE:
+ DMEMIT(" N %llu", (unsigned long long)store->chunk_size);
+ }
+
+ return sz;
+}
+
+static struct dm_exception_store_type _transient_type = {
+ .name = "transient",
+ .module = THIS_MODULE,
+ .ctr = transient_ctr,
+ .dtr = transient_dtr,
+ .read_metadata = transient_read_metadata,
+ .prepare_exception = transient_prepare_exception,
+ .commit_exception = transient_commit_exception,
+ .usage = transient_usage,
+ .status = transient_status,
+};
+
+static struct dm_exception_store_type _transient_compat_type = {
+ .name = "N",
+ .module = THIS_MODULE,
+ .ctr = transient_ctr,
+ .dtr = transient_dtr,
+ .read_metadata = transient_read_metadata,
+ .prepare_exception = transient_prepare_exception,
+ .commit_exception = transient_commit_exception,
+ .usage = transient_usage,
+ .status = transient_status,
+};
+
+int dm_transient_snapshot_init(void)
+{
+ int r;
+
+ r = dm_exception_store_type_register(&_transient_type);
+ if (r) {
+ DMWARN("Unable to register transient exception store type");
+ return r;
+ }
+
+ r = dm_exception_store_type_register(&_transient_compat_type);
+ if (r) {
+ DMWARN("Unable to register old-style transient "
+ "exception store type");
+ dm_exception_store_type_unregister(&_transient_type);
+ return r;
+ }
+
+ return r;
+}
+
+void dm_transient_snapshot_exit(void)
+{
+ dm_exception_store_type_unregister(&_transient_type);
+ dm_exception_store_type_unregister(&_transient_compat_type);
+}
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
new file mode 100644
index 000000000..f83a0f3fc
--- /dev/null
+++ b/drivers/md/dm-snap.c
@@ -0,0 +1,2486 @@
+/*
+ * dm-snapshot.c
+ *
+ * Copyright (C) 2001-2002 Sistina Software (UK) Limited.
+ *
+ * This file is released under the GPL.
+ */
+
+#include <linux/blkdev.h>
+#include <linux/device-mapper.h>
+#include <linux/delay.h>
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/kdev_t.h>
+#include <linux/list.h>
+#include <linux/mempool.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/log2.h>
+#include <linux/dm-kcopyd.h>
+
+#include "dm.h"
+
+#include "dm-exception-store.h"
+
+#define DM_MSG_PREFIX "snapshots"
+
+static const char dm_snapshot_merge_target_name[] = "snapshot-merge";
+
+#define dm_target_is_snapshot_merge(ti) \
+ ((ti)->type->name == dm_snapshot_merge_target_name)
+
+/*
+ * The size of the mempool used to track chunks in use.
+ */
+#define MIN_IOS 256
+
+#define DM_TRACKED_CHUNK_HASH_SIZE 16
+#define DM_TRACKED_CHUNK_HASH(x) ((unsigned long)(x) & \
+ (DM_TRACKED_CHUNK_HASH_SIZE - 1))
+
+struct dm_exception_table {
+ uint32_t hash_mask;
+ unsigned hash_shift;
+ struct list_head *table;
+};
+
+struct dm_snapshot {
+ struct rw_semaphore lock;
+
+ struct dm_dev *origin;
+ struct dm_dev *cow;
+
+ struct dm_target *ti;
+
+ /* List of snapshots per Origin */
+ struct list_head list;
+
+ /*
+ * You can't use a snapshot if this is 0 (e.g. if full).
+ * A snapshot-merge target never clears this.
+ */
+ int valid;
+
+ /* Origin writes don't trigger exceptions until this is set */
+ int active;
+
+ atomic_t pending_exceptions_count;
+
+ /* Protected by "lock" */
+ sector_t exception_start_sequence;
+
+ /* Protected by kcopyd single-threaded callback */
+ sector_t exception_complete_sequence;
+
+ /*
+ * A list of pending exceptions that completed out of order.
+ * Protected by kcopyd single-threaded callback.
+ */
+ struct list_head out_of_order_list;
+
+ mempool_t *pending_pool;
+
+ struct dm_exception_table pending;
+ struct dm_exception_table complete;
+
+ /*
+ * pe_lock protects all pending_exception operations and access
+ * as well as the snapshot_bios list.
+ */
+ spinlock_t pe_lock;
+
+ /* Chunks with outstanding reads */
+ spinlock_t tracked_chunk_lock;
+ struct hlist_head tracked_chunk_hash[DM_TRACKED_CHUNK_HASH_SIZE];
+
+ /* The on disk metadata handler */
+ struct dm_exception_store *store;
+
+ struct dm_kcopyd_client *kcopyd_client;
+
+ /* Wait for events based on state_bits */
+ unsigned long state_bits;
+
+ /* Range of chunks currently being merged. */
+ chunk_t first_merging_chunk;
+ int num_merging_chunks;
+
+ /*
+ * The merge operation failed if this flag is set.
+ * Failure modes are handled as follows:
+ * - I/O error reading the header
+ * => don't load the target; abort.
+ * - Header does not have "valid" flag set
+ * => use the origin; forget about the snapshot.
+ * - I/O error when reading exceptions
+ * => don't load the target; abort.
+ * (We can't use the intermediate origin state.)
+ * - I/O error while merging
+ * => stop merging; set merge_failed; process I/O normally.
+ */
+ int merge_failed;
+
+ /*
+ * Incoming bios that overlap with chunks being merged must wait
+ * for them to be committed.
+ */
+ struct bio_list bios_queued_during_merge;
+};
+
+/*
+ * state_bits:
+ * RUNNING_MERGE - Merge operation is in progress.
+ * SHUTDOWN_MERGE - Set to signal that merge needs to be stopped;
+ * cleared afterwards.
+ */
+#define RUNNING_MERGE 0
+#define SHUTDOWN_MERGE 1
+
+DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(snapshot_copy_throttle,
+ "A percentage of time allocated for copy on write");
+
+struct dm_dev *dm_snap_origin(struct dm_snapshot *s)
+{
+ return s->origin;
+}
+EXPORT_SYMBOL(dm_snap_origin);
+
+struct dm_dev *dm_snap_cow(struct dm_snapshot *s)
+{
+ return s->cow;
+}
+EXPORT_SYMBOL(dm_snap_cow);
+
+static sector_t chunk_to_sector(struct dm_exception_store *store,
+ chunk_t chunk)
+{
+ return chunk << store->chunk_shift;
+}
+
+static int bdev_equal(struct block_device *lhs, struct block_device *rhs)
+{
+ /*
+ * There is only ever one instance of a particular block
+ * device so we can compare pointers safely.
+ */
+ return lhs == rhs;
+}
+
+struct dm_snap_pending_exception {
+ struct dm_exception e;
+
+ /*
+ * Origin buffers waiting for this to complete are held
+ * in a bio list
+ */
+ struct bio_list origin_bios;
+ struct bio_list snapshot_bios;
+
+ /* Pointer back to snapshot context */
+ struct dm_snapshot *snap;
+
+ /*
+ * 1 indicates the exception has already been sent to
+ * kcopyd.
+ */
+ int started;
+
+ /* There was copying error. */
+ int copy_error;
+
+ /* A sequence number, it is used for in-order completion. */
+ sector_t exception_sequence;
+
+ struct list_head out_of_order_entry;
+
+ /*
+ * For writing a complete chunk, bypassing the copy.
+ */
+ struct bio *full_bio;
+ bio_end_io_t *full_bio_end_io;
+ void *full_bio_private;
+};
+
+/*
+ * Hash table mapping origin volumes to lists of snapshots and
+ * a lock to protect it
+ */
+static struct kmem_cache *exception_cache;
+static struct kmem_cache *pending_cache;
+
+struct dm_snap_tracked_chunk {
+ struct hlist_node node;
+ chunk_t chunk;
+};
+
+static void init_tracked_chunk(struct bio *bio)
+{
+ struct dm_snap_tracked_chunk *c = dm_per_bio_data(bio, sizeof(struct dm_snap_tracked_chunk));
+ INIT_HLIST_NODE(&c->node);
+}
+
+static bool is_bio_tracked(struct bio *bio)
+{
+ struct dm_snap_tracked_chunk *c = dm_per_bio_data(bio, sizeof(struct dm_snap_tracked_chunk));
+ return !hlist_unhashed(&c->node);
+}
+
+static void track_chunk(struct dm_snapshot *s, struct bio *bio, chunk_t chunk)
+{
+ struct dm_snap_tracked_chunk *c = dm_per_bio_data(bio, sizeof(struct dm_snap_tracked_chunk));
+
+ c->chunk = chunk;
+
+ spin_lock_irq(&s->tracked_chunk_lock);
+ hlist_add_head(&c->node,
+ &s->tracked_chunk_hash[DM_TRACKED_CHUNK_HASH(chunk)]);
+ spin_unlock_irq(&s->tracked_chunk_lock);
+}
+
+static void stop_tracking_chunk(struct dm_snapshot *s, struct bio *bio)
+{
+ struct dm_snap_tracked_chunk *c = dm_per_bio_data(bio, sizeof(struct dm_snap_tracked_chunk));
+ unsigned long flags;
+
+ spin_lock_irqsave(&s->tracked_chunk_lock, flags);
+ hlist_del(&c->node);
+ spin_unlock_irqrestore(&s->tracked_chunk_lock, flags);
+}
+
+static int __chunk_is_tracked(struct dm_snapshot *s, chunk_t chunk)
+{
+ struct dm_snap_tracked_chunk *c;
+ int found = 0;
+
+ spin_lock_irq(&s->tracked_chunk_lock);
+
+ hlist_for_each_entry(c,
+ &s->tracked_chunk_hash[DM_TRACKED_CHUNK_HASH(chunk)], node) {
+ if (c->chunk == chunk) {
+ found = 1;
+ break;
+ }
+ }
+
+ spin_unlock_irq(&s->tracked_chunk_lock);
+
+ return found;
+}
+
+/*
+ * This conflicting I/O is extremely improbable in the caller,
+ * so msleep(1) is sufficient and there is no need for a wait queue.
+ */
+static void __check_for_conflicting_io(struct dm_snapshot *s, chunk_t chunk)
+{
+ while (__chunk_is_tracked(s, chunk))
+ msleep(1);
+}
+
+/*
+ * One of these per registered origin, held in the snapshot_origins hash
+ */
+struct origin {
+ /* The origin device */
+ struct block_device *bdev;
+
+ struct list_head hash_list;
+
+ /* List of snapshots for this origin */
+ struct list_head snapshots;
+};
+
+/*
+ * This structure is allocated for each origin target
+ */
+struct dm_origin {
+ struct dm_dev *dev;
+ struct dm_target *ti;
+ unsigned split_boundary;
+ struct list_head hash_list;
+};
+
+/*
+ * Size of the hash table for origin volumes. If we make this
+ * the size of the minors list then it should be nearly perfect
+ */
+#define ORIGIN_HASH_SIZE 256
+#define ORIGIN_MASK 0xFF
+static struct list_head *_origins;
+static struct list_head *_dm_origins;
+static struct rw_semaphore _origins_lock;
+
+static DECLARE_WAIT_QUEUE_HEAD(_pending_exceptions_done);
+static DEFINE_SPINLOCK(_pending_exceptions_done_spinlock);
+static uint64_t _pending_exceptions_done_count;
+
+static int init_origin_hash(void)
+{
+ int i;
+
+ _origins = kmalloc(ORIGIN_HASH_SIZE * sizeof(struct list_head),
+ GFP_KERNEL);
+ if (!_origins) {
+ DMERR("unable to allocate memory for _origins");
+ return -ENOMEM;
+ }
+ for (i = 0; i < ORIGIN_HASH_SIZE; i++)
+ INIT_LIST_HEAD(_origins + i);
+
+ _dm_origins = kmalloc(ORIGIN_HASH_SIZE * sizeof(struct list_head),
+ GFP_KERNEL);
+ if (!_dm_origins) {
+ DMERR("unable to allocate memory for _dm_origins");
+ kfree(_origins);
+ return -ENOMEM;
+ }
+ for (i = 0; i < ORIGIN_HASH_SIZE; i++)
+ INIT_LIST_HEAD(_dm_origins + i);
+
+ init_rwsem(&_origins_lock);
+
+ return 0;
+}
+
+static void exit_origin_hash(void)
+{
+ kfree(_origins);
+ kfree(_dm_origins);
+}
+
+static unsigned origin_hash(struct block_device *bdev)
+{
+ return bdev->bd_dev & ORIGIN_MASK;
+}
+
+static struct origin *__lookup_origin(struct block_device *origin)
+{
+ struct list_head *ol;
+ struct origin *o;
+
+ ol = &_origins[origin_hash(origin)];
+ list_for_each_entry (o, ol, hash_list)
+ if (bdev_equal(o->bdev, origin))
+ return o;
+
+ return NULL;
+}
+
+static void __insert_origin(struct origin *o)
+{
+ struct list_head *sl = &_origins[origin_hash(o->bdev)];
+ list_add_tail(&o->hash_list, sl);
+}
+
+static struct dm_origin *__lookup_dm_origin(struct block_device *origin)
+{
+ struct list_head *ol;
+ struct dm_origin *o;
+
+ ol = &_dm_origins[origin_hash(origin)];
+ list_for_each_entry (o, ol, hash_list)
+ if (bdev_equal(o->dev->bdev, origin))
+ return o;
+
+ return NULL;
+}
+
+static void __insert_dm_origin(struct dm_origin *o)
+{
+ struct list_head *sl = &_dm_origins[origin_hash(o->dev->bdev)];
+ list_add_tail(&o->hash_list, sl);
+}
+
+static void __remove_dm_origin(struct dm_origin *o)
+{
+ list_del(&o->hash_list);
+}
+
+/*
+ * _origins_lock must be held when calling this function.
+ * Returns number of snapshots registered using the supplied cow device, plus:
+ * snap_src - a snapshot suitable for use as a source of exception handover
+ * snap_dest - a snapshot capable of receiving exception handover.
+ * snap_merge - an existing snapshot-merge target linked to the same origin.
+ * There can be at most one snapshot-merge target. The parameter is optional.
+ *
+ * Possible return values and states of snap_src and snap_dest.
+ * 0: NULL, NULL - first new snapshot
+ * 1: snap_src, NULL - normal snapshot
+ * 2: snap_src, snap_dest - waiting for handover
+ * 2: snap_src, NULL - handed over, waiting for old to be deleted
+ * 1: NULL, snap_dest - source got destroyed without handover
+ */
+static int __find_snapshots_sharing_cow(struct dm_snapshot *snap,
+ struct dm_snapshot **snap_src,
+ struct dm_snapshot **snap_dest,
+ struct dm_snapshot **snap_merge)
+{
+ struct dm_snapshot *s;
+ struct origin *o;
+ int count = 0;
+ int active;
+
+ o = __lookup_origin(snap->origin->bdev);
+ if (!o)
+ goto out;
+
+ list_for_each_entry(s, &o->snapshots, list) {
+ if (dm_target_is_snapshot_merge(s->ti) && snap_merge)
+ *snap_merge = s;
+ if (!bdev_equal(s->cow->bdev, snap->cow->bdev))
+ continue;
+
+ down_read(&s->lock);
+ active = s->active;
+ up_read(&s->lock);
+
+ if (active) {
+ if (snap_src)
+ *snap_src = s;
+ } else if (snap_dest)
+ *snap_dest = s;
+
+ count++;
+ }
+
+out:
+ return count;
+}
+
+/*
+ * On success, returns 1 if this snapshot is a handover destination,
+ * otherwise returns 0.
+ */
+static int __validate_exception_handover(struct dm_snapshot *snap)
+{
+ struct dm_snapshot *snap_src = NULL, *snap_dest = NULL;
+ struct dm_snapshot *snap_merge = NULL;
+
+ /* Does snapshot need exceptions handed over to it? */
+ if ((__find_snapshots_sharing_cow(snap, &snap_src, &snap_dest,
+ &snap_merge) == 2) ||
+ snap_dest) {
+ snap->ti->error = "Snapshot cow pairing for exception "
+ "table handover failed";
+ return -EINVAL;
+ }
+
+ /*
+ * If no snap_src was found, snap cannot become a handover
+ * destination.
+ */
+ if (!snap_src)
+ return 0;
+
+ /*
+ * Non-snapshot-merge handover?
+ */
+ if (!dm_target_is_snapshot_merge(snap->ti))
+ return 1;
+
+ /*
+ * Do not allow more than one merging snapshot.
+ */
+ if (snap_merge) {
+ snap->ti->error = "A snapshot is already merging.";
+ return -EINVAL;
+ }
+
+ if (!snap_src->store->type->prepare_merge ||
+ !snap_src->store->type->commit_merge) {
+ snap->ti->error = "Snapshot exception store does not "
+ "support snapshot-merge.";
+ return -EINVAL;
+ }
+
+ return 1;
+}
+
+static void __insert_snapshot(struct origin *o, struct dm_snapshot *s)
+{
+ struct dm_snapshot *l;
+
+ /* Sort the list according to chunk size, largest-first smallest-last */
+ list_for_each_entry(l, &o->snapshots, list)
+ if (l->store->chunk_size < s->store->chunk_size)
+ break;
+ list_add_tail(&s->list, &l->list);
+}
+
+/*
+ * Make a note of the snapshot and its origin so we can look it
+ * up when the origin has a write on it.
+ *
+ * Also validate snapshot exception store handovers.
+ * On success, returns 1 if this registration is a handover destination,
+ * otherwise returns 0.
+ */
+static int register_snapshot(struct dm_snapshot *snap)
+{
+ struct origin *o, *new_o = NULL;
+ struct block_device *bdev = snap->origin->bdev;
+ int r = 0;
+
+ new_o = kmalloc(sizeof(*new_o), GFP_KERNEL);
+ if (!new_o)
+ return -ENOMEM;
+
+ down_write(&_origins_lock);
+
+ r = __validate_exception_handover(snap);
+ if (r < 0) {
+ kfree(new_o);
+ goto out;
+ }
+
+ o = __lookup_origin(bdev);
+ if (o)
+ kfree(new_o);
+ else {
+ /* New origin */
+ o = new_o;
+
+ /* Initialise the struct */
+ INIT_LIST_HEAD(&o->snapshots);
+ o->bdev = bdev;
+
+ __insert_origin(o);
+ }
+
+ __insert_snapshot(o, snap);
+
+out:
+ up_write(&_origins_lock);
+
+ return r;
+}
+
+/*
+ * Move snapshot to correct place in list according to chunk size.
+ */
+static void reregister_snapshot(struct dm_snapshot *s)
+{
+ struct block_device *bdev = s->origin->bdev;
+
+ down_write(&_origins_lock);
+
+ list_del(&s->list);
+ __insert_snapshot(__lookup_origin(bdev), s);
+
+ up_write(&_origins_lock);
+}
+
+static void unregister_snapshot(struct dm_snapshot *s)
+{
+ struct origin *o;
+
+ down_write(&_origins_lock);
+ o = __lookup_origin(s->origin->bdev);
+
+ list_del(&s->list);
+ if (o && list_empty(&o->snapshots)) {
+ list_del(&o->hash_list);
+ kfree(o);
+ }
+
+ up_write(&_origins_lock);
+}
+
+/*
+ * Implementation of the exception hash tables.
+ * The lowest hash_shift bits of the chunk number are ignored, allowing
+ * some consecutive chunks to be grouped together.
+ */
+static int dm_exception_table_init(struct dm_exception_table *et,
+ uint32_t size, unsigned hash_shift)
+{
+ unsigned int i;
+
+ et->hash_shift = hash_shift;
+ et->hash_mask = size - 1;
+ et->table = dm_vcalloc(size, sizeof(struct list_head));
+ if (!et->table)
+ return -ENOMEM;
+
+ for (i = 0; i < size; i++)
+ INIT_LIST_HEAD(et->table + i);
+
+ return 0;
+}
+
+static void dm_exception_table_exit(struct dm_exception_table *et,
+ struct kmem_cache *mem)
+{
+ struct list_head *slot;
+ struct dm_exception *ex, *next;
+ int i, size;
+
+ size = et->hash_mask + 1;
+ for (i = 0; i < size; i++) {
+ slot = et->table + i;
+
+ list_for_each_entry_safe (ex, next, slot, hash_list)
+ kmem_cache_free(mem, ex);
+ }
+
+ vfree(et->table);
+}
+
+static uint32_t exception_hash(struct dm_exception_table *et, chunk_t chunk)
+{
+ return (chunk >> et->hash_shift) & et->hash_mask;
+}
+
+static void dm_remove_exception(struct dm_exception *e)
+{
+ list_del(&e->hash_list);
+}
+
+/*
+ * Return the exception data for a sector, or NULL if not
+ * remapped.
+ */
+static struct dm_exception *dm_lookup_exception(struct dm_exception_table *et,
+ chunk_t chunk)
+{
+ struct list_head *slot;
+ struct dm_exception *e;
+
+ slot = &et->table[exception_hash(et, chunk)];
+ list_for_each_entry (e, slot, hash_list)
+ if (chunk >= e->old_chunk &&
+ chunk <= e->old_chunk + dm_consecutive_chunk_count(e))
+ return e;
+
+ return NULL;
+}
+
+static struct dm_exception *alloc_completed_exception(gfp_t gfp)
+{
+ struct dm_exception *e;
+
+ e = kmem_cache_alloc(exception_cache, gfp);
+ if (!e && gfp == GFP_NOIO)
+ e = kmem_cache_alloc(exception_cache, GFP_ATOMIC);
+
+ return e;
+}
+
+static void free_completed_exception(struct dm_exception *e)
+{
+ kmem_cache_free(exception_cache, e);
+}
+
+static struct dm_snap_pending_exception *alloc_pending_exception(struct dm_snapshot *s)
+{
+ struct dm_snap_pending_exception *pe = mempool_alloc(s->pending_pool,
+ GFP_NOIO);
+
+ atomic_inc(&s->pending_exceptions_count);
+ pe->snap = s;
+
+ return pe;
+}
+
+static void free_pending_exception(struct dm_snap_pending_exception *pe)
+{
+ struct dm_snapshot *s = pe->snap;
+
+ mempool_free(pe, s->pending_pool);
+ smp_mb__before_atomic();
+ atomic_dec(&s->pending_exceptions_count);
+}
+
+static void dm_insert_exception(struct dm_exception_table *eh,
+ struct dm_exception *new_e)
+{
+ struct list_head *l;
+ struct dm_exception *e = NULL;
+
+ l = &eh->table[exception_hash(eh, new_e->old_chunk)];
+
+ /* Add immediately if this table doesn't support consecutive chunks */
+ if (!eh->hash_shift)
+ goto out;
+
+ /* List is ordered by old_chunk */
+ list_for_each_entry_reverse(e, l, hash_list) {
+ /* Insert after an existing chunk? */
+ if (new_e->old_chunk == (e->old_chunk +
+ dm_consecutive_chunk_count(e) + 1) &&
+ new_e->new_chunk == (dm_chunk_number(e->new_chunk) +
+ dm_consecutive_chunk_count(e) + 1)) {
+ dm_consecutive_chunk_count_inc(e);
+ free_completed_exception(new_e);
+ return;
+ }
+
+ /* Insert before an existing chunk? */
+ if (new_e->old_chunk == (e->old_chunk - 1) &&
+ new_e->new_chunk == (dm_chunk_number(e->new_chunk) - 1)) {
+ dm_consecutive_chunk_count_inc(e);
+ e->old_chunk--;
+ e->new_chunk--;
+ free_completed_exception(new_e);
+ return;
+ }
+
+ if (new_e->old_chunk > e->old_chunk)
+ break;
+ }
+
+out:
+ list_add(&new_e->hash_list, e ? &e->hash_list : l);
+}
+
+/*
+ * Callback used by the exception stores to load exceptions when
+ * initialising.
+ */
+static int dm_add_exception(void *context, chunk_t old, chunk_t new)
+{
+ struct dm_snapshot *s = context;
+ struct dm_exception *e;
+
+ e = alloc_completed_exception(GFP_KERNEL);
+ if (!e)
+ return -ENOMEM;
+
+ e->old_chunk = old;
+
+ /* Consecutive_count is implicitly initialised to zero */
+ e->new_chunk = new;
+
+ dm_insert_exception(&s->complete, e);
+
+ return 0;
+}
+
+/*
+ * Return a minimum chunk size of all snapshots that have the specified origin.
+ * Return zero if the origin has no snapshots.
+ */
+static uint32_t __minimum_chunk_size(struct origin *o)
+{
+ struct dm_snapshot *snap;
+ unsigned chunk_size = 0;
+
+ if (o)
+ list_for_each_entry(snap, &o->snapshots, list)
+ chunk_size = min_not_zero(chunk_size,
+ snap->store->chunk_size);
+
+ return (uint32_t) chunk_size;
+}
+
+/*
+ * Hard coded magic.
+ */
+static int calc_max_buckets(void)
+{
+ /* use a fixed size of 2MB */
+ unsigned long mem = 2 * 1024 * 1024;
+ mem /= sizeof(struct list_head);
+
+ return mem;
+}
+
+/*
+ * Allocate room for a suitable hash table.
+ */
+static int init_hash_tables(struct dm_snapshot *s)
+{
+ sector_t hash_size, cow_dev_size, max_buckets;
+
+ /*
+ * Calculate based on the size of the original volume or
+ * the COW volume...
+ */
+ cow_dev_size = get_dev_size(s->cow->bdev);
+ max_buckets = calc_max_buckets();
+
+ hash_size = cow_dev_size >> s->store->chunk_shift;
+ hash_size = min(hash_size, max_buckets);
+
+ if (hash_size < 64)
+ hash_size = 64;
+ hash_size = rounddown_pow_of_two(hash_size);
+ if (dm_exception_table_init(&s->complete, hash_size,
+ DM_CHUNK_CONSECUTIVE_BITS))
+ return -ENOMEM;
+
+ /*
+ * Allocate hash table for in-flight exceptions
+ * Make this smaller than the real hash table
+ */
+ hash_size >>= 3;
+ if (hash_size < 64)
+ hash_size = 64;
+
+ if (dm_exception_table_init(&s->pending, hash_size, 0)) {
+ dm_exception_table_exit(&s->complete, exception_cache);
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static void merge_shutdown(struct dm_snapshot *s)
+{
+ clear_bit_unlock(RUNNING_MERGE, &s->state_bits);
+ smp_mb__after_atomic();
+ wake_up_bit(&s->state_bits, RUNNING_MERGE);
+}
+
+static struct bio *__release_queued_bios_after_merge(struct dm_snapshot *s)
+{
+ s->first_merging_chunk = 0;
+ s->num_merging_chunks = 0;
+
+ return bio_list_get(&s->bios_queued_during_merge);
+}
+
+/*
+ * Remove one chunk from the index of completed exceptions.
+ */
+static int __remove_single_exception_chunk(struct dm_snapshot *s,
+ chunk_t old_chunk)
+{
+ struct dm_exception *e;
+
+ e = dm_lookup_exception(&s->complete, old_chunk);
+ if (!e) {
+ DMERR("Corruption detected: exception for block %llu is "
+ "on disk but not in memory",
+ (unsigned long long)old_chunk);
+ return -EINVAL;
+ }
+
+ /*
+ * If this is the only chunk using this exception, remove exception.
+ */
+ if (!dm_consecutive_chunk_count(e)) {
+ dm_remove_exception(e);
+ free_completed_exception(e);
+ return 0;
+ }
+
+ /*
+ * The chunk may be either at the beginning or the end of a
+ * group of consecutive chunks - never in the middle. We are
+ * removing chunks in the opposite order to that in which they
+ * were added, so this should always be true.
+ * Decrement the consecutive chunk counter and adjust the
+ * starting point if necessary.
+ */
+ if (old_chunk == e->old_chunk) {
+ e->old_chunk++;
+ e->new_chunk++;
+ } else if (old_chunk != e->old_chunk +
+ dm_consecutive_chunk_count(e)) {
+ DMERR("Attempt to merge block %llu from the "
+ "middle of a chunk range [%llu - %llu]",
+ (unsigned long long)old_chunk,
+ (unsigned long long)e->old_chunk,
+ (unsigned long long)
+ e->old_chunk + dm_consecutive_chunk_count(e));
+ return -EINVAL;
+ }
+
+ dm_consecutive_chunk_count_dec(e);
+
+ return 0;
+}
+
+static void flush_bios(struct bio *bio);
+
+static int remove_single_exception_chunk(struct dm_snapshot *s)
+{
+ struct bio *b = NULL;
+ int r;
+ chunk_t old_chunk = s->first_merging_chunk + s->num_merging_chunks - 1;
+
+ down_write(&s->lock);
+
+ /*
+ * Process chunks (and associated exceptions) in reverse order
+ * so that dm_consecutive_chunk_count_dec() accounting works.
+ */
+ do {
+ r = __remove_single_exception_chunk(s, old_chunk);
+ if (r)
+ goto out;
+ } while (old_chunk-- > s->first_merging_chunk);
+
+ b = __release_queued_bios_after_merge(s);
+
+out:
+ up_write(&s->lock);
+ if (b)
+ flush_bios(b);
+
+ return r;
+}
+
+static int origin_write_extent(struct dm_snapshot *merging_snap,
+ sector_t sector, unsigned chunk_size);
+
+static void merge_callback(int read_err, unsigned long write_err,
+ void *context);
+
+static uint64_t read_pending_exceptions_done_count(void)
+{
+ uint64_t pending_exceptions_done;
+
+ spin_lock(&_pending_exceptions_done_spinlock);
+ pending_exceptions_done = _pending_exceptions_done_count;
+ spin_unlock(&_pending_exceptions_done_spinlock);
+
+ return pending_exceptions_done;
+}
+
+static void increment_pending_exceptions_done_count(void)
+{
+ spin_lock(&_pending_exceptions_done_spinlock);
+ _pending_exceptions_done_count++;
+ spin_unlock(&_pending_exceptions_done_spinlock);
+
+ wake_up_all(&_pending_exceptions_done);
+}
+
+static void snapshot_merge_next_chunks(struct dm_snapshot *s)
+{
+ int i, linear_chunks;
+ chunk_t old_chunk, new_chunk;
+ struct dm_io_region src, dest;
+ sector_t io_size;
+ uint64_t previous_count;
+
+ BUG_ON(!test_bit(RUNNING_MERGE, &s->state_bits));
+ if (unlikely(test_bit(SHUTDOWN_MERGE, &s->state_bits)))
+ goto shut;
+
+ /*
+ * valid flag never changes during merge, so no lock required.
+ */
+ if (!s->valid) {
+ DMERR("Snapshot is invalid: can't merge");
+ goto shut;
+ }
+
+ linear_chunks = s->store->type->prepare_merge(s->store, &old_chunk,
+ &new_chunk);
+ if (linear_chunks <= 0) {
+ if (linear_chunks < 0) {
+ DMERR("Read error in exception store: "
+ "shutting down merge");
+ down_write(&s->lock);
+ s->merge_failed = 1;
+ up_write(&s->lock);
+ }
+ goto shut;
+ }
+
+ /* Adjust old_chunk and new_chunk to reflect start of linear region */
+ old_chunk = old_chunk + 1 - linear_chunks;
+ new_chunk = new_chunk + 1 - linear_chunks;
+
+ /*
+ * Use one (potentially large) I/O to copy all 'linear_chunks'
+ * from the exception store to the origin
+ */
+ io_size = linear_chunks * s->store->chunk_size;
+
+ dest.bdev = s->origin->bdev;
+ dest.sector = chunk_to_sector(s->store, old_chunk);
+ dest.count = min(io_size, get_dev_size(dest.bdev) - dest.sector);
+
+ src.bdev = s->cow->bdev;
+ src.sector = chunk_to_sector(s->store, new_chunk);
+ src.count = dest.count;
+
+ /*
+ * Reallocate any exceptions needed in other snapshots then
+ * wait for the pending exceptions to complete.
+ * Each time any pending exception (globally on the system)
+ * completes we are woken and repeat the process to find out
+ * if we can proceed. While this may not seem a particularly
+ * efficient algorithm, it is not expected to have any
+ * significant impact on performance.
+ */
+ previous_count = read_pending_exceptions_done_count();
+ while (origin_write_extent(s, dest.sector, io_size)) {
+ wait_event(_pending_exceptions_done,
+ (read_pending_exceptions_done_count() !=
+ previous_count));
+ /* Retry after the wait, until all exceptions are done. */
+ previous_count = read_pending_exceptions_done_count();
+ }
+
+ down_write(&s->lock);
+ s->first_merging_chunk = old_chunk;
+ s->num_merging_chunks = linear_chunks;
+ up_write(&s->lock);
+
+ /* Wait until writes to all 'linear_chunks' drain */
+ for (i = 0; i < linear_chunks; i++)
+ __check_for_conflicting_io(s, old_chunk + i);
+
+ dm_kcopyd_copy(s->kcopyd_client, &src, 1, &dest, 0, merge_callback, s);
+ return;
+
+shut:
+ merge_shutdown(s);
+}
+
+static void error_bios(struct bio *bio);
+
+static void merge_callback(int read_err, unsigned long write_err, void *context)
+{
+ struct dm_snapshot *s = context;
+ struct bio *b = NULL;
+
+ if (read_err || write_err) {
+ if (read_err)
+ DMERR("Read error: shutting down merge.");
+ else
+ DMERR("Write error: shutting down merge.");
+ goto shut;
+ }
+
+ if (s->store->type->commit_merge(s->store,
+ s->num_merging_chunks) < 0) {
+ DMERR("Write error in exception store: shutting down merge");
+ goto shut;
+ }
+
+ if (remove_single_exception_chunk(s) < 0)
+ goto shut;
+
+ snapshot_merge_next_chunks(s);
+
+ return;
+
+shut:
+ down_write(&s->lock);
+ s->merge_failed = 1;
+ b = __release_queued_bios_after_merge(s);
+ up_write(&s->lock);
+ error_bios(b);
+
+ merge_shutdown(s);
+}
+
+static void start_merge(struct dm_snapshot *s)
+{
+ if (!test_and_set_bit(RUNNING_MERGE, &s->state_bits))
+ snapshot_merge_next_chunks(s);
+}
+
+/*
+ * Stop the merging process and wait until it finishes.
+ */
+static void stop_merge(struct dm_snapshot *s)
+{
+ set_bit(SHUTDOWN_MERGE, &s->state_bits);
+ wait_on_bit(&s->state_bits, RUNNING_MERGE, TASK_UNINTERRUPTIBLE);
+ clear_bit(SHUTDOWN_MERGE, &s->state_bits);
+}
+
+/*
+ * Construct a snapshot mapping: <origin_dev> <COW-dev> <p/n> <chunk-size>
+ */
+static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
+{
+ struct dm_snapshot *s;
+ int i;
+ int r = -EINVAL;
+ char *origin_path, *cow_path;
+ unsigned args_used, num_flush_bios = 1;
+ fmode_t origin_mode = FMODE_READ;
+
+ if (argc != 4) {
+ ti->error = "requires exactly 4 arguments";
+ r = -EINVAL;
+ goto bad;
+ }
+
+ if (dm_target_is_snapshot_merge(ti)) {
+ num_flush_bios = 2;
+ origin_mode = FMODE_WRITE;
+ }
+
+ s = kmalloc(sizeof(*s), GFP_KERNEL);
+ if (!s) {
+ ti->error = "Cannot allocate private snapshot structure";
+ r = -ENOMEM;
+ goto bad;
+ }
+
+ origin_path = argv[0];
+ argv++;
+ argc--;
+
+ r = dm_get_device(ti, origin_path, origin_mode, &s->origin);
+ if (r) {
+ ti->error = "Cannot get origin device";
+ goto bad_origin;
+ }
+
+ cow_path = argv[0];
+ argv++;
+ argc--;
+
+ r = dm_get_device(ti, cow_path, dm_table_get_mode(ti->table), &s->cow);
+ if (r) {
+ ti->error = "Cannot get COW device";
+ goto bad_cow;
+ }
+
+ r = dm_exception_store_create(ti, argc, argv, s, &args_used, &s->store);
+ if (r) {
+ ti->error = "Couldn't create exception store";
+ r = -EINVAL;
+ goto bad_store;
+ }
+
+ argv += args_used;
+ argc -= args_used;
+
+ s->ti = ti;
+ s->valid = 1;
+ s->active = 0;
+ atomic_set(&s->pending_exceptions_count, 0);
+ s->exception_start_sequence = 0;
+ s->exception_complete_sequence = 0;
+ INIT_LIST_HEAD(&s->out_of_order_list);
+ init_rwsem(&s->lock);
+ INIT_LIST_HEAD(&s->list);
+ spin_lock_init(&s->pe_lock);
+ s->state_bits = 0;
+ s->merge_failed = 0;
+ s->first_merging_chunk = 0;
+ s->num_merging_chunks = 0;
+ bio_list_init(&s->bios_queued_during_merge);
+
+ /* Allocate hash table for COW data */
+ if (init_hash_tables(s)) {
+ ti->error = "Unable to allocate hash table space";
+ r = -ENOMEM;
+ goto bad_hash_tables;
+ }
+
+ s->kcopyd_client = dm_kcopyd_client_create(&dm_kcopyd_throttle);
+ if (IS_ERR(s->kcopyd_client)) {
+ r = PTR_ERR(s->kcopyd_client);
+ ti->error = "Could not create kcopyd client";
+ goto bad_kcopyd;
+ }
+
+ s->pending_pool = mempool_create_slab_pool(MIN_IOS, pending_cache);
+ if (!s->pending_pool) {
+ ti->error = "Could not allocate mempool for pending exceptions";
+ r = -ENOMEM;
+ goto bad_pending_pool;
+ }
+
+ for (i = 0; i < DM_TRACKED_CHUNK_HASH_SIZE; i++)
+ INIT_HLIST_HEAD(&s->tracked_chunk_hash[i]);
+
+ spin_lock_init(&s->tracked_chunk_lock);
+
+ ti->private = s;
+ ti->num_flush_bios = num_flush_bios;
+ ti->per_bio_data_size = sizeof(struct dm_snap_tracked_chunk);
+
+ /* Add snapshot to the list of snapshots for this origin */
+ /* Exceptions aren't triggered till snapshot_resume() is called */
+ r = register_snapshot(s);
+ if (r == -ENOMEM) {
+ ti->error = "Snapshot origin struct allocation failed";
+ goto bad_load_and_register;
+ } else if (r < 0) {
+ /* invalid handover, register_snapshot has set ti->error */
+ goto bad_load_and_register;
+ }
+
+ /*
+ * Metadata must only be loaded into one table at once, so skip this
+ * if metadata will be handed over during resume.
+ * Chunk size will be set during the handover - set it to zero to
+ * ensure it's ignored.
+ */
+ if (r > 0) {
+ s->store->chunk_size = 0;
+ return 0;
+ }
+
+ r = s->store->type->read_metadata(s->store, dm_add_exception,
+ (void *)s);
+ if (r < 0) {
+ ti->error = "Failed to read snapshot metadata";
+ goto bad_read_metadata;
+ } else if (r > 0) {
+ s->valid = 0;
+ DMWARN("Snapshot is marked invalid.");
+ }
+
+ if (!s->store->chunk_size) {
+ ti->error = "Chunk size not set";
+ goto bad_read_metadata;
+ }
+
+ r = dm_set_target_max_io_len(ti, s->store->chunk_size);
+ if (r)
+ goto bad_read_metadata;
+
+ return 0;
+
+bad_read_metadata:
+ unregister_snapshot(s);
+
+bad_load_and_register:
+ mempool_destroy(s->pending_pool);
+
+bad_pending_pool:
+ dm_kcopyd_client_destroy(s->kcopyd_client);
+
+bad_kcopyd:
+ dm_exception_table_exit(&s->pending, pending_cache);
+ dm_exception_table_exit(&s->complete, exception_cache);
+
+bad_hash_tables:
+ dm_exception_store_destroy(s->store);
+
+bad_store:
+ dm_put_device(ti, s->cow);
+
+bad_cow:
+ dm_put_device(ti, s->origin);
+
+bad_origin:
+ kfree(s);
+
+bad:
+ return r;
+}
+
+static void __free_exceptions(struct dm_snapshot *s)
+{
+ dm_kcopyd_client_destroy(s->kcopyd_client);
+ s->kcopyd_client = NULL;
+
+ dm_exception_table_exit(&s->pending, pending_cache);
+ dm_exception_table_exit(&s->complete, exception_cache);
+}
+
+static void __handover_exceptions(struct dm_snapshot *snap_src,
+ struct dm_snapshot *snap_dest)
+{
+ union {
+ struct dm_exception_table table_swap;
+ struct dm_exception_store *store_swap;
+ } u;
+
+ /*
+ * Swap all snapshot context information between the two instances.
+ */
+ u.table_swap = snap_dest->complete;
+ snap_dest->complete = snap_src->complete;
+ snap_src->complete = u.table_swap;
+
+ u.store_swap = snap_dest->store;
+ snap_dest->store = snap_src->store;
+ snap_src->store = u.store_swap;
+
+ snap_dest->store->snap = snap_dest;
+ snap_src->store->snap = snap_src;
+
+ snap_dest->ti->max_io_len = snap_dest->store->chunk_size;
+ snap_dest->valid = snap_src->valid;
+
+ /*
+ * Set source invalid to ensure it receives no further I/O.
+ */
+ snap_src->valid = 0;
+}
+
+static void snapshot_dtr(struct dm_target *ti)
+{
+#ifdef CONFIG_DM_DEBUG
+ int i;
+#endif
+ struct dm_snapshot *s = ti->private;
+ struct dm_snapshot *snap_src = NULL, *snap_dest = NULL;
+
+ down_read(&_origins_lock);
+ /* Check whether exception handover must be cancelled */
+ (void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL);
+ if (snap_src && snap_dest && (s == snap_src)) {
+ down_write(&snap_dest->lock);
+ snap_dest->valid = 0;
+ up_write(&snap_dest->lock);
+ DMERR("Cancelling snapshot handover.");
+ }
+ up_read(&_origins_lock);
+
+ if (dm_target_is_snapshot_merge(ti))
+ stop_merge(s);
+
+ /* Prevent further origin writes from using this snapshot. */
+ /* After this returns there can be no new kcopyd jobs. */
+ unregister_snapshot(s);
+
+ while (atomic_read(&s->pending_exceptions_count))
+ msleep(1);
+ /*
+ * Ensure instructions in mempool_destroy aren't reordered
+ * before atomic_read.
+ */
+ smp_mb();
+
+#ifdef CONFIG_DM_DEBUG
+ for (i = 0; i < DM_TRACKED_CHUNK_HASH_SIZE; i++)
+ BUG_ON(!hlist_empty(&s->tracked_chunk_hash[i]));
+#endif
+
+ __free_exceptions(s);
+
+ mempool_destroy(s->pending_pool);
+
+ dm_exception_store_destroy(s->store);
+
+ dm_put_device(ti, s->cow);
+
+ dm_put_device(ti, s->origin);
+
+ kfree(s);
+}
+
+/*
+ * Flush a list of buffers.
+ */
+static void flush_bios(struct bio *bio)
+{
+ struct bio *n;
+
+ while (bio) {
+ n = bio->bi_next;
+ bio->bi_next = NULL;
+ generic_make_request(bio);
+ bio = n;
+ }
+}
+
+static int do_origin(struct dm_dev *origin, struct bio *bio);
+
+/*
+ * Flush a list of buffers.
+ */
+static void retry_origin_bios(struct dm_snapshot *s, struct bio *bio)
+{
+ struct bio *n;
+ int r;
+
+ while (bio) {
+ n = bio->bi_next;
+ bio->bi_next = NULL;
+ r = do_origin(s->origin, bio);
+ if (r == DM_MAPIO_REMAPPED)
+ generic_make_request(bio);
+ bio = n;
+ }
+}
+
+/*
+ * Error a list of buffers.
+ */
+static void error_bios(struct bio *bio)
+{
+ struct bio *n;
+
+ while (bio) {
+ n = bio->bi_next;
+ bio->bi_next = NULL;
+ bio_io_error(bio);
+ bio = n;
+ }
+}
+
+static void __invalidate_snapshot(struct dm_snapshot *s, int err)
+{
+ if (!s->valid)
+ return;
+
+ if (err == -EIO)
+ DMERR("Invalidating snapshot: Error reading/writing.");
+ else if (err == -ENOMEM)
+ DMERR("Invalidating snapshot: Unable to allocate exception.");
+
+ if (s->store->type->drop_snapshot)
+ s->store->type->drop_snapshot(s->store);
+
+ s->valid = 0;
+
+ dm_table_event(s->ti->table);
+}
+
+static void pending_complete(struct dm_snap_pending_exception *pe, int success)
+{
+ struct dm_exception *e;
+ struct dm_snapshot *s = pe->snap;
+ struct bio *origin_bios = NULL;
+ struct bio *snapshot_bios = NULL;
+ struct bio *full_bio = NULL;
+ int error = 0;
+
+ if (!success) {
+ /* Read/write error - snapshot is unusable */
+ down_write(&s->lock);
+ __invalidate_snapshot(s, -EIO);
+ error = 1;
+ goto out;
+ }
+
+ e = alloc_completed_exception(GFP_NOIO);
+ if (!e) {
+ down_write(&s->lock);
+ __invalidate_snapshot(s, -ENOMEM);
+ error = 1;
+ goto out;
+ }
+ *e = pe->e;
+
+ down_write(&s->lock);
+ if (!s->valid) {
+ free_completed_exception(e);
+ error = 1;
+ goto out;
+ }
+
+ /* Check for conflicting reads */
+ __check_for_conflicting_io(s, pe->e.old_chunk);
+
+ /*
+ * Add a proper exception, and remove the
+ * in-flight exception from the list.
+ */
+ dm_insert_exception(&s->complete, e);
+
+out:
+ dm_remove_exception(&pe->e);
+ snapshot_bios = bio_list_get(&pe->snapshot_bios);
+ origin_bios = bio_list_get(&pe->origin_bios);
+ full_bio = pe->full_bio;
+ if (full_bio) {
+ full_bio->bi_end_io = pe->full_bio_end_io;
+ full_bio->bi_private = pe->full_bio_private;
+ atomic_inc(&full_bio->bi_remaining);
+ }
+ increment_pending_exceptions_done_count();
+
+ up_write(&s->lock);
+
+ /* Submit any pending write bios */
+ if (error) {
+ if (full_bio)
+ bio_io_error(full_bio);
+ error_bios(snapshot_bios);
+ } else {
+ if (full_bio)
+ bio_endio(full_bio, 0);
+ flush_bios(snapshot_bios);
+ }
+
+ retry_origin_bios(s, origin_bios);
+
+ free_pending_exception(pe);
+}
+
+static void commit_callback(void *context, int success)
+{
+ struct dm_snap_pending_exception *pe = context;
+
+ pending_complete(pe, success);
+}
+
+static void complete_exception(struct dm_snap_pending_exception *pe)
+{
+ struct dm_snapshot *s = pe->snap;
+
+ if (unlikely(pe->copy_error))
+ pending_complete(pe, 0);
+
+ else
+ /* Update the metadata if we are persistent */
+ s->store->type->commit_exception(s->store, &pe->e,
+ commit_callback, pe);
+}
+
+/*
+ * Called when the copy I/O has finished. kcopyd actually runs
+ * this code so don't block.
+ */
+static void copy_callback(int read_err, unsigned long write_err, void *context)
+{
+ struct dm_snap_pending_exception *pe = context;
+ struct dm_snapshot *s = pe->snap;
+
+ pe->copy_error = read_err || write_err;
+
+ if (pe->exception_sequence == s->exception_complete_sequence) {
+ s->exception_complete_sequence++;
+ complete_exception(pe);
+
+ while (!list_empty(&s->out_of_order_list)) {
+ pe = list_entry(s->out_of_order_list.next,
+ struct dm_snap_pending_exception, out_of_order_entry);
+ if (pe->exception_sequence != s->exception_complete_sequence)
+ break;
+ s->exception_complete_sequence++;
+ list_del(&pe->out_of_order_entry);
+ complete_exception(pe);
+ }
+ } else {
+ struct list_head *lh;
+ struct dm_snap_pending_exception *pe2;
+
+ list_for_each_prev(lh, &s->out_of_order_list) {
+ pe2 = list_entry(lh, struct dm_snap_pending_exception, out_of_order_entry);
+ if (pe2->exception_sequence < pe->exception_sequence)
+ break;
+ }
+ list_add(&pe->out_of_order_entry, lh);
+ }
+}
+
+/*
+ * Dispatches the copy operation to kcopyd.
+ */
+static void start_copy(struct dm_snap_pending_exception *pe)
+{
+ struct dm_snapshot *s = pe->snap;
+ struct dm_io_region src, dest;
+ struct block_device *bdev = s->origin->bdev;
+ sector_t dev_size;
+
+ dev_size = get_dev_size(bdev);
+
+ src.bdev = bdev;
+ src.sector = chunk_to_sector(s->store, pe->e.old_chunk);
+ src.count = min((sector_t)s->store->chunk_size, dev_size - src.sector);
+
+ dest.bdev = s->cow->bdev;
+ dest.sector = chunk_to_sector(s->store, pe->e.new_chunk);
+ dest.count = src.count;
+
+ /* Hand over to kcopyd */
+ dm_kcopyd_copy(s->kcopyd_client, &src, 1, &dest, 0, copy_callback, pe);
+}
+
+static void full_bio_end_io(struct bio *bio, int error)
+{
+ void *callback_data = bio->bi_private;
+
+ dm_kcopyd_do_callback(callback_data, 0, error ? 1 : 0);
+}
+
+static void start_full_bio(struct dm_snap_pending_exception *pe,
+ struct bio *bio)
+{
+ struct dm_snapshot *s = pe->snap;
+ void *callback_data;
+
+ pe->full_bio = bio;
+ pe->full_bio_end_io = bio->bi_end_io;
+ pe->full_bio_private = bio->bi_private;
+
+ callback_data = dm_kcopyd_prepare_callback(s->kcopyd_client,
+ copy_callback, pe);
+
+ bio->bi_end_io = full_bio_end_io;
+ bio->bi_private = callback_data;
+
+ generic_make_request(bio);
+}
+
+static struct dm_snap_pending_exception *
+__lookup_pending_exception(struct dm_snapshot *s, chunk_t chunk)
+{
+ struct dm_exception *e = dm_lookup_exception(&s->pending, chunk);
+
+ if (!e)
+ return NULL;
+
+ return container_of(e, struct dm_snap_pending_exception, e);
+}
+
+/*
+ * Looks to see if this snapshot already has a pending exception
+ * for this chunk, otherwise it allocates a new one and inserts
+ * it into the pending table.
+ *
+ * NOTE: a write lock must be held on snap->lock before calling
+ * this.
+ */
+static struct dm_snap_pending_exception *
+__find_pending_exception(struct dm_snapshot *s,
+ struct dm_snap_pending_exception *pe, chunk_t chunk)
+{
+ struct dm_snap_pending_exception *pe2;
+
+ pe2 = __lookup_pending_exception(s, chunk);
+ if (pe2) {
+ free_pending_exception(pe);
+ return pe2;
+ }
+
+ pe->e.old_chunk = chunk;
+ bio_list_init(&pe->origin_bios);
+ bio_list_init(&pe->snapshot_bios);
+ pe->started = 0;
+ pe->full_bio = NULL;
+
+ if (s->store->type->prepare_exception(s->store, &pe->e)) {
+ free_pending_exception(pe);
+ return NULL;
+ }
+
+ pe->exception_sequence = s->exception_start_sequence++;
+
+ dm_insert_exception(&s->pending, &pe->e);
+
+ return pe;
+}
+
+static void remap_exception(struct dm_snapshot *s, struct dm_exception *e,
+ struct bio *bio, chunk_t chunk)
+{
+ bio->bi_bdev = s->cow->bdev;
+ bio->bi_iter.bi_sector =
+ chunk_to_sector(s->store, dm_chunk_number(e->new_chunk) +
+ (chunk - e->old_chunk)) +
+ (bio->bi_iter.bi_sector & s->store->chunk_mask);
+}
+
+static int snapshot_map(struct dm_target *ti, struct bio *bio)
+{
+ struct dm_exception *e;
+ struct dm_snapshot *s = ti->private;
+ int r = DM_MAPIO_REMAPPED;
+ chunk_t chunk;
+ struct dm_snap_pending_exception *pe = NULL;
+
+ init_tracked_chunk(bio);
+
+ if (bio->bi_rw & REQ_FLUSH) {
+ bio->bi_bdev = s->cow->bdev;
+ return DM_MAPIO_REMAPPED;
+ }
+
+ chunk = sector_to_chunk(s->store, bio->bi_iter.bi_sector);
+
+ /* Full snapshots are not usable */
+ /* To get here the table must be live so s->active is always set. */
+ if (!s->valid)
+ return -EIO;
+
+ /* FIXME: should only take write lock if we need
+ * to copy an exception */
+ down_write(&s->lock);
+
+ if (!s->valid) {
+ r = -EIO;
+ goto out_unlock;
+ }
+
+ /* If the block is already remapped - use that, else remap it */
+ e = dm_lookup_exception(&s->complete, chunk);
+ if (e) {
+ remap_exception(s, e, bio, chunk);
+ goto out_unlock;
+ }
+
+ /*
+ * Write to snapshot - higher level takes care of RW/RO
+ * flags so we should only get this if we are
+ * writeable.
+ */
+ if (bio_rw(bio) == WRITE) {
+ pe = __lookup_pending_exception(s, chunk);
+ if (!pe) {
+ up_write(&s->lock);
+ pe = alloc_pending_exception(s);
+ down_write(&s->lock);
+
+ if (!s->valid) {
+ free_pending_exception(pe);
+ r = -EIO;
+ goto out_unlock;
+ }
+
+ e = dm_lookup_exception(&s->complete, chunk);
+ if (e) {
+ free_pending_exception(pe);
+ remap_exception(s, e, bio, chunk);
+ goto out_unlock;
+ }
+
+ pe = __find_pending_exception(s, pe, chunk);
+ if (!pe) {
+ __invalidate_snapshot(s, -ENOMEM);
+ r = -EIO;
+ goto out_unlock;
+ }
+ }
+
+ remap_exception(s, &pe->e, bio, chunk);
+
+ r = DM_MAPIO_SUBMITTED;
+
+ if (!pe->started &&
+ bio->bi_iter.bi_size ==
+ (s->store->chunk_size << SECTOR_SHIFT)) {
+ pe->started = 1;
+ up_write(&s->lock);
+ start_full_bio(pe, bio);
+ goto out;
+ }
+
+ bio_list_add(&pe->snapshot_bios, bio);
+
+ if (!pe->started) {
+ /* this is protected by snap->lock */
+ pe->started = 1;
+ up_write(&s->lock);
+ start_copy(pe);
+ goto out;
+ }
+ } else {
+ bio->bi_bdev = s->origin->bdev;
+ track_chunk(s, bio, chunk);
+ }
+
+out_unlock:
+ up_write(&s->lock);
+out:
+ return r;
+}
+
+/*
+ * A snapshot-merge target behaves like a combination of a snapshot
+ * target and a snapshot-origin target. It only generates new
+ * exceptions in other snapshots and not in the one that is being
+ * merged.
+ *
+ * For each chunk, if there is an existing exception, it is used to
+ * redirect I/O to the cow device. Otherwise I/O is sent to the origin,
+ * which in turn might generate exceptions in other snapshots.
+ * If merging is currently taking place on the chunk in question, the
+ * I/O is deferred by adding it to s->bios_queued_during_merge.
+ */
+static int snapshot_merge_map(struct dm_target *ti, struct bio *bio)
+{
+ struct dm_exception *e;
+ struct dm_snapshot *s = ti->private;
+ int r = DM_MAPIO_REMAPPED;
+ chunk_t chunk;
+
+ init_tracked_chunk(bio);
+
+ if (bio->bi_rw & REQ_FLUSH) {
+ if (!dm_bio_get_target_bio_nr(bio))
+ bio->bi_bdev = s->origin->bdev;
+ else
+ bio->bi_bdev = s->cow->bdev;
+ return DM_MAPIO_REMAPPED;
+ }
+
+ chunk = sector_to_chunk(s->store, bio->bi_iter.bi_sector);
+
+ down_write(&s->lock);
+
+ /* Full merging snapshots are redirected to the origin */
+ if (!s->valid)
+ goto redirect_to_origin;
+
+ /* If the block is already remapped - use that */
+ e = dm_lookup_exception(&s->complete, chunk);
+ if (e) {
+ /* Queue writes overlapping with chunks being merged */
+ if (bio_rw(bio) == WRITE &&
+ chunk >= s->first_merging_chunk &&
+ chunk < (s->first_merging_chunk +
+ s->num_merging_chunks)) {
+ bio->bi_bdev = s->origin->bdev;
+ bio_list_add(&s->bios_queued_during_merge, bio);
+ r = DM_MAPIO_SUBMITTED;
+ goto out_unlock;
+ }
+
+ remap_exception(s, e, bio, chunk);
+
+ if (bio_rw(bio) == WRITE)
+ track_chunk(s, bio, chunk);
+ goto out_unlock;
+ }
+
+redirect_to_origin:
+ bio->bi_bdev = s->origin->bdev;
+
+ if (bio_rw(bio) == WRITE) {
+ up_write(&s->lock);
+ return do_origin(s->origin, bio);
+ }
+
+out_unlock:
+ up_write(&s->lock);
+
+ return r;
+}
+
+static int snapshot_end_io(struct dm_target *ti, struct bio *bio, int error)
+{
+ struct dm_snapshot *s = ti->private;
+
+ if (is_bio_tracked(bio))
+ stop_tracking_chunk(s, bio);
+
+ return 0;
+}
+
+static void snapshot_merge_presuspend(struct dm_target *ti)
+{
+ struct dm_snapshot *s = ti->private;
+
+ stop_merge(s);
+}
+
+static int snapshot_preresume(struct dm_target *ti)
+{
+ int r = 0;
+ struct dm_snapshot *s = ti->private;
+ struct dm_snapshot *snap_src = NULL, *snap_dest = NULL;
+
+ down_read(&_origins_lock);
+ (void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL);
+ if (snap_src && snap_dest) {
+ down_read(&snap_src->lock);
+ if (s == snap_src) {
+ DMERR("Unable to resume snapshot source until "
+ "handover completes.");
+ r = -EINVAL;
+ } else if (!dm_suspended(snap_src->ti)) {
+ DMERR("Unable to perform snapshot handover until "
+ "source is suspended.");
+ r = -EINVAL;
+ }
+ up_read(&snap_src->lock);
+ }
+ up_read(&_origins_lock);
+
+ return r;
+}
+
+static void snapshot_resume(struct dm_target *ti)
+{
+ struct dm_snapshot *s = ti->private;
+ struct dm_snapshot *snap_src = NULL, *snap_dest = NULL, *snap_merging = NULL;
+ struct dm_origin *o;
+ struct mapped_device *origin_md = NULL;
+ bool must_restart_merging = false;
+
+ down_read(&_origins_lock);
+
+ o = __lookup_dm_origin(s->origin->bdev);
+ if (o)
+ origin_md = dm_table_get_md(o->ti->table);
+ if (!origin_md) {
+ (void) __find_snapshots_sharing_cow(s, NULL, NULL, &snap_merging);
+ if (snap_merging)
+ origin_md = dm_table_get_md(snap_merging->ti->table);
+ }
+ if (origin_md == dm_table_get_md(ti->table))
+ origin_md = NULL;
+ if (origin_md) {
+ if (dm_hold(origin_md))
+ origin_md = NULL;
+ }
+
+ up_read(&_origins_lock);
+
+ if (origin_md) {
+ dm_internal_suspend_fast(origin_md);
+ if (snap_merging && test_bit(RUNNING_MERGE, &snap_merging->state_bits)) {
+ must_restart_merging = true;
+ stop_merge(snap_merging);
+ }
+ }
+
+ down_read(&_origins_lock);
+
+ (void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL);
+ if (snap_src && snap_dest) {
+ down_write(&snap_src->lock);
+ down_write_nested(&snap_dest->lock, SINGLE_DEPTH_NESTING);
+ __handover_exceptions(snap_src, snap_dest);
+ up_write(&snap_dest->lock);
+ up_write(&snap_src->lock);
+ }
+
+ up_read(&_origins_lock);
+
+ if (origin_md) {
+ if (must_restart_merging)
+ start_merge(snap_merging);
+ dm_internal_resume_fast(origin_md);
+ dm_put(origin_md);
+ }
+
+ /* Now we have correct chunk size, reregister */
+ reregister_snapshot(s);
+
+ down_write(&s->lock);
+ s->active = 1;
+ up_write(&s->lock);
+}
+
+static uint32_t get_origin_minimum_chunksize(struct block_device *bdev)
+{
+ uint32_t min_chunksize;
+
+ down_read(&_origins_lock);
+ min_chunksize = __minimum_chunk_size(__lookup_origin(bdev));
+ up_read(&_origins_lock);
+
+ return min_chunksize;
+}
+
+static void snapshot_merge_resume(struct dm_target *ti)
+{
+ struct dm_snapshot *s = ti->private;
+
+ /*
+ * Handover exceptions from existing snapshot.
+ */
+ snapshot_resume(ti);
+
+ /*
+ * snapshot-merge acts as an origin, so set ti->max_io_len
+ */
+ ti->max_io_len = get_origin_minimum_chunksize(s->origin->bdev);
+
+ start_merge(s);
+}
+
+static void snapshot_status(struct dm_target *ti, status_type_t type,
+ unsigned status_flags, char *result, unsigned maxlen)
+{
+ unsigned sz = 0;
+ struct dm_snapshot *snap = ti->private;
+
+ switch (type) {
+ case STATUSTYPE_INFO:
+
+ down_write(&snap->lock);
+
+ if (!snap->valid)
+ DMEMIT("Invalid");
+ else if (snap->merge_failed)
+ DMEMIT("Merge failed");
+ else {
+ if (snap->store->type->usage) {
+ sector_t total_sectors, sectors_allocated,
+ metadata_sectors;
+ snap->store->type->usage(snap->store,
+ &total_sectors,
+ &sectors_allocated,
+ &metadata_sectors);
+ DMEMIT("%llu/%llu %llu",
+ (unsigned long long)sectors_allocated,
+ (unsigned long long)total_sectors,
+ (unsigned long long)metadata_sectors);
+ }
+ else
+ DMEMIT("Unknown");
+ }
+
+ up_write(&snap->lock);
+
+ break;
+
+ case STATUSTYPE_TABLE:
+ /*
+ * kdevname returns a static pointer so we need
+ * to make private copies if the output is to
+ * make sense.
+ */
+ DMEMIT("%s %s", snap->origin->name, snap->cow->name);
+ snap->store->type->status(snap->store, type, result + sz,
+ maxlen - sz);
+ break;
+ }
+}
+
+static int snapshot_iterate_devices(struct dm_target *ti,
+ iterate_devices_callout_fn fn, void *data)
+{
+ struct dm_snapshot *snap = ti->private;
+ int r;
+
+ r = fn(ti, snap->origin, 0, ti->len, data);
+
+ if (!r)
+ r = fn(ti, snap->cow, 0, get_dev_size(snap->cow->bdev), data);
+
+ return r;
+}
+
+
+/*-----------------------------------------------------------------
+ * Origin methods
+ *---------------------------------------------------------------*/
+
+/*
+ * If no exceptions need creating, DM_MAPIO_REMAPPED is returned and any
+ * supplied bio was ignored. The caller may submit it immediately.
+ * (No remapping actually occurs as the origin is always a direct linear
+ * map.)
+ *
+ * If further exceptions are required, DM_MAPIO_SUBMITTED is returned
+ * and any supplied bio is added to a list to be submitted once all
+ * the necessary exceptions exist.
+ */
+static int __origin_write(struct list_head *snapshots, sector_t sector,
+ struct bio *bio)
+{
+ int r = DM_MAPIO_REMAPPED;
+ struct dm_snapshot *snap;
+ struct dm_exception *e;
+ struct dm_snap_pending_exception *pe;
+ struct dm_snap_pending_exception *pe_to_start_now = NULL;
+ struct dm_snap_pending_exception *pe_to_start_last = NULL;
+ chunk_t chunk;
+
+ /* Do all the snapshots on this origin */
+ list_for_each_entry (snap, snapshots, list) {
+ /*
+ * Don't make new exceptions in a merging snapshot
+ * because it has effectively been deleted
+ */
+ if (dm_target_is_snapshot_merge(snap->ti))
+ continue;
+
+ down_write(&snap->lock);
+
+ /* Only deal with valid and active snapshots */
+ if (!snap->valid || !snap->active)
+ goto next_snapshot;
+
+ /* Nothing to do if writing beyond end of snapshot */
+ if (sector >= dm_table_get_size(snap->ti->table))
+ goto next_snapshot;
+
+ /*
+ * Remember, different snapshots can have
+ * different chunk sizes.
+ */
+ chunk = sector_to_chunk(snap->store, sector);
+
+ /*
+ * Check exception table to see if block
+ * is already remapped in this snapshot
+ * and trigger an exception if not.
+ */
+ e = dm_lookup_exception(&snap->complete, chunk);
+ if (e)
+ goto next_snapshot;
+
+ pe = __lookup_pending_exception(snap, chunk);
+ if (!pe) {
+ up_write(&snap->lock);
+ pe = alloc_pending_exception(snap);
+ down_write(&snap->lock);
+
+ if (!snap->valid) {
+ free_pending_exception(pe);
+ goto next_snapshot;
+ }
+
+ e = dm_lookup_exception(&snap->complete, chunk);
+ if (e) {
+ free_pending_exception(pe);
+ goto next_snapshot;
+ }
+
+ pe = __find_pending_exception(snap, pe, chunk);
+ if (!pe) {
+ __invalidate_snapshot(snap, -ENOMEM);
+ goto next_snapshot;
+ }
+ }
+
+ r = DM_MAPIO_SUBMITTED;
+
+ /*
+ * If an origin bio was supplied, queue it to wait for the
+ * completion of this exception, and start this one last,
+ * at the end of the function.
+ */
+ if (bio) {
+ bio_list_add(&pe->origin_bios, bio);
+ bio = NULL;
+
+ if (!pe->started) {
+ pe->started = 1;
+ pe_to_start_last = pe;
+ }
+ }
+
+ if (!pe->started) {
+ pe->started = 1;
+ pe_to_start_now = pe;
+ }
+
+next_snapshot:
+ up_write(&snap->lock);
+
+ if (pe_to_start_now) {
+ start_copy(pe_to_start_now);
+ pe_to_start_now = NULL;
+ }
+ }
+
+ /*
+ * Submit the exception against which the bio is queued last,
+ * to give the other exceptions a head start.
+ */
+ if (pe_to_start_last)
+ start_copy(pe_to_start_last);
+
+ return r;
+}
+
+/*
+ * Called on a write from the origin driver.
+ */
+static int do_origin(struct dm_dev *origin, struct bio *bio)
+{
+ struct origin *o;
+ int r = DM_MAPIO_REMAPPED;
+
+ down_read(&_origins_lock);
+ o = __lookup_origin(origin->bdev);
+ if (o)
+ r = __origin_write(&o->snapshots, bio->bi_iter.bi_sector, bio);
+ up_read(&_origins_lock);
+
+ return r;
+}
+
+/*
+ * Trigger exceptions in all non-merging snapshots.
+ *
+ * The chunk size of the merging snapshot may be larger than the chunk
+ * size of some other snapshot so we may need to reallocate multiple
+ * chunks in other snapshots.
+ *
+ * We scan all the overlapping exceptions in the other snapshots.
+ * Returns 1 if anything was reallocated and must be waited for,
+ * otherwise returns 0.
+ *
+ * size must be a multiple of merging_snap's chunk_size.
+ */
+static int origin_write_extent(struct dm_snapshot *merging_snap,
+ sector_t sector, unsigned size)
+{
+ int must_wait = 0;
+ sector_t n;
+ struct origin *o;
+
+ /*
+ * The origin's __minimum_chunk_size() got stored in max_io_len
+ * by snapshot_merge_resume().
+ */
+ down_read(&_origins_lock);
+ o = __lookup_origin(merging_snap->origin->bdev);
+ for (n = 0; n < size; n += merging_snap->ti->max_io_len)
+ if (__origin_write(&o->snapshots, sector + n, NULL) ==
+ DM_MAPIO_SUBMITTED)
+ must_wait = 1;
+ up_read(&_origins_lock);
+
+ return must_wait;
+}
+
+/*
+ * Origin: maps a linear range of a device, with hooks for snapshotting.
+ */
+
+/*
+ * Construct an origin mapping: <dev_path>
+ * The context for an origin is merely a 'struct dm_dev *'
+ * pointing to the real device.
+ */
+static int origin_ctr(struct dm_target *ti, unsigned int argc, char **argv)
+{
+ int r;
+ struct dm_origin *o;
+
+ if (argc != 1) {
+ ti->error = "origin: incorrect number of arguments";
+ return -EINVAL;
+ }
+
+ o = kmalloc(sizeof(struct dm_origin), GFP_KERNEL);
+ if (!o) {
+ ti->error = "Cannot allocate private origin structure";
+ r = -ENOMEM;
+ goto bad_alloc;
+ }
+
+ r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &o->dev);
+ if (r) {
+ ti->error = "Cannot get target device";
+ goto bad_open;
+ }
+
+ o->ti = ti;
+ ti->private = o;
+ ti->num_flush_bios = 1;
+
+ return 0;
+
+bad_open:
+ kfree(o);
+bad_alloc:
+ return r;
+}
+
+static void origin_dtr(struct dm_target *ti)
+{
+ struct dm_origin *o = ti->private;
+
+ dm_put_device(ti, o->dev);
+ kfree(o);
+}
+
+static int origin_map(struct dm_target *ti, struct bio *bio)
+{
+ struct dm_origin *o = ti->private;
+ unsigned available_sectors;
+
+ bio->bi_bdev = o->dev->bdev;
+
+ if (unlikely(bio->bi_rw & REQ_FLUSH))
+ return DM_MAPIO_REMAPPED;
+
+ if (bio_rw(bio) != WRITE)
+ return DM_MAPIO_REMAPPED;
+
+ available_sectors = o->split_boundary -
+ ((unsigned)bio->bi_iter.bi_sector & (o->split_boundary - 1));
+
+ if (bio_sectors(bio) > available_sectors)
+ dm_accept_partial_bio(bio, available_sectors);
+
+ /* Only tell snapshots if this is a write */
+ return do_origin(o->dev, bio);
+}
+
+/*
+ * Set the target "max_io_len" field to the minimum of all the snapshots'
+ * chunk sizes.
+ */
+static void origin_resume(struct dm_target *ti)
+{
+ struct dm_origin *o = ti->private;
+
+ o->split_boundary = get_origin_minimum_chunksize(o->dev->bdev);
+
+ down_write(&_origins_lock);
+ __insert_dm_origin(o);
+ up_write(&_origins_lock);
+}
+
+static void origin_postsuspend(struct dm_target *ti)
+{
+ struct dm_origin *o = ti->private;
+
+ down_write(&_origins_lock);
+ __remove_dm_origin(o);
+ up_write(&_origins_lock);
+}
+
+static void origin_status(struct dm_target *ti, status_type_t type,
+ unsigned status_flags, char *result, unsigned maxlen)
+{
+ struct dm_origin *o = ti->private;
+
+ switch (type) {
+ case STATUSTYPE_INFO:
+ result[0] = '\0';
+ break;
+
+ case STATUSTYPE_TABLE:
+ snprintf(result, maxlen, "%s", o->dev->name);
+ break;
+ }
+}
+
+static int origin_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
+ struct bio_vec *biovec, int max_size)
+{
+ struct dm_origin *o = ti->private;
+ struct request_queue *q = bdev_get_queue(o->dev->bdev);
+
+ if (!q->merge_bvec_fn)
+ return max_size;
+
+ bvm->bi_bdev = o->dev->bdev;
+
+ return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
+}
+
+static int origin_iterate_devices(struct dm_target *ti,
+ iterate_devices_callout_fn fn, void *data)
+{
+ struct dm_origin *o = ti->private;
+
+ return fn(ti, o->dev, 0, ti->len, data);
+}
+
+static struct target_type origin_target = {
+ .name = "snapshot-origin",
+ .version = {1, 9, 0},
+ .module = THIS_MODULE,
+ .ctr = origin_ctr,
+ .dtr = origin_dtr,
+ .map = origin_map,
+ .resume = origin_resume,
+ .postsuspend = origin_postsuspend,
+ .status = origin_status,
+ .merge = origin_merge,
+ .iterate_devices = origin_iterate_devices,
+};
+
+static struct target_type snapshot_target = {
+ .name = "snapshot",
+ .version = {1, 13, 0},
+ .module = THIS_MODULE,
+ .ctr = snapshot_ctr,
+ .dtr = snapshot_dtr,
+ .map = snapshot_map,
+ .end_io = snapshot_end_io,
+ .preresume = snapshot_preresume,
+ .resume = snapshot_resume,
+ .status = snapshot_status,
+ .iterate_devices = snapshot_iterate_devices,
+};
+
+static struct target_type merge_target = {
+ .name = dm_snapshot_merge_target_name,
+ .version = {1, 3, 0},
+ .module = THIS_MODULE,
+ .ctr = snapshot_ctr,
+ .dtr = snapshot_dtr,
+ .map = snapshot_merge_map,
+ .end_io = snapshot_end_io,
+ .presuspend = snapshot_merge_presuspend,
+ .preresume = snapshot_preresume,
+ .resume = snapshot_merge_resume,
+ .status = snapshot_status,
+ .iterate_devices = snapshot_iterate_devices,
+};
+
+static int __init dm_snapshot_init(void)
+{
+ int r;
+
+ r = dm_exception_store_init();
+ if (r) {
+ DMERR("Failed to initialize exception stores");
+ return r;
+ }
+
+ r = dm_register_target(&snapshot_target);
+ if (r < 0) {
+ DMERR("snapshot target register failed %d", r);
+ goto bad_register_snapshot_target;
+ }
+
+ r = dm_register_target(&origin_target);
+ if (r < 0) {
+ DMERR("Origin target register failed %d", r);
+ goto bad_register_origin_target;
+ }
+
+ r = dm_register_target(&merge_target);
+ if (r < 0) {
+ DMERR("Merge target register failed %d", r);
+ goto bad_register_merge_target;
+ }
+
+ r = init_origin_hash();
+ if (r) {
+ DMERR("init_origin_hash failed.");
+ goto bad_origin_hash;
+ }
+
+ exception_cache = KMEM_CACHE(dm_exception, 0);
+ if (!exception_cache) {
+ DMERR("Couldn't create exception cache.");
+ r = -ENOMEM;
+ goto bad_exception_cache;
+ }
+
+ pending_cache = KMEM_CACHE(dm_snap_pending_exception, 0);
+ if (!pending_cache) {
+ DMERR("Couldn't create pending cache.");
+ r = -ENOMEM;
+ goto bad_pending_cache;
+ }
+
+ return 0;
+
+bad_pending_cache:
+ kmem_cache_destroy(exception_cache);
+bad_exception_cache:
+ exit_origin_hash();
+bad_origin_hash:
+ dm_unregister_target(&merge_target);
+bad_register_merge_target:
+ dm_unregister_target(&origin_target);
+bad_register_origin_target:
+ dm_unregister_target(&snapshot_target);
+bad_register_snapshot_target:
+ dm_exception_store_exit();
+
+ return r;
+}
+
+static void __exit dm_snapshot_exit(void)
+{
+ dm_unregister_target(&snapshot_target);
+ dm_unregister_target(&origin_target);
+ dm_unregister_target(&merge_target);
+
+ exit_origin_hash();
+ kmem_cache_destroy(pending_cache);
+ kmem_cache_destroy(exception_cache);
+
+ dm_exception_store_exit();
+}
+
+/* Module hooks */
+module_init(dm_snapshot_init);
+module_exit(dm_snapshot_exit);
+
+MODULE_DESCRIPTION(DM_NAME " snapshot target");
+MODULE_AUTHOR("Joe Thornber");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("dm-snapshot-origin");
+MODULE_ALIAS("dm-snapshot-merge");
diff --git a/drivers/md/dm-stats.c b/drivers/md/dm-stats.c
new file mode 100644
index 000000000..419bdd4fc
--- /dev/null
+++ b/drivers/md/dm-stats.c
@@ -0,0 +1,983 @@
+#include <linux/errno.h>
+#include <linux/numa.h>
+#include <linux/slab.h>
+#include <linux/rculist.h>
+#include <linux/threads.h>
+#include <linux/preempt.h>
+#include <linux/irqflags.h>
+#include <linux/vmalloc.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/device-mapper.h>
+
+#include "dm.h"
+#include "dm-stats.h"
+
+#define DM_MSG_PREFIX "stats"
+
+static int dm_stat_need_rcu_barrier;
+
+/*
+ * Using 64-bit values to avoid overflow (which is a
+ * problem that block/genhd.c's IO accounting has).
+ */
+struct dm_stat_percpu {
+ unsigned long long sectors[2];
+ unsigned long long ios[2];
+ unsigned long long merges[2];
+ unsigned long long ticks[2];
+ unsigned long long io_ticks[2];
+ unsigned long long io_ticks_total;
+ unsigned long long time_in_queue;
+};
+
+struct dm_stat_shared {
+ atomic_t in_flight[2];
+ unsigned long stamp;
+ struct dm_stat_percpu tmp;
+};
+
+struct dm_stat {
+ struct list_head list_entry;
+ int id;
+ size_t n_entries;
+ sector_t start;
+ sector_t end;
+ sector_t step;
+ const char *program_id;
+ const char *aux_data;
+ struct rcu_head rcu_head;
+ size_t shared_alloc_size;
+ size_t percpu_alloc_size;
+ struct dm_stat_percpu *stat_percpu[NR_CPUS];
+ struct dm_stat_shared stat_shared[0];
+};
+
+struct dm_stats_last_position {
+ sector_t last_sector;
+ unsigned last_rw;
+};
+
+/*
+ * A typo on the command line could possibly make the kernel run out of memory
+ * and crash. To prevent the crash we account all used memory. We fail if we
+ * exhaust 1/4 of all memory or 1/2 of vmalloc space.
+ */
+#define DM_STATS_MEMORY_FACTOR 4
+#define DM_STATS_VMALLOC_FACTOR 2
+
+static DEFINE_SPINLOCK(shared_memory_lock);
+
+static unsigned long shared_memory_amount;
+
+static bool __check_shared_memory(size_t alloc_size)
+{
+ size_t a;
+
+ a = shared_memory_amount + alloc_size;
+ if (a < shared_memory_amount)
+ return false;
+ if (a >> PAGE_SHIFT > totalram_pages / DM_STATS_MEMORY_FACTOR)
+ return false;
+#ifdef CONFIG_MMU
+ if (a > (VMALLOC_END - VMALLOC_START) / DM_STATS_VMALLOC_FACTOR)
+ return false;
+#endif
+ return true;
+}
+
+static bool check_shared_memory(size_t alloc_size)
+{
+ bool ret;
+
+ spin_lock_irq(&shared_memory_lock);
+
+ ret = __check_shared_memory(alloc_size);
+
+ spin_unlock_irq(&shared_memory_lock);
+
+ return ret;
+}
+
+static bool claim_shared_memory(size_t alloc_size)
+{
+ spin_lock_irq(&shared_memory_lock);
+
+ if (!__check_shared_memory(alloc_size)) {
+ spin_unlock_irq(&shared_memory_lock);
+ return false;
+ }
+
+ shared_memory_amount += alloc_size;
+
+ spin_unlock_irq(&shared_memory_lock);
+
+ return true;
+}
+
+static void free_shared_memory(size_t alloc_size)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&shared_memory_lock, flags);
+
+ if (WARN_ON_ONCE(shared_memory_amount < alloc_size)) {
+ spin_unlock_irqrestore(&shared_memory_lock, flags);
+ DMCRIT("Memory usage accounting bug.");
+ return;
+ }
+
+ shared_memory_amount -= alloc_size;
+
+ spin_unlock_irqrestore(&shared_memory_lock, flags);
+}
+
+static void *dm_kvzalloc(size_t alloc_size, int node)
+{
+ void *p;
+
+ if (!claim_shared_memory(alloc_size))
+ return NULL;
+
+ if (alloc_size <= KMALLOC_MAX_SIZE) {
+ p = kzalloc_node(alloc_size, GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN, node);
+ if (p)
+ return p;
+ }
+ p = vzalloc_node(alloc_size, node);
+ if (p)
+ return p;
+
+ free_shared_memory(alloc_size);
+
+ return NULL;
+}
+
+static void dm_kvfree(void *ptr, size_t alloc_size)
+{
+ if (!ptr)
+ return;
+
+ free_shared_memory(alloc_size);
+
+ if (is_vmalloc_addr(ptr))
+ vfree(ptr);
+ else
+ kfree(ptr);
+}
+
+static void dm_stat_free(struct rcu_head *head)
+{
+ int cpu;
+ struct dm_stat *s = container_of(head, struct dm_stat, rcu_head);
+
+ kfree(s->program_id);
+ kfree(s->aux_data);
+ for_each_possible_cpu(cpu)
+ dm_kvfree(s->stat_percpu[cpu], s->percpu_alloc_size);
+ dm_kvfree(s, s->shared_alloc_size);
+}
+
+static int dm_stat_in_flight(struct dm_stat_shared *shared)
+{
+ return atomic_read(&shared->in_flight[READ]) +
+ atomic_read(&shared->in_flight[WRITE]);
+}
+
+void dm_stats_init(struct dm_stats *stats)
+{
+ int cpu;
+ struct dm_stats_last_position *last;
+
+ mutex_init(&stats->mutex);
+ INIT_LIST_HEAD(&stats->list);
+ stats->last = alloc_percpu(struct dm_stats_last_position);
+ for_each_possible_cpu(cpu) {
+ last = per_cpu_ptr(stats->last, cpu);
+ last->last_sector = (sector_t)ULLONG_MAX;
+ last->last_rw = UINT_MAX;
+ }
+}
+
+void dm_stats_cleanup(struct dm_stats *stats)
+{
+ size_t ni;
+ struct dm_stat *s;
+ struct dm_stat_shared *shared;
+
+ while (!list_empty(&stats->list)) {
+ s = container_of(stats->list.next, struct dm_stat, list_entry);
+ list_del(&s->list_entry);
+ for (ni = 0; ni < s->n_entries; ni++) {
+ shared = &s->stat_shared[ni];
+ if (WARN_ON(dm_stat_in_flight(shared))) {
+ DMCRIT("leaked in-flight counter at index %lu "
+ "(start %llu, end %llu, step %llu): reads %d, writes %d",
+ (unsigned long)ni,
+ (unsigned long long)s->start,
+ (unsigned long long)s->end,
+ (unsigned long long)s->step,
+ atomic_read(&shared->in_flight[READ]),
+ atomic_read(&shared->in_flight[WRITE]));
+ }
+ }
+ dm_stat_free(&s->rcu_head);
+ }
+ free_percpu(stats->last);
+}
+
+static int dm_stats_create(struct dm_stats *stats, sector_t start, sector_t end,
+ sector_t step, const char *program_id, const char *aux_data,
+ void (*suspend_callback)(struct mapped_device *),
+ void (*resume_callback)(struct mapped_device *),
+ struct mapped_device *md)
+{
+ struct list_head *l;
+ struct dm_stat *s, *tmp_s;
+ sector_t n_entries;
+ size_t ni;
+ size_t shared_alloc_size;
+ size_t percpu_alloc_size;
+ struct dm_stat_percpu *p;
+ int cpu;
+ int ret_id;
+ int r;
+
+ if (end < start || !step)
+ return -EINVAL;
+
+ n_entries = end - start;
+ if (dm_sector_div64(n_entries, step))
+ n_entries++;
+
+ if (n_entries != (size_t)n_entries || !(size_t)(n_entries + 1))
+ return -EOVERFLOW;
+
+ shared_alloc_size = sizeof(struct dm_stat) + (size_t)n_entries * sizeof(struct dm_stat_shared);
+ if ((shared_alloc_size - sizeof(struct dm_stat)) / sizeof(struct dm_stat_shared) != n_entries)
+ return -EOVERFLOW;
+
+ percpu_alloc_size = (size_t)n_entries * sizeof(struct dm_stat_percpu);
+ if (percpu_alloc_size / sizeof(struct dm_stat_percpu) != n_entries)
+ return -EOVERFLOW;
+
+ if (!check_shared_memory(shared_alloc_size + num_possible_cpus() * percpu_alloc_size))
+ return -ENOMEM;
+
+ s = dm_kvzalloc(shared_alloc_size, NUMA_NO_NODE);
+ if (!s)
+ return -ENOMEM;
+
+ s->n_entries = n_entries;
+ s->start = start;
+ s->end = end;
+ s->step = step;
+ s->shared_alloc_size = shared_alloc_size;
+ s->percpu_alloc_size = percpu_alloc_size;
+
+ s->program_id = kstrdup(program_id, GFP_KERNEL);
+ if (!s->program_id) {
+ r = -ENOMEM;
+ goto out;
+ }
+ s->aux_data = kstrdup(aux_data, GFP_KERNEL);
+ if (!s->aux_data) {
+ r = -ENOMEM;
+ goto out;
+ }
+
+ for (ni = 0; ni < n_entries; ni++) {
+ atomic_set(&s->stat_shared[ni].in_flight[READ], 0);
+ atomic_set(&s->stat_shared[ni].in_flight[WRITE], 0);
+ }
+
+ for_each_possible_cpu(cpu) {
+ p = dm_kvzalloc(percpu_alloc_size, cpu_to_node(cpu));
+ if (!p) {
+ r = -ENOMEM;
+ goto out;
+ }
+ s->stat_percpu[cpu] = p;
+ }
+
+ /*
+ * Suspend/resume to make sure there is no i/o in flight,
+ * so that newly created statistics will be exact.
+ *
+ * (note: we couldn't suspend earlier because we must not
+ * allocate memory while suspended)
+ */
+ suspend_callback(md);
+
+ mutex_lock(&stats->mutex);
+ s->id = 0;
+ list_for_each(l, &stats->list) {
+ tmp_s = container_of(l, struct dm_stat, list_entry);
+ if (WARN_ON(tmp_s->id < s->id)) {
+ r = -EINVAL;
+ goto out_unlock_resume;
+ }
+ if (tmp_s->id > s->id)
+ break;
+ if (unlikely(s->id == INT_MAX)) {
+ r = -ENFILE;
+ goto out_unlock_resume;
+ }
+ s->id++;
+ }
+ ret_id = s->id;
+ list_add_tail_rcu(&s->list_entry, l);
+ mutex_unlock(&stats->mutex);
+
+ resume_callback(md);
+
+ return ret_id;
+
+out_unlock_resume:
+ mutex_unlock(&stats->mutex);
+ resume_callback(md);
+out:
+ dm_stat_free(&s->rcu_head);
+ return r;
+}
+
+static struct dm_stat *__dm_stats_find(struct dm_stats *stats, int id)
+{
+ struct dm_stat *s;
+
+ list_for_each_entry(s, &stats->list, list_entry) {
+ if (s->id > id)
+ break;
+ if (s->id == id)
+ return s;
+ }
+
+ return NULL;
+}
+
+static int dm_stats_delete(struct dm_stats *stats, int id)
+{
+ struct dm_stat *s;
+ int cpu;
+
+ mutex_lock(&stats->mutex);
+
+ s = __dm_stats_find(stats, id);
+ if (!s) {
+ mutex_unlock(&stats->mutex);
+ return -ENOENT;
+ }
+
+ list_del_rcu(&s->list_entry);
+ mutex_unlock(&stats->mutex);
+
+ /*
+ * vfree can't be called from RCU callback
+ */
+ for_each_possible_cpu(cpu)
+ if (is_vmalloc_addr(s->stat_percpu))
+ goto do_sync_free;
+ if (is_vmalloc_addr(s)) {
+do_sync_free:
+ synchronize_rcu_expedited();
+ dm_stat_free(&s->rcu_head);
+ } else {
+ ACCESS_ONCE(dm_stat_need_rcu_barrier) = 1;
+ call_rcu(&s->rcu_head, dm_stat_free);
+ }
+ return 0;
+}
+
+static int dm_stats_list(struct dm_stats *stats, const char *program,
+ char *result, unsigned maxlen)
+{
+ struct dm_stat *s;
+ sector_t len;
+ unsigned sz = 0;
+
+ /*
+ * Output format:
+ * <region_id>: <start_sector>+<length> <step> <program_id> <aux_data>
+ */
+
+ mutex_lock(&stats->mutex);
+ list_for_each_entry(s, &stats->list, list_entry) {
+ if (!program || !strcmp(program, s->program_id)) {
+ len = s->end - s->start;
+ DMEMIT("%d: %llu+%llu %llu %s %s\n", s->id,
+ (unsigned long long)s->start,
+ (unsigned long long)len,
+ (unsigned long long)s->step,
+ s->program_id,
+ s->aux_data);
+ }
+ }
+ mutex_unlock(&stats->mutex);
+
+ return 1;
+}
+
+static void dm_stat_round(struct dm_stat_shared *shared, struct dm_stat_percpu *p)
+{
+ /*
+ * This is racy, but so is part_round_stats_single.
+ */
+ unsigned long now = jiffies;
+ unsigned in_flight_read;
+ unsigned in_flight_write;
+ unsigned long difference = now - shared->stamp;
+
+ if (!difference)
+ return;
+ in_flight_read = (unsigned)atomic_read(&shared->in_flight[READ]);
+ in_flight_write = (unsigned)atomic_read(&shared->in_flight[WRITE]);
+ if (in_flight_read)
+ p->io_ticks[READ] += difference;
+ if (in_flight_write)
+ p->io_ticks[WRITE] += difference;
+ if (in_flight_read + in_flight_write) {
+ p->io_ticks_total += difference;
+ p->time_in_queue += (in_flight_read + in_flight_write) * difference;
+ }
+ shared->stamp = now;
+}
+
+static void dm_stat_for_entry(struct dm_stat *s, size_t entry,
+ unsigned long bi_rw, sector_t len, bool merged,
+ bool end, unsigned long duration)
+{
+ unsigned long idx = bi_rw & REQ_WRITE;
+ struct dm_stat_shared *shared = &s->stat_shared[entry];
+ struct dm_stat_percpu *p;
+
+ /*
+ * For strict correctness we should use local_irq_save/restore
+ * instead of preempt_disable/enable.
+ *
+ * preempt_disable/enable is racy if the driver finishes bios
+ * from non-interrupt context as well as from interrupt context
+ * or from more different interrupts.
+ *
+ * On 64-bit architectures the race only results in not counting some
+ * events, so it is acceptable. On 32-bit architectures the race could
+ * cause the counter going off by 2^32, so we need to do proper locking
+ * there.
+ *
+ * part_stat_lock()/part_stat_unlock() have this race too.
+ */
+#if BITS_PER_LONG == 32
+ unsigned long flags;
+ local_irq_save(flags);
+#else
+ preempt_disable();
+#endif
+ p = &s->stat_percpu[smp_processor_id()][entry];
+
+ if (!end) {
+ dm_stat_round(shared, p);
+ atomic_inc(&shared->in_flight[idx]);
+ } else {
+ dm_stat_round(shared, p);
+ atomic_dec(&shared->in_flight[idx]);
+ p->sectors[idx] += len;
+ p->ios[idx] += 1;
+ p->merges[idx] += merged;
+ p->ticks[idx] += duration;
+ }
+
+#if BITS_PER_LONG == 32
+ local_irq_restore(flags);
+#else
+ preempt_enable();
+#endif
+}
+
+static void __dm_stat_bio(struct dm_stat *s, unsigned long bi_rw,
+ sector_t bi_sector, sector_t end_sector,
+ bool end, unsigned long duration,
+ struct dm_stats_aux *stats_aux)
+{
+ sector_t rel_sector, offset, todo, fragment_len;
+ size_t entry;
+
+ if (end_sector <= s->start || bi_sector >= s->end)
+ return;
+ if (unlikely(bi_sector < s->start)) {
+ rel_sector = 0;
+ todo = end_sector - s->start;
+ } else {
+ rel_sector = bi_sector - s->start;
+ todo = end_sector - bi_sector;
+ }
+ if (unlikely(end_sector > s->end))
+ todo -= (end_sector - s->end);
+
+ offset = dm_sector_div64(rel_sector, s->step);
+ entry = rel_sector;
+ do {
+ if (WARN_ON_ONCE(entry >= s->n_entries)) {
+ DMCRIT("Invalid area access in region id %d", s->id);
+ return;
+ }
+ fragment_len = todo;
+ if (fragment_len > s->step - offset)
+ fragment_len = s->step - offset;
+ dm_stat_for_entry(s, entry, bi_rw, fragment_len,
+ stats_aux->merged, end, duration);
+ todo -= fragment_len;
+ entry++;
+ offset = 0;
+ } while (unlikely(todo != 0));
+}
+
+void dm_stats_account_io(struct dm_stats *stats, unsigned long bi_rw,
+ sector_t bi_sector, unsigned bi_sectors, bool end,
+ unsigned long duration, struct dm_stats_aux *stats_aux)
+{
+ struct dm_stat *s;
+ sector_t end_sector;
+ struct dm_stats_last_position *last;
+
+ if (unlikely(!bi_sectors))
+ return;
+
+ end_sector = bi_sector + bi_sectors;
+
+ if (!end) {
+ /*
+ * A race condition can at worst result in the merged flag being
+ * misrepresented, so we don't have to disable preemption here.
+ */
+ last = raw_cpu_ptr(stats->last);
+ stats_aux->merged =
+ (bi_sector == (ACCESS_ONCE(last->last_sector) &&
+ ((bi_rw & (REQ_WRITE | REQ_DISCARD)) ==
+ (ACCESS_ONCE(last->last_rw) & (REQ_WRITE | REQ_DISCARD)))
+ ));
+ ACCESS_ONCE(last->last_sector) = end_sector;
+ ACCESS_ONCE(last->last_rw) = bi_rw;
+ }
+
+ rcu_read_lock();
+
+ list_for_each_entry_rcu(s, &stats->list, list_entry)
+ __dm_stat_bio(s, bi_rw, bi_sector, end_sector, end, duration, stats_aux);
+
+ rcu_read_unlock();
+}
+
+static void __dm_stat_init_temporary_percpu_totals(struct dm_stat_shared *shared,
+ struct dm_stat *s, size_t x)
+{
+ int cpu;
+ struct dm_stat_percpu *p;
+
+ local_irq_disable();
+ p = &s->stat_percpu[smp_processor_id()][x];
+ dm_stat_round(shared, p);
+ local_irq_enable();
+
+ memset(&shared->tmp, 0, sizeof(shared->tmp));
+ for_each_possible_cpu(cpu) {
+ p = &s->stat_percpu[cpu][x];
+ shared->tmp.sectors[READ] += ACCESS_ONCE(p->sectors[READ]);
+ shared->tmp.sectors[WRITE] += ACCESS_ONCE(p->sectors[WRITE]);
+ shared->tmp.ios[READ] += ACCESS_ONCE(p->ios[READ]);
+ shared->tmp.ios[WRITE] += ACCESS_ONCE(p->ios[WRITE]);
+ shared->tmp.merges[READ] += ACCESS_ONCE(p->merges[READ]);
+ shared->tmp.merges[WRITE] += ACCESS_ONCE(p->merges[WRITE]);
+ shared->tmp.ticks[READ] += ACCESS_ONCE(p->ticks[READ]);
+ shared->tmp.ticks[WRITE] += ACCESS_ONCE(p->ticks[WRITE]);
+ shared->tmp.io_ticks[READ] += ACCESS_ONCE(p->io_ticks[READ]);
+ shared->tmp.io_ticks[WRITE] += ACCESS_ONCE(p->io_ticks[WRITE]);
+ shared->tmp.io_ticks_total += ACCESS_ONCE(p->io_ticks_total);
+ shared->tmp.time_in_queue += ACCESS_ONCE(p->time_in_queue);
+ }
+}
+
+static void __dm_stat_clear(struct dm_stat *s, size_t idx_start, size_t idx_end,
+ bool init_tmp_percpu_totals)
+{
+ size_t x;
+ struct dm_stat_shared *shared;
+ struct dm_stat_percpu *p;
+
+ for (x = idx_start; x < idx_end; x++) {
+ shared = &s->stat_shared[x];
+ if (init_tmp_percpu_totals)
+ __dm_stat_init_temporary_percpu_totals(shared, s, x);
+ local_irq_disable();
+ p = &s->stat_percpu[smp_processor_id()][x];
+ p->sectors[READ] -= shared->tmp.sectors[READ];
+ p->sectors[WRITE] -= shared->tmp.sectors[WRITE];
+ p->ios[READ] -= shared->tmp.ios[READ];
+ p->ios[WRITE] -= shared->tmp.ios[WRITE];
+ p->merges[READ] -= shared->tmp.merges[READ];
+ p->merges[WRITE] -= shared->tmp.merges[WRITE];
+ p->ticks[READ] -= shared->tmp.ticks[READ];
+ p->ticks[WRITE] -= shared->tmp.ticks[WRITE];
+ p->io_ticks[READ] -= shared->tmp.io_ticks[READ];
+ p->io_ticks[WRITE] -= shared->tmp.io_ticks[WRITE];
+ p->io_ticks_total -= shared->tmp.io_ticks_total;
+ p->time_in_queue -= shared->tmp.time_in_queue;
+ local_irq_enable();
+ }
+}
+
+static int dm_stats_clear(struct dm_stats *stats, int id)
+{
+ struct dm_stat *s;
+
+ mutex_lock(&stats->mutex);
+
+ s = __dm_stats_find(stats, id);
+ if (!s) {
+ mutex_unlock(&stats->mutex);
+ return -ENOENT;
+ }
+
+ __dm_stat_clear(s, 0, s->n_entries, true);
+
+ mutex_unlock(&stats->mutex);
+
+ return 1;
+}
+
+/*
+ * This is like jiffies_to_msec, but works for 64-bit values.
+ */
+static unsigned long long dm_jiffies_to_msec64(unsigned long long j)
+{
+ unsigned long long result = 0;
+ unsigned mult;
+
+ if (j)
+ result = jiffies_to_msecs(j & 0x3fffff);
+ if (j >= 1 << 22) {
+ mult = jiffies_to_msecs(1 << 22);
+ result += (unsigned long long)mult * (unsigned long long)jiffies_to_msecs((j >> 22) & 0x3fffff);
+ }
+ if (j >= 1ULL << 44)
+ result += (unsigned long long)mult * (unsigned long long)mult * (unsigned long long)jiffies_to_msecs(j >> 44);
+
+ return result;
+}
+
+static int dm_stats_print(struct dm_stats *stats, int id,
+ size_t idx_start, size_t idx_len,
+ bool clear, char *result, unsigned maxlen)
+{
+ unsigned sz = 0;
+ struct dm_stat *s;
+ size_t x;
+ sector_t start, end, step;
+ size_t idx_end;
+ struct dm_stat_shared *shared;
+
+ /*
+ * Output format:
+ * <start_sector>+<length> counters
+ */
+
+ mutex_lock(&stats->mutex);
+
+ s = __dm_stats_find(stats, id);
+ if (!s) {
+ mutex_unlock(&stats->mutex);
+ return -ENOENT;
+ }
+
+ idx_end = idx_start + idx_len;
+ if (idx_end < idx_start ||
+ idx_end > s->n_entries)
+ idx_end = s->n_entries;
+
+ if (idx_start > idx_end)
+ idx_start = idx_end;
+
+ step = s->step;
+ start = s->start + (step * idx_start);
+
+ for (x = idx_start; x < idx_end; x++, start = end) {
+ shared = &s->stat_shared[x];
+ end = start + step;
+ if (unlikely(end > s->end))
+ end = s->end;
+
+ __dm_stat_init_temporary_percpu_totals(shared, s, x);
+
+ DMEMIT("%llu+%llu %llu %llu %llu %llu %llu %llu %llu %llu %d %llu %llu %llu %llu\n",
+ (unsigned long long)start,
+ (unsigned long long)step,
+ shared->tmp.ios[READ],
+ shared->tmp.merges[READ],
+ shared->tmp.sectors[READ],
+ dm_jiffies_to_msec64(shared->tmp.ticks[READ]),
+ shared->tmp.ios[WRITE],
+ shared->tmp.merges[WRITE],
+ shared->tmp.sectors[WRITE],
+ dm_jiffies_to_msec64(shared->tmp.ticks[WRITE]),
+ dm_stat_in_flight(shared),
+ dm_jiffies_to_msec64(shared->tmp.io_ticks_total),
+ dm_jiffies_to_msec64(shared->tmp.time_in_queue),
+ dm_jiffies_to_msec64(shared->tmp.io_ticks[READ]),
+ dm_jiffies_to_msec64(shared->tmp.io_ticks[WRITE]));
+
+ if (unlikely(sz + 1 >= maxlen))
+ goto buffer_overflow;
+ }
+
+ if (clear)
+ __dm_stat_clear(s, idx_start, idx_end, false);
+
+buffer_overflow:
+ mutex_unlock(&stats->mutex);
+
+ return 1;
+}
+
+static int dm_stats_set_aux(struct dm_stats *stats, int id, const char *aux_data)
+{
+ struct dm_stat *s;
+ const char *new_aux_data;
+
+ mutex_lock(&stats->mutex);
+
+ s = __dm_stats_find(stats, id);
+ if (!s) {
+ mutex_unlock(&stats->mutex);
+ return -ENOENT;
+ }
+
+ new_aux_data = kstrdup(aux_data, GFP_KERNEL);
+ if (!new_aux_data) {
+ mutex_unlock(&stats->mutex);
+ return -ENOMEM;
+ }
+
+ kfree(s->aux_data);
+ s->aux_data = new_aux_data;
+
+ mutex_unlock(&stats->mutex);
+
+ return 0;
+}
+
+static int message_stats_create(struct mapped_device *md,
+ unsigned argc, char **argv,
+ char *result, unsigned maxlen)
+{
+ int id;
+ char dummy;
+ unsigned long long start, end, len, step;
+ unsigned divisor;
+ const char *program_id, *aux_data;
+
+ /*
+ * Input format:
+ * <range> <step> [<program_id> [<aux_data>]]
+ */
+
+ if (argc < 3 || argc > 5)
+ return -EINVAL;
+
+ if (!strcmp(argv[1], "-")) {
+ start = 0;
+ len = dm_get_size(md);
+ if (!len)
+ len = 1;
+ } else if (sscanf(argv[1], "%llu+%llu%c", &start, &len, &dummy) != 2 ||
+ start != (sector_t)start || len != (sector_t)len)
+ return -EINVAL;
+
+ end = start + len;
+ if (start >= end)
+ return -EINVAL;
+
+ if (sscanf(argv[2], "/%u%c", &divisor, &dummy) == 1) {
+ if (!divisor)
+ return -EINVAL;
+ step = end - start;
+ if (do_div(step, divisor))
+ step++;
+ if (!step)
+ step = 1;
+ } else if (sscanf(argv[2], "%llu%c", &step, &dummy) != 1 ||
+ step != (sector_t)step || !step)
+ return -EINVAL;
+
+ program_id = "-";
+ aux_data = "-";
+
+ if (argc > 3)
+ program_id = argv[3];
+
+ if (argc > 4)
+ aux_data = argv[4];
+
+ /*
+ * If a buffer overflow happens after we created the region,
+ * it's too late (the userspace would retry with a larger
+ * buffer, but the region id that caused the overflow is already
+ * leaked). So we must detect buffer overflow in advance.
+ */
+ snprintf(result, maxlen, "%d", INT_MAX);
+ if (dm_message_test_buffer_overflow(result, maxlen))
+ return 1;
+
+ id = dm_stats_create(dm_get_stats(md), start, end, step, program_id, aux_data,
+ dm_internal_suspend_fast, dm_internal_resume_fast, md);
+ if (id < 0)
+ return id;
+
+ snprintf(result, maxlen, "%d", id);
+
+ return 1;
+}
+
+static int message_stats_delete(struct mapped_device *md,
+ unsigned argc, char **argv)
+{
+ int id;
+ char dummy;
+
+ if (argc != 2)
+ return -EINVAL;
+
+ if (sscanf(argv[1], "%d%c", &id, &dummy) != 1 || id < 0)
+ return -EINVAL;
+
+ return dm_stats_delete(dm_get_stats(md), id);
+}
+
+static int message_stats_clear(struct mapped_device *md,
+ unsigned argc, char **argv)
+{
+ int id;
+ char dummy;
+
+ if (argc != 2)
+ return -EINVAL;
+
+ if (sscanf(argv[1], "%d%c", &id, &dummy) != 1 || id < 0)
+ return -EINVAL;
+
+ return dm_stats_clear(dm_get_stats(md), id);
+}
+
+static int message_stats_list(struct mapped_device *md,
+ unsigned argc, char **argv,
+ char *result, unsigned maxlen)
+{
+ int r;
+ const char *program = NULL;
+
+ if (argc < 1 || argc > 2)
+ return -EINVAL;
+
+ if (argc > 1) {
+ program = kstrdup(argv[1], GFP_KERNEL);
+ if (!program)
+ return -ENOMEM;
+ }
+
+ r = dm_stats_list(dm_get_stats(md), program, result, maxlen);
+
+ kfree(program);
+
+ return r;
+}
+
+static int message_stats_print(struct mapped_device *md,
+ unsigned argc, char **argv, bool clear,
+ char *result, unsigned maxlen)
+{
+ int id;
+ char dummy;
+ unsigned long idx_start = 0, idx_len = ULONG_MAX;
+
+ if (argc != 2 && argc != 4)
+ return -EINVAL;
+
+ if (sscanf(argv[1], "%d%c", &id, &dummy) != 1 || id < 0)
+ return -EINVAL;
+
+ if (argc > 3) {
+ if (strcmp(argv[2], "-") &&
+ sscanf(argv[2], "%lu%c", &idx_start, &dummy) != 1)
+ return -EINVAL;
+ if (strcmp(argv[3], "-") &&
+ sscanf(argv[3], "%lu%c", &idx_len, &dummy) != 1)
+ return -EINVAL;
+ }
+
+ return dm_stats_print(dm_get_stats(md), id, idx_start, idx_len, clear,
+ result, maxlen);
+}
+
+static int message_stats_set_aux(struct mapped_device *md,
+ unsigned argc, char **argv)
+{
+ int id;
+ char dummy;
+
+ if (argc != 3)
+ return -EINVAL;
+
+ if (sscanf(argv[1], "%d%c", &id, &dummy) != 1 || id < 0)
+ return -EINVAL;
+
+ return dm_stats_set_aux(dm_get_stats(md), id, argv[2]);
+}
+
+int dm_stats_message(struct mapped_device *md, unsigned argc, char **argv,
+ char *result, unsigned maxlen)
+{
+ int r;
+
+ if (dm_request_based(md)) {
+ DMWARN("Statistics are only supported for bio-based devices");
+ return -EOPNOTSUPP;
+ }
+
+ /* All messages here must start with '@' */
+ if (!strcasecmp(argv[0], "@stats_create"))
+ r = message_stats_create(md, argc, argv, result, maxlen);
+ else if (!strcasecmp(argv[0], "@stats_delete"))
+ r = message_stats_delete(md, argc, argv);
+ else if (!strcasecmp(argv[0], "@stats_clear"))
+ r = message_stats_clear(md, argc, argv);
+ else if (!strcasecmp(argv[0], "@stats_list"))
+ r = message_stats_list(md, argc, argv, result, maxlen);
+ else if (!strcasecmp(argv[0], "@stats_print"))
+ r = message_stats_print(md, argc, argv, false, result, maxlen);
+ else if (!strcasecmp(argv[0], "@stats_print_clear"))
+ r = message_stats_print(md, argc, argv, true, result, maxlen);
+ else if (!strcasecmp(argv[0], "@stats_set_aux"))
+ r = message_stats_set_aux(md, argc, argv);
+ else
+ return 2; /* this wasn't a stats message */
+
+ if (r == -EINVAL)
+ DMWARN("Invalid parameters for message %s", argv[0]);
+
+ return r;
+}
+
+int __init dm_statistics_init(void)
+{
+ shared_memory_amount = 0;
+ dm_stat_need_rcu_barrier = 0;
+ return 0;
+}
+
+void dm_statistics_exit(void)
+{
+ if (dm_stat_need_rcu_barrier)
+ rcu_barrier();
+ if (WARN_ON(shared_memory_amount))
+ DMCRIT("shared_memory_amount leaked: %lu", shared_memory_amount);
+}
+
+module_param_named(stats_current_allocated_bytes, shared_memory_amount, ulong, S_IRUGO);
+MODULE_PARM_DESC(stats_current_allocated_bytes, "Memory currently used by statistics");
diff --git a/drivers/md/dm-stats.h b/drivers/md/dm-stats.h
new file mode 100644
index 000000000..e7c4984bf
--- /dev/null
+++ b/drivers/md/dm-stats.h
@@ -0,0 +1,40 @@
+#ifndef DM_STATS_H
+#define DM_STATS_H
+
+#include <linux/types.h>
+#include <linux/mutex.h>
+#include <linux/list.h>
+
+int dm_statistics_init(void);
+void dm_statistics_exit(void);
+
+struct dm_stats {
+ struct mutex mutex;
+ struct list_head list; /* list of struct dm_stat */
+ struct dm_stats_last_position __percpu *last;
+ sector_t last_sector;
+ unsigned last_rw;
+};
+
+struct dm_stats_aux {
+ bool merged;
+};
+
+void dm_stats_init(struct dm_stats *st);
+void dm_stats_cleanup(struct dm_stats *st);
+
+struct mapped_device;
+
+int dm_stats_message(struct mapped_device *md, unsigned argc, char **argv,
+ char *result, unsigned maxlen);
+
+void dm_stats_account_io(struct dm_stats *stats, unsigned long bi_rw,
+ sector_t bi_sector, unsigned bi_sectors, bool end,
+ unsigned long duration, struct dm_stats_aux *aux);
+
+static inline bool dm_stats_used(struct dm_stats *st)
+{
+ return !list_empty(&st->list);
+}
+
+#endif
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
new file mode 100644
index 000000000..f8b37d4c0
--- /dev/null
+++ b/drivers/md/dm-stripe.c
@@ -0,0 +1,465 @@
+/*
+ * Copyright (C) 2001-2003 Sistina Software (UK) Limited.
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm.h"
+#include <linux/device-mapper.h>
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/blkdev.h>
+#include <linux/bio.h>
+#include <linux/slab.h>
+#include <linux/log2.h>
+
+#define DM_MSG_PREFIX "striped"
+#define DM_IO_ERROR_THRESHOLD 15
+
+struct stripe {
+ struct dm_dev *dev;
+ sector_t physical_start;
+
+ atomic_t error_count;
+};
+
+struct stripe_c {
+ uint32_t stripes;
+ int stripes_shift;
+
+ /* The size of this target / num. stripes */
+ sector_t stripe_width;
+
+ uint32_t chunk_size;
+ int chunk_size_shift;
+
+ /* Needed for handling events */
+ struct dm_target *ti;
+
+ /* Work struct used for triggering events*/
+ struct work_struct trigger_event;
+
+ struct stripe stripe[0];
+};
+
+/*
+ * An event is triggered whenever a drive
+ * drops out of a stripe volume.
+ */
+static void trigger_event(struct work_struct *work)
+{
+ struct stripe_c *sc = container_of(work, struct stripe_c,
+ trigger_event);
+ dm_table_event(sc->ti->table);
+}
+
+static inline struct stripe_c *alloc_context(unsigned int stripes)
+{
+ size_t len;
+
+ if (dm_array_too_big(sizeof(struct stripe_c), sizeof(struct stripe),
+ stripes))
+ return NULL;
+
+ len = sizeof(struct stripe_c) + (sizeof(struct stripe) * stripes);
+
+ return kmalloc(len, GFP_KERNEL);
+}
+
+/*
+ * Parse a single <dev> <sector> pair
+ */
+static int get_stripe(struct dm_target *ti, struct stripe_c *sc,
+ unsigned int stripe, char **argv)
+{
+ unsigned long long start;
+ char dummy;
+
+ if (sscanf(argv[1], "%llu%c", &start, &dummy) != 1)
+ return -EINVAL;
+
+ if (dm_get_device(ti, argv[0], dm_table_get_mode(ti->table),
+ &sc->stripe[stripe].dev))
+ return -ENXIO;
+
+ sc->stripe[stripe].physical_start = start;
+
+ return 0;
+}
+
+/*
+ * Construct a striped mapping.
+ * <number of stripes> <chunk size> [<dev_path> <offset>]+
+ */
+static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv)
+{
+ struct stripe_c *sc;
+ sector_t width, tmp_len;
+ uint32_t stripes;
+ uint32_t chunk_size;
+ int r;
+ unsigned int i;
+
+ if (argc < 2) {
+ ti->error = "Not enough arguments";
+ return -EINVAL;
+ }
+
+ if (kstrtouint(argv[0], 10, &stripes) || !stripes) {
+ ti->error = "Invalid stripe count";
+ return -EINVAL;
+ }
+
+ if (kstrtouint(argv[1], 10, &chunk_size) || !chunk_size) {
+ ti->error = "Invalid chunk_size";
+ return -EINVAL;
+ }
+
+ width = ti->len;
+ if (sector_div(width, stripes)) {
+ ti->error = "Target length not divisible by "
+ "number of stripes";
+ return -EINVAL;
+ }
+
+ tmp_len = width;
+ if (sector_div(tmp_len, chunk_size)) {
+ ti->error = "Target length not divisible by "
+ "chunk size";
+ return -EINVAL;
+ }
+
+ /*
+ * Do we have enough arguments for that many stripes ?
+ */
+ if (argc != (2 + 2 * stripes)) {
+ ti->error = "Not enough destinations "
+ "specified";
+ return -EINVAL;
+ }
+
+ sc = alloc_context(stripes);
+ if (!sc) {
+ ti->error = "Memory allocation for striped context "
+ "failed";
+ return -ENOMEM;
+ }
+
+ INIT_WORK(&sc->trigger_event, trigger_event);
+
+ /* Set pointer to dm target; used in trigger_event */
+ sc->ti = ti;
+ sc->stripes = stripes;
+ sc->stripe_width = width;
+
+ if (stripes & (stripes - 1))
+ sc->stripes_shift = -1;
+ else
+ sc->stripes_shift = __ffs(stripes);
+
+ r = dm_set_target_max_io_len(ti, chunk_size);
+ if (r) {
+ kfree(sc);
+ return r;
+ }
+
+ ti->num_flush_bios = stripes;
+ ti->num_discard_bios = stripes;
+ ti->num_write_same_bios = stripes;
+
+ sc->chunk_size = chunk_size;
+ if (chunk_size & (chunk_size - 1))
+ sc->chunk_size_shift = -1;
+ else
+ sc->chunk_size_shift = __ffs(chunk_size);
+
+ /*
+ * Get the stripe destinations.
+ */
+ for (i = 0; i < stripes; i++) {
+ argv += 2;
+
+ r = get_stripe(ti, sc, i, argv);
+ if (r < 0) {
+ ti->error = "Couldn't parse stripe destination";
+ while (i--)
+ dm_put_device(ti, sc->stripe[i].dev);
+ kfree(sc);
+ return r;
+ }
+ atomic_set(&(sc->stripe[i].error_count), 0);
+ }
+
+ ti->private = sc;
+
+ return 0;
+}
+
+static void stripe_dtr(struct dm_target *ti)
+{
+ unsigned int i;
+ struct stripe_c *sc = (struct stripe_c *) ti->private;
+
+ for (i = 0; i < sc->stripes; i++)
+ dm_put_device(ti, sc->stripe[i].dev);
+
+ flush_work(&sc->trigger_event);
+ kfree(sc);
+}
+
+static void stripe_map_sector(struct stripe_c *sc, sector_t sector,
+ uint32_t *stripe, sector_t *result)
+{
+ sector_t chunk = dm_target_offset(sc->ti, sector);
+ sector_t chunk_offset;
+
+ if (sc->chunk_size_shift < 0)
+ chunk_offset = sector_div(chunk, sc->chunk_size);
+ else {
+ chunk_offset = chunk & (sc->chunk_size - 1);
+ chunk >>= sc->chunk_size_shift;
+ }
+
+ if (sc->stripes_shift < 0)
+ *stripe = sector_div(chunk, sc->stripes);
+ else {
+ *stripe = chunk & (sc->stripes - 1);
+ chunk >>= sc->stripes_shift;
+ }
+
+ if (sc->chunk_size_shift < 0)
+ chunk *= sc->chunk_size;
+ else
+ chunk <<= sc->chunk_size_shift;
+
+ *result = chunk + chunk_offset;
+}
+
+static void stripe_map_range_sector(struct stripe_c *sc, sector_t sector,
+ uint32_t target_stripe, sector_t *result)
+{
+ uint32_t stripe;
+
+ stripe_map_sector(sc, sector, &stripe, result);
+ if (stripe == target_stripe)
+ return;
+
+ /* round down */
+ sector = *result;
+ if (sc->chunk_size_shift < 0)
+ *result -= sector_div(sector, sc->chunk_size);
+ else
+ *result = sector & ~(sector_t)(sc->chunk_size - 1);
+
+ if (target_stripe < stripe)
+ *result += sc->chunk_size; /* next chunk */
+}
+
+static int stripe_map_range(struct stripe_c *sc, struct bio *bio,
+ uint32_t target_stripe)
+{
+ sector_t begin, end;
+
+ stripe_map_range_sector(sc, bio->bi_iter.bi_sector,
+ target_stripe, &begin);
+ stripe_map_range_sector(sc, bio_end_sector(bio),
+ target_stripe, &end);
+ if (begin < end) {
+ bio->bi_bdev = sc->stripe[target_stripe].dev->bdev;
+ bio->bi_iter.bi_sector = begin +
+ sc->stripe[target_stripe].physical_start;
+ bio->bi_iter.bi_size = to_bytes(end - begin);
+ return DM_MAPIO_REMAPPED;
+ } else {
+ /* The range doesn't map to the target stripe */
+ bio_endio(bio, 0);
+ return DM_MAPIO_SUBMITTED;
+ }
+}
+
+static int stripe_map(struct dm_target *ti, struct bio *bio)
+{
+ struct stripe_c *sc = ti->private;
+ uint32_t stripe;
+ unsigned target_bio_nr;
+
+ if (bio->bi_rw & REQ_FLUSH) {
+ target_bio_nr = dm_bio_get_target_bio_nr(bio);
+ BUG_ON(target_bio_nr >= sc->stripes);
+ bio->bi_bdev = sc->stripe[target_bio_nr].dev->bdev;
+ return DM_MAPIO_REMAPPED;
+ }
+ if (unlikely(bio->bi_rw & REQ_DISCARD) ||
+ unlikely(bio->bi_rw & REQ_WRITE_SAME)) {
+ target_bio_nr = dm_bio_get_target_bio_nr(bio);
+ BUG_ON(target_bio_nr >= sc->stripes);
+ return stripe_map_range(sc, bio, target_bio_nr);
+ }
+
+ stripe_map_sector(sc, bio->bi_iter.bi_sector,
+ &stripe, &bio->bi_iter.bi_sector);
+
+ bio->bi_iter.bi_sector += sc->stripe[stripe].physical_start;
+ bio->bi_bdev = sc->stripe[stripe].dev->bdev;
+
+ return DM_MAPIO_REMAPPED;
+}
+
+/*
+ * Stripe status:
+ *
+ * INFO
+ * #stripes [stripe_name <stripe_name>] [group word count]
+ * [error count 'A|D' <error count 'A|D'>]
+ *
+ * TABLE
+ * #stripes [stripe chunk size]
+ * [stripe_name physical_start <stripe_name physical_start>]
+ *
+ */
+
+static void stripe_status(struct dm_target *ti, status_type_t type,
+ unsigned status_flags, char *result, unsigned maxlen)
+{
+ struct stripe_c *sc = (struct stripe_c *) ti->private;
+ char buffer[sc->stripes + 1];
+ unsigned int sz = 0;
+ unsigned int i;
+
+ switch (type) {
+ case STATUSTYPE_INFO:
+ DMEMIT("%d ", sc->stripes);
+ for (i = 0; i < sc->stripes; i++) {
+ DMEMIT("%s ", sc->stripe[i].dev->name);
+ buffer[i] = atomic_read(&(sc->stripe[i].error_count)) ?
+ 'D' : 'A';
+ }
+ buffer[i] = '\0';
+ DMEMIT("1 %s", buffer);
+ break;
+
+ case STATUSTYPE_TABLE:
+ DMEMIT("%d %llu", sc->stripes,
+ (unsigned long long)sc->chunk_size);
+ for (i = 0; i < sc->stripes; i++)
+ DMEMIT(" %s %llu", sc->stripe[i].dev->name,
+ (unsigned long long)sc->stripe[i].physical_start);
+ break;
+ }
+}
+
+static int stripe_end_io(struct dm_target *ti, struct bio *bio, int error)
+{
+ unsigned i;
+ char major_minor[16];
+ struct stripe_c *sc = ti->private;
+
+ if (!error)
+ return 0; /* I/O complete */
+
+ if ((error == -EWOULDBLOCK) && (bio->bi_rw & REQ_RAHEAD))
+ return error;
+
+ if (error == -EOPNOTSUPP)
+ return error;
+
+ memset(major_minor, 0, sizeof(major_minor));
+ sprintf(major_minor, "%d:%d",
+ MAJOR(disk_devt(bio->bi_bdev->bd_disk)),
+ MINOR(disk_devt(bio->bi_bdev->bd_disk)));
+
+ /*
+ * Test to see which stripe drive triggered the event
+ * and increment error count for all stripes on that device.
+ * If the error count for a given device exceeds the threshold
+ * value we will no longer trigger any further events.
+ */
+ for (i = 0; i < sc->stripes; i++)
+ if (!strcmp(sc->stripe[i].dev->name, major_minor)) {
+ atomic_inc(&(sc->stripe[i].error_count));
+ if (atomic_read(&(sc->stripe[i].error_count)) <
+ DM_IO_ERROR_THRESHOLD)
+ schedule_work(&sc->trigger_event);
+ }
+
+ return error;
+}
+
+static int stripe_iterate_devices(struct dm_target *ti,
+ iterate_devices_callout_fn fn, void *data)
+{
+ struct stripe_c *sc = ti->private;
+ int ret = 0;
+ unsigned i = 0;
+
+ do {
+ ret = fn(ti, sc->stripe[i].dev,
+ sc->stripe[i].physical_start,
+ sc->stripe_width, data);
+ } while (!ret && ++i < sc->stripes);
+
+ return ret;
+}
+
+static void stripe_io_hints(struct dm_target *ti,
+ struct queue_limits *limits)
+{
+ struct stripe_c *sc = ti->private;
+ unsigned chunk_size = sc->chunk_size << SECTOR_SHIFT;
+
+ blk_limits_io_min(limits, chunk_size);
+ blk_limits_io_opt(limits, chunk_size * sc->stripes);
+}
+
+static int stripe_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
+ struct bio_vec *biovec, int max_size)
+{
+ struct stripe_c *sc = ti->private;
+ sector_t bvm_sector = bvm->bi_sector;
+ uint32_t stripe;
+ struct request_queue *q;
+
+ stripe_map_sector(sc, bvm_sector, &stripe, &bvm_sector);
+
+ q = bdev_get_queue(sc->stripe[stripe].dev->bdev);
+ if (!q->merge_bvec_fn)
+ return max_size;
+
+ bvm->bi_bdev = sc->stripe[stripe].dev->bdev;
+ bvm->bi_sector = sc->stripe[stripe].physical_start + bvm_sector;
+
+ return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
+}
+
+static struct target_type stripe_target = {
+ .name = "striped",
+ .version = {1, 5, 1},
+ .module = THIS_MODULE,
+ .ctr = stripe_ctr,
+ .dtr = stripe_dtr,
+ .map = stripe_map,
+ .end_io = stripe_end_io,
+ .status = stripe_status,
+ .iterate_devices = stripe_iterate_devices,
+ .io_hints = stripe_io_hints,
+ .merge = stripe_merge,
+};
+
+int __init dm_stripe_init(void)
+{
+ int r;
+
+ r = dm_register_target(&stripe_target);
+ if (r < 0) {
+ DMWARN("target registration failed");
+ return r;
+ }
+
+ return r;
+}
+
+void dm_stripe_exit(void)
+{
+ dm_unregister_target(&stripe_target);
+}
diff --git a/drivers/md/dm-switch.c b/drivers/md/dm-switch.c
new file mode 100644
index 000000000..50fca469c
--- /dev/null
+++ b/drivers/md/dm-switch.c
@@ -0,0 +1,591 @@
+/*
+ * Copyright (C) 2010-2012 by Dell Inc. All rights reserved.
+ * Copyright (C) 2011-2013 Red Hat, Inc.
+ *
+ * This file is released under the GPL.
+ *
+ * dm-switch is a device-mapper target that maps IO to underlying block
+ * devices efficiently when there are a large number of fixed-sized
+ * address regions but there is no simple pattern to allow for a compact
+ * mapping representation such as dm-stripe.
+ */
+
+#include <linux/device-mapper.h>
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/vmalloc.h>
+
+#define DM_MSG_PREFIX "switch"
+
+/*
+ * One region_table_slot_t holds <region_entries_per_slot> region table
+ * entries each of which is <region_table_entry_bits> in size.
+ */
+typedef unsigned long region_table_slot_t;
+
+/*
+ * A device with the offset to its start sector.
+ */
+struct switch_path {
+ struct dm_dev *dmdev;
+ sector_t start;
+};
+
+/*
+ * Context block for a dm switch device.
+ */
+struct switch_ctx {
+ struct dm_target *ti;
+
+ unsigned nr_paths; /* Number of paths in path_list. */
+
+ unsigned region_size; /* Region size in 512-byte sectors */
+ unsigned long nr_regions; /* Number of regions making up the device */
+ signed char region_size_bits; /* log2 of region_size or -1 */
+
+ unsigned char region_table_entry_bits; /* Number of bits in one region table entry */
+ unsigned char region_entries_per_slot; /* Number of entries in one region table slot */
+ signed char region_entries_per_slot_bits; /* log2 of region_entries_per_slot or -1 */
+
+ region_table_slot_t *region_table; /* Region table */
+
+ /*
+ * Array of dm devices to switch between.
+ */
+ struct switch_path path_list[0];
+};
+
+static struct switch_ctx *alloc_switch_ctx(struct dm_target *ti, unsigned nr_paths,
+ unsigned region_size)
+{
+ struct switch_ctx *sctx;
+
+ sctx = kzalloc(sizeof(struct switch_ctx) + nr_paths * sizeof(struct switch_path),
+ GFP_KERNEL);
+ if (!sctx)
+ return NULL;
+
+ sctx->ti = ti;
+ sctx->region_size = region_size;
+
+ ti->private = sctx;
+
+ return sctx;
+}
+
+static int alloc_region_table(struct dm_target *ti, unsigned nr_paths)
+{
+ struct switch_ctx *sctx = ti->private;
+ sector_t nr_regions = ti->len;
+ sector_t nr_slots;
+
+ if (!(sctx->region_size & (sctx->region_size - 1)))
+ sctx->region_size_bits = __ffs(sctx->region_size);
+ else
+ sctx->region_size_bits = -1;
+
+ sctx->region_table_entry_bits = 1;
+ while (sctx->region_table_entry_bits < sizeof(region_table_slot_t) * 8 &&
+ (region_table_slot_t)1 << sctx->region_table_entry_bits < nr_paths)
+ sctx->region_table_entry_bits++;
+
+ sctx->region_entries_per_slot = (sizeof(region_table_slot_t) * 8) / sctx->region_table_entry_bits;
+ if (!(sctx->region_entries_per_slot & (sctx->region_entries_per_slot - 1)))
+ sctx->region_entries_per_slot_bits = __ffs(sctx->region_entries_per_slot);
+ else
+ sctx->region_entries_per_slot_bits = -1;
+
+ if (sector_div(nr_regions, sctx->region_size))
+ nr_regions++;
+
+ sctx->nr_regions = nr_regions;
+ if (sctx->nr_regions != nr_regions || sctx->nr_regions >= ULONG_MAX) {
+ ti->error = "Region table too large";
+ return -EINVAL;
+ }
+
+ nr_slots = nr_regions;
+ if (sector_div(nr_slots, sctx->region_entries_per_slot))
+ nr_slots++;
+
+ if (nr_slots > ULONG_MAX / sizeof(region_table_slot_t)) {
+ ti->error = "Region table too large";
+ return -EINVAL;
+ }
+
+ sctx->region_table = vmalloc(nr_slots * sizeof(region_table_slot_t));
+ if (!sctx->region_table) {
+ ti->error = "Cannot allocate region table";
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static void switch_get_position(struct switch_ctx *sctx, unsigned long region_nr,
+ unsigned long *region_index, unsigned *bit)
+{
+ if (sctx->region_entries_per_slot_bits >= 0) {
+ *region_index = region_nr >> sctx->region_entries_per_slot_bits;
+ *bit = region_nr & (sctx->region_entries_per_slot - 1);
+ } else {
+ *region_index = region_nr / sctx->region_entries_per_slot;
+ *bit = region_nr % sctx->region_entries_per_slot;
+ }
+
+ *bit *= sctx->region_table_entry_bits;
+}
+
+static unsigned switch_region_table_read(struct switch_ctx *sctx, unsigned long region_nr)
+{
+ unsigned long region_index;
+ unsigned bit;
+
+ switch_get_position(sctx, region_nr, &region_index, &bit);
+
+ return (ACCESS_ONCE(sctx->region_table[region_index]) >> bit) &
+ ((1 << sctx->region_table_entry_bits) - 1);
+}
+
+/*
+ * Find which path to use at given offset.
+ */
+static unsigned switch_get_path_nr(struct switch_ctx *sctx, sector_t offset)
+{
+ unsigned path_nr;
+ sector_t p;
+
+ p = offset;
+ if (sctx->region_size_bits >= 0)
+ p >>= sctx->region_size_bits;
+ else
+ sector_div(p, sctx->region_size);
+
+ path_nr = switch_region_table_read(sctx, p);
+
+ /* This can only happen if the processor uses non-atomic stores. */
+ if (unlikely(path_nr >= sctx->nr_paths))
+ path_nr = 0;
+
+ return path_nr;
+}
+
+static void switch_region_table_write(struct switch_ctx *sctx, unsigned long region_nr,
+ unsigned value)
+{
+ unsigned long region_index;
+ unsigned bit;
+ region_table_slot_t pte;
+
+ switch_get_position(sctx, region_nr, &region_index, &bit);
+
+ pte = sctx->region_table[region_index];
+ pte &= ~((((region_table_slot_t)1 << sctx->region_table_entry_bits) - 1) << bit);
+ pte |= (region_table_slot_t)value << bit;
+ sctx->region_table[region_index] = pte;
+}
+
+/*
+ * Fill the region table with an initial round robin pattern.
+ */
+static void initialise_region_table(struct switch_ctx *sctx)
+{
+ unsigned path_nr = 0;
+ unsigned long region_nr;
+
+ for (region_nr = 0; region_nr < sctx->nr_regions; region_nr++) {
+ switch_region_table_write(sctx, region_nr, path_nr);
+ if (++path_nr >= sctx->nr_paths)
+ path_nr = 0;
+ }
+}
+
+static int parse_path(struct dm_arg_set *as, struct dm_target *ti)
+{
+ struct switch_ctx *sctx = ti->private;
+ unsigned long long start;
+ int r;
+
+ r = dm_get_device(ti, dm_shift_arg(as), dm_table_get_mode(ti->table),
+ &sctx->path_list[sctx->nr_paths].dmdev);
+ if (r) {
+ ti->error = "Device lookup failed";
+ return r;
+ }
+
+ if (kstrtoull(dm_shift_arg(as), 10, &start) || start != (sector_t)start) {
+ ti->error = "Invalid device starting offset";
+ dm_put_device(ti, sctx->path_list[sctx->nr_paths].dmdev);
+ return -EINVAL;
+ }
+
+ sctx->path_list[sctx->nr_paths].start = start;
+
+ sctx->nr_paths++;
+
+ return 0;
+}
+
+/*
+ * Destructor: Don't free the dm_target, just the ti->private data (if any).
+ */
+static void switch_dtr(struct dm_target *ti)
+{
+ struct switch_ctx *sctx = ti->private;
+
+ while (sctx->nr_paths--)
+ dm_put_device(ti, sctx->path_list[sctx->nr_paths].dmdev);
+
+ vfree(sctx->region_table);
+ kfree(sctx);
+}
+
+/*
+ * Constructor arguments:
+ * <num_paths> <region_size> <num_optional_args> [<optional_args>...]
+ * [<dev_path> <offset>]+
+ *
+ * Optional args are to allow for future extension: currently this
+ * parameter must be 0.
+ */
+static int switch_ctr(struct dm_target *ti, unsigned argc, char **argv)
+{
+ static struct dm_arg _args[] = {
+ {1, (KMALLOC_MAX_SIZE - sizeof(struct switch_ctx)) / sizeof(struct switch_path), "Invalid number of paths"},
+ {1, UINT_MAX, "Invalid region size"},
+ {0, 0, "Invalid number of optional args"},
+ };
+
+ struct switch_ctx *sctx;
+ struct dm_arg_set as;
+ unsigned nr_paths, region_size, nr_optional_args;
+ int r;
+
+ as.argc = argc;
+ as.argv = argv;
+
+ r = dm_read_arg(_args, &as, &nr_paths, &ti->error);
+ if (r)
+ return -EINVAL;
+
+ r = dm_read_arg(_args + 1, &as, &region_size, &ti->error);
+ if (r)
+ return r;
+
+ r = dm_read_arg_group(_args + 2, &as, &nr_optional_args, &ti->error);
+ if (r)
+ return r;
+ /* parse optional arguments here, if we add any */
+
+ if (as.argc != nr_paths * 2) {
+ ti->error = "Incorrect number of path arguments";
+ return -EINVAL;
+ }
+
+ sctx = alloc_switch_ctx(ti, nr_paths, region_size);
+ if (!sctx) {
+ ti->error = "Cannot allocate redirection context";
+ return -ENOMEM;
+ }
+
+ r = dm_set_target_max_io_len(ti, region_size);
+ if (r)
+ goto error;
+
+ while (as.argc) {
+ r = parse_path(&as, ti);
+ if (r)
+ goto error;
+ }
+
+ r = alloc_region_table(ti, nr_paths);
+ if (r)
+ goto error;
+
+ initialise_region_table(sctx);
+
+ /* For UNMAP, sending the request down any path is sufficient */
+ ti->num_discard_bios = 1;
+
+ return 0;
+
+error:
+ switch_dtr(ti);
+
+ return r;
+}
+
+static int switch_map(struct dm_target *ti, struct bio *bio)
+{
+ struct switch_ctx *sctx = ti->private;
+ sector_t offset = dm_target_offset(ti, bio->bi_iter.bi_sector);
+ unsigned path_nr = switch_get_path_nr(sctx, offset);
+
+ bio->bi_bdev = sctx->path_list[path_nr].dmdev->bdev;
+ bio->bi_iter.bi_sector = sctx->path_list[path_nr].start + offset;
+
+ return DM_MAPIO_REMAPPED;
+}
+
+/*
+ * We need to parse hex numbers in the message as quickly as possible.
+ *
+ * This table-based hex parser improves performance.
+ * It improves a time to load 1000000 entries compared to the condition-based
+ * parser.
+ * table-based parser condition-based parser
+ * PA-RISC 0.29s 0.31s
+ * Opteron 0.0495s 0.0498s
+ */
+static const unsigned char hex_table[256] = {
+255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 255, 255, 255, 255, 255, 255,
+255, 10, 11, 12, 13, 14, 15, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+255, 10, 11, 12, 13, 14, 15, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255
+};
+
+static __always_inline unsigned long parse_hex(const char **string)
+{
+ unsigned char d;
+ unsigned long r = 0;
+
+ while ((d = hex_table[(unsigned char)**string]) < 16) {
+ r = (r << 4) | d;
+ (*string)++;
+ }
+
+ return r;
+}
+
+static int process_set_region_mappings(struct switch_ctx *sctx,
+ unsigned argc, char **argv)
+{
+ unsigned i;
+ unsigned long region_index = 0;
+
+ for (i = 1; i < argc; i++) {
+ unsigned long path_nr;
+ const char *string = argv[i];
+
+ if ((*string & 0xdf) == 'R') {
+ unsigned long cycle_length, num_write;
+
+ string++;
+ if (unlikely(*string == ',')) {
+ DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
+ return -EINVAL;
+ }
+ cycle_length = parse_hex(&string);
+ if (unlikely(*string != ',')) {
+ DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
+ return -EINVAL;
+ }
+ string++;
+ if (unlikely(!*string)) {
+ DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
+ return -EINVAL;
+ }
+ num_write = parse_hex(&string);
+ if (unlikely(*string)) {
+ DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
+ return -EINVAL;
+ }
+
+ if (unlikely(!cycle_length) || unlikely(cycle_length - 1 > region_index)) {
+ DMWARN("invalid set_region_mappings cycle length: %lu > %lu",
+ cycle_length - 1, region_index);
+ return -EINVAL;
+ }
+ if (unlikely(region_index + num_write < region_index) ||
+ unlikely(region_index + num_write >= sctx->nr_regions)) {
+ DMWARN("invalid set_region_mappings region number: %lu + %lu >= %lu",
+ region_index, num_write, sctx->nr_regions);
+ return -EINVAL;
+ }
+
+ while (num_write--) {
+ region_index++;
+ path_nr = switch_region_table_read(sctx, region_index - cycle_length);
+ switch_region_table_write(sctx, region_index, path_nr);
+ }
+
+ continue;
+ }
+
+ if (*string == ':')
+ region_index++;
+ else {
+ region_index = parse_hex(&string);
+ if (unlikely(*string != ':')) {
+ DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
+ return -EINVAL;
+ }
+ }
+
+ string++;
+ if (unlikely(!*string)) {
+ DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
+ return -EINVAL;
+ }
+
+ path_nr = parse_hex(&string);
+ if (unlikely(*string)) {
+ DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
+ return -EINVAL;
+ }
+ if (unlikely(region_index >= sctx->nr_regions)) {
+ DMWARN("invalid set_region_mappings region number: %lu >= %lu", region_index, sctx->nr_regions);
+ return -EINVAL;
+ }
+ if (unlikely(path_nr >= sctx->nr_paths)) {
+ DMWARN("invalid set_region_mappings device: %lu >= %u", path_nr, sctx->nr_paths);
+ return -EINVAL;
+ }
+
+ switch_region_table_write(sctx, region_index, path_nr);
+ }
+
+ return 0;
+}
+
+/*
+ * Messages are processed one-at-a-time.
+ *
+ * Only set_region_mappings is supported.
+ */
+static int switch_message(struct dm_target *ti, unsigned argc, char **argv)
+{
+ static DEFINE_MUTEX(message_mutex);
+
+ struct switch_ctx *sctx = ti->private;
+ int r = -EINVAL;
+
+ mutex_lock(&message_mutex);
+
+ if (!strcasecmp(argv[0], "set_region_mappings"))
+ r = process_set_region_mappings(sctx, argc, argv);
+ else
+ DMWARN("Unrecognised message received.");
+
+ mutex_unlock(&message_mutex);
+
+ return r;
+}
+
+static void switch_status(struct dm_target *ti, status_type_t type,
+ unsigned status_flags, char *result, unsigned maxlen)
+{
+ struct switch_ctx *sctx = ti->private;
+ unsigned sz = 0;
+ int path_nr;
+
+ switch (type) {
+ case STATUSTYPE_INFO:
+ result[0] = '\0';
+ break;
+
+ case STATUSTYPE_TABLE:
+ DMEMIT("%u %u 0", sctx->nr_paths, sctx->region_size);
+ for (path_nr = 0; path_nr < sctx->nr_paths; path_nr++)
+ DMEMIT(" %s %llu", sctx->path_list[path_nr].dmdev->name,
+ (unsigned long long)sctx->path_list[path_nr].start);
+ break;
+ }
+}
+
+/*
+ * Switch ioctl:
+ *
+ * Passthrough all ioctls to the path for sector 0
+ */
+static int switch_ioctl(struct dm_target *ti, unsigned cmd,
+ unsigned long arg)
+{
+ struct switch_ctx *sctx = ti->private;
+ struct block_device *bdev;
+ fmode_t mode;
+ unsigned path_nr;
+ int r = 0;
+
+ path_nr = switch_get_path_nr(sctx, 0);
+
+ bdev = sctx->path_list[path_nr].dmdev->bdev;
+ mode = sctx->path_list[path_nr].dmdev->mode;
+
+ /*
+ * Only pass ioctls through if the device sizes match exactly.
+ */
+ if (ti->len + sctx->path_list[path_nr].start != i_size_read(bdev->bd_inode) >> SECTOR_SHIFT)
+ r = scsi_verify_blk_ioctl(NULL, cmd);
+
+ return r ? : __blkdev_driver_ioctl(bdev, mode, cmd, arg);
+}
+
+static int switch_iterate_devices(struct dm_target *ti,
+ iterate_devices_callout_fn fn, void *data)
+{
+ struct switch_ctx *sctx = ti->private;
+ int path_nr;
+ int r;
+
+ for (path_nr = 0; path_nr < sctx->nr_paths; path_nr++) {
+ r = fn(ti, sctx->path_list[path_nr].dmdev,
+ sctx->path_list[path_nr].start, ti->len, data);
+ if (r)
+ return r;
+ }
+
+ return 0;
+}
+
+static struct target_type switch_target = {
+ .name = "switch",
+ .version = {1, 1, 0},
+ .module = THIS_MODULE,
+ .ctr = switch_ctr,
+ .dtr = switch_dtr,
+ .map = switch_map,
+ .message = switch_message,
+ .status = switch_status,
+ .ioctl = switch_ioctl,
+ .iterate_devices = switch_iterate_devices,
+};
+
+static int __init dm_switch_init(void)
+{
+ int r;
+
+ r = dm_register_target(&switch_target);
+ if (r < 0)
+ DMERR("dm_register_target() failed %d", r);
+
+ return r;
+}
+
+static void __exit dm_switch_exit(void)
+{
+ dm_unregister_target(&switch_target);
+}
+
+module_init(dm_switch_init);
+module_exit(dm_switch_exit);
+
+MODULE_DESCRIPTION(DM_NAME " dynamic path switching target");
+MODULE_AUTHOR("Kevin D. O'Kelley <Kevin_OKelley@dell.com>");
+MODULE_AUTHOR("Narendran Ganapathy <Narendran_Ganapathy@dell.com>");
+MODULE_AUTHOR("Jim Ramsay <Jim_Ramsay@dell.com>");
+MODULE_AUTHOR("Mikulas Patocka <mpatocka@redhat.com>");
+MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm-sysfs.c b/drivers/md/dm-sysfs.c
new file mode 100644
index 000000000..7e818f5f1
--- /dev/null
+++ b/drivers/md/dm-sysfs.c
@@ -0,0 +1,144 @@
+/*
+ * Copyright (C) 2008 Red Hat, Inc. All rights reserved.
+ *
+ * This file is released under the GPL.
+ */
+
+#include <linux/sysfs.h>
+#include <linux/dm-ioctl.h>
+#include "dm.h"
+
+struct dm_sysfs_attr {
+ struct attribute attr;
+ ssize_t (*show)(struct mapped_device *, char *);
+ ssize_t (*store)(struct mapped_device *, const char *, size_t count);
+};
+
+#define DM_ATTR_RO(_name) \
+struct dm_sysfs_attr dm_attr_##_name = \
+ __ATTR(_name, S_IRUGO, dm_attr_##_name##_show, NULL)
+
+static ssize_t dm_attr_show(struct kobject *kobj, struct attribute *attr,
+ char *page)
+{
+ struct dm_sysfs_attr *dm_attr;
+ struct mapped_device *md;
+ ssize_t ret;
+
+ dm_attr = container_of(attr, struct dm_sysfs_attr, attr);
+ if (!dm_attr->show)
+ return -EIO;
+
+ md = dm_get_from_kobject(kobj);
+ if (!md)
+ return -EINVAL;
+
+ ret = dm_attr->show(md, page);
+ dm_put(md);
+
+ return ret;
+}
+
+#define DM_ATTR_RW(_name) \
+struct dm_sysfs_attr dm_attr_##_name = \
+ __ATTR(_name, S_IRUGO | S_IWUSR, dm_attr_##_name##_show, dm_attr_##_name##_store)
+
+static ssize_t dm_attr_store(struct kobject *kobj, struct attribute *attr,
+ const char *page, size_t count)
+{
+ struct dm_sysfs_attr *dm_attr;
+ struct mapped_device *md;
+ ssize_t ret;
+
+ dm_attr = container_of(attr, struct dm_sysfs_attr, attr);
+ if (!dm_attr->store)
+ return -EIO;
+
+ md = dm_get_from_kobject(kobj);
+ if (!md)
+ return -EINVAL;
+
+ ret = dm_attr->store(md, page, count);
+ dm_put(md);
+
+ return ret;
+}
+
+static ssize_t dm_attr_name_show(struct mapped_device *md, char *buf)
+{
+ if (dm_copy_name_and_uuid(md, buf, NULL))
+ return -EIO;
+
+ strcat(buf, "\n");
+ return strlen(buf);
+}
+
+static ssize_t dm_attr_uuid_show(struct mapped_device *md, char *buf)
+{
+ if (dm_copy_name_and_uuid(md, NULL, buf))
+ return -EIO;
+
+ strcat(buf, "\n");
+ return strlen(buf);
+}
+
+static ssize_t dm_attr_suspended_show(struct mapped_device *md, char *buf)
+{
+ sprintf(buf, "%d\n", dm_suspended_md(md));
+
+ return strlen(buf);
+}
+
+static ssize_t dm_attr_use_blk_mq_show(struct mapped_device *md, char *buf)
+{
+ sprintf(buf, "%d\n", dm_use_blk_mq(md));
+
+ return strlen(buf);
+}
+
+static DM_ATTR_RO(name);
+static DM_ATTR_RO(uuid);
+static DM_ATTR_RO(suspended);
+static DM_ATTR_RO(use_blk_mq);
+static DM_ATTR_RW(rq_based_seq_io_merge_deadline);
+
+static struct attribute *dm_attrs[] = {
+ &dm_attr_name.attr,
+ &dm_attr_uuid.attr,
+ &dm_attr_suspended.attr,
+ &dm_attr_use_blk_mq.attr,
+ &dm_attr_rq_based_seq_io_merge_deadline.attr,
+ NULL,
+};
+
+static const struct sysfs_ops dm_sysfs_ops = {
+ .show = dm_attr_show,
+ .store = dm_attr_store,
+};
+
+static struct kobj_type dm_ktype = {
+ .sysfs_ops = &dm_sysfs_ops,
+ .default_attrs = dm_attrs,
+ .release = dm_kobject_release,
+};
+
+/*
+ * Initialize kobj
+ * because nobody using md yet, no need to call explicit dm_get/put
+ */
+int dm_sysfs_init(struct mapped_device *md)
+{
+ return kobject_init_and_add(dm_kobject(md), &dm_ktype,
+ &disk_to_dev(dm_disk(md))->kobj,
+ "%s", "dm");
+}
+
+/*
+ * Remove kobj, called after all references removed
+ */
+void dm_sysfs_exit(struct mapped_device *md)
+{
+ struct kobject *kobj = dm_kobject(md);
+ kobject_put(kobj);
+ wait_for_completion(dm_get_completion_from_kobject(kobj));
+}
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
new file mode 100644
index 000000000..16ba55ad7
--- /dev/null
+++ b/drivers/md/dm-table.c
@@ -0,0 +1,1707 @@
+/*
+ * Copyright (C) 2001 Sistina Software (UK) Limited.
+ * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm.h"
+
+#include <linux/module.h>
+#include <linux/vmalloc.h>
+#include <linux/blkdev.h>
+#include <linux/namei.h>
+#include <linux/ctype.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/interrupt.h>
+#include <linux/mutex.h>
+#include <linux/delay.h>
+#include <linux/atomic.h>
+#include <linux/blk-mq.h>
+#include <linux/mount.h>
+
+#define DM_MSG_PREFIX "table"
+
+#define MAX_DEPTH 16
+#define NODE_SIZE L1_CACHE_BYTES
+#define KEYS_PER_NODE (NODE_SIZE / sizeof(sector_t))
+#define CHILDREN_PER_NODE (KEYS_PER_NODE + 1)
+
+struct dm_table {
+ struct mapped_device *md;
+ unsigned type;
+
+ /* btree table */
+ unsigned int depth;
+ unsigned int counts[MAX_DEPTH]; /* in nodes */
+ sector_t *index[MAX_DEPTH];
+
+ unsigned int num_targets;
+ unsigned int num_allocated;
+ sector_t *highs;
+ struct dm_target *targets;
+
+ struct target_type *immutable_target_type;
+ unsigned integrity_supported:1;
+ unsigned singleton:1;
+
+ /*
+ * Indicates the rw permissions for the new logical
+ * device. This should be a combination of FMODE_READ
+ * and FMODE_WRITE.
+ */
+ fmode_t mode;
+
+ /* a list of devices used by this table */
+ struct list_head devices;
+
+ /* events get handed up using this callback */
+ void (*event_fn)(void *);
+ void *event_context;
+
+ struct dm_md_mempools *mempools;
+
+ struct list_head target_callbacks;
+};
+
+/*
+ * Similar to ceiling(log_size(n))
+ */
+static unsigned int int_log(unsigned int n, unsigned int base)
+{
+ int result = 0;
+
+ while (n > 1) {
+ n = dm_div_up(n, base);
+ result++;
+ }
+
+ return result;
+}
+
+/*
+ * Calculate the index of the child node of the n'th node k'th key.
+ */
+static inline unsigned int get_child(unsigned int n, unsigned int k)
+{
+ return (n * CHILDREN_PER_NODE) + k;
+}
+
+/*
+ * Return the n'th node of level l from table t.
+ */
+static inline sector_t *get_node(struct dm_table *t,
+ unsigned int l, unsigned int n)
+{
+ return t->index[l] + (n * KEYS_PER_NODE);
+}
+
+/*
+ * Return the highest key that you could lookup from the n'th
+ * node on level l of the btree.
+ */
+static sector_t high(struct dm_table *t, unsigned int l, unsigned int n)
+{
+ for (; l < t->depth - 1; l++)
+ n = get_child(n, CHILDREN_PER_NODE - 1);
+
+ if (n >= t->counts[l])
+ return (sector_t) - 1;
+
+ return get_node(t, l, n)[KEYS_PER_NODE - 1];
+}
+
+/*
+ * Fills in a level of the btree based on the highs of the level
+ * below it.
+ */
+static int setup_btree_index(unsigned int l, struct dm_table *t)
+{
+ unsigned int n, k;
+ sector_t *node;
+
+ for (n = 0U; n < t->counts[l]; n++) {
+ node = get_node(t, l, n);
+
+ for (k = 0U; k < KEYS_PER_NODE; k++)
+ node[k] = high(t, l + 1, get_child(n, k));
+ }
+
+ return 0;
+}
+
+void *dm_vcalloc(unsigned long nmemb, unsigned long elem_size)
+{
+ unsigned long size;
+ void *addr;
+
+ /*
+ * Check that we're not going to overflow.
+ */
+ if (nmemb > (ULONG_MAX / elem_size))
+ return NULL;
+
+ size = nmemb * elem_size;
+ addr = vzalloc(size);
+
+ return addr;
+}
+EXPORT_SYMBOL(dm_vcalloc);
+
+/*
+ * highs, and targets are managed as dynamic arrays during a
+ * table load.
+ */
+static int alloc_targets(struct dm_table *t, unsigned int num)
+{
+ sector_t *n_highs;
+ struct dm_target *n_targets;
+
+ /*
+ * Allocate both the target array and offset array at once.
+ * Append an empty entry to catch sectors beyond the end of
+ * the device.
+ */
+ n_highs = (sector_t *) dm_vcalloc(num + 1, sizeof(struct dm_target) +
+ sizeof(sector_t));
+ if (!n_highs)
+ return -ENOMEM;
+
+ n_targets = (struct dm_target *) (n_highs + num);
+
+ memset(n_highs, -1, sizeof(*n_highs) * num);
+ vfree(t->highs);
+
+ t->num_allocated = num;
+ t->highs = n_highs;
+ t->targets = n_targets;
+
+ return 0;
+}
+
+int dm_table_create(struct dm_table **result, fmode_t mode,
+ unsigned num_targets, struct mapped_device *md)
+{
+ struct dm_table *t = kzalloc(sizeof(*t), GFP_KERNEL);
+
+ if (!t)
+ return -ENOMEM;
+
+ INIT_LIST_HEAD(&t->devices);
+ INIT_LIST_HEAD(&t->target_callbacks);
+
+ if (!num_targets)
+ num_targets = KEYS_PER_NODE;
+
+ num_targets = dm_round_up(num_targets, KEYS_PER_NODE);
+
+ if (!num_targets) {
+ kfree(t);
+ return -ENOMEM;
+ }
+
+ if (alloc_targets(t, num_targets)) {
+ kfree(t);
+ return -ENOMEM;
+ }
+
+ t->mode = mode;
+ t->md = md;
+ *result = t;
+ return 0;
+}
+
+static void free_devices(struct list_head *devices, struct mapped_device *md)
+{
+ struct list_head *tmp, *next;
+
+ list_for_each_safe(tmp, next, devices) {
+ struct dm_dev_internal *dd =
+ list_entry(tmp, struct dm_dev_internal, list);
+ DMWARN("%s: dm_table_destroy: dm_put_device call missing for %s",
+ dm_device_name(md), dd->dm_dev->name);
+ dm_put_table_device(md, dd->dm_dev);
+ kfree(dd);
+ }
+}
+
+void dm_table_destroy(struct dm_table *t)
+{
+ unsigned int i;
+
+ if (!t)
+ return;
+
+ /* free the indexes */
+ if (t->depth >= 2)
+ vfree(t->index[t->depth - 2]);
+
+ /* free the targets */
+ for (i = 0; i < t->num_targets; i++) {
+ struct dm_target *tgt = t->targets + i;
+
+ if (tgt->type->dtr)
+ tgt->type->dtr(tgt);
+
+ dm_put_target_type(tgt->type);
+ }
+
+ vfree(t->highs);
+
+ /* free the device list */
+ free_devices(&t->devices, t->md);
+
+ dm_free_md_mempools(t->mempools);
+
+ kfree(t);
+}
+
+/*
+ * See if we've already got a device in the list.
+ */
+static struct dm_dev_internal *find_device(struct list_head *l, dev_t dev)
+{
+ struct dm_dev_internal *dd;
+
+ list_for_each_entry (dd, l, list)
+ if (dd->dm_dev->bdev->bd_dev == dev)
+ return dd;
+
+ return NULL;
+}
+
+/*
+ * If possible, this checks an area of a destination device is invalid.
+ */
+static int device_area_is_invalid(struct dm_target *ti, struct dm_dev *dev,
+ sector_t start, sector_t len, void *data)
+{
+ struct request_queue *q;
+ struct queue_limits *limits = data;
+ struct block_device *bdev = dev->bdev;
+ sector_t dev_size =
+ i_size_read(bdev->bd_inode) >> SECTOR_SHIFT;
+ unsigned short logical_block_size_sectors =
+ limits->logical_block_size >> SECTOR_SHIFT;
+ char b[BDEVNAME_SIZE];
+
+ /*
+ * Some devices exist without request functions,
+ * such as loop devices not yet bound to backing files.
+ * Forbid the use of such devices.
+ */
+ q = bdev_get_queue(bdev);
+ if (!q || !q->make_request_fn) {
+ DMWARN("%s: %s is not yet initialised: "
+ "start=%llu, len=%llu, dev_size=%llu",
+ dm_device_name(ti->table->md), bdevname(bdev, b),
+ (unsigned long long)start,
+ (unsigned long long)len,
+ (unsigned long long)dev_size);
+ return 1;
+ }
+
+ if (!dev_size)
+ return 0;
+
+ if ((start >= dev_size) || (start + len > dev_size)) {
+ DMWARN("%s: %s too small for target: "
+ "start=%llu, len=%llu, dev_size=%llu",
+ dm_device_name(ti->table->md), bdevname(bdev, b),
+ (unsigned long long)start,
+ (unsigned long long)len,
+ (unsigned long long)dev_size);
+ return 1;
+ }
+
+ if (logical_block_size_sectors <= 1)
+ return 0;
+
+ if (start & (logical_block_size_sectors - 1)) {
+ DMWARN("%s: start=%llu not aligned to h/w "
+ "logical block size %u of %s",
+ dm_device_name(ti->table->md),
+ (unsigned long long)start,
+ limits->logical_block_size, bdevname(bdev, b));
+ return 1;
+ }
+
+ if (len & (logical_block_size_sectors - 1)) {
+ DMWARN("%s: len=%llu not aligned to h/w "
+ "logical block size %u of %s",
+ dm_device_name(ti->table->md),
+ (unsigned long long)len,
+ limits->logical_block_size, bdevname(bdev, b));
+ return 1;
+ }
+
+ return 0;
+}
+
+/*
+ * This upgrades the mode on an already open dm_dev, being
+ * careful to leave things as they were if we fail to reopen the
+ * device and not to touch the existing bdev field in case
+ * it is accessed concurrently inside dm_table_any_congested().
+ */
+static int upgrade_mode(struct dm_dev_internal *dd, fmode_t new_mode,
+ struct mapped_device *md)
+{
+ int r;
+ struct dm_dev *old_dev, *new_dev;
+
+ old_dev = dd->dm_dev;
+
+ r = dm_get_table_device(md, dd->dm_dev->bdev->bd_dev,
+ dd->dm_dev->mode | new_mode, &new_dev);
+ if (r)
+ return r;
+
+ dd->dm_dev = new_dev;
+ dm_put_table_device(md, old_dev);
+
+ return 0;
+}
+
+/*
+ * Add a device to the list, or just increment the usage count if
+ * it's already present.
+ */
+int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode,
+ struct dm_dev **result)
+{
+ int r;
+ dev_t uninitialized_var(dev);
+ struct dm_dev_internal *dd;
+ struct dm_table *t = ti->table;
+ struct block_device *bdev;
+
+ BUG_ON(!t);
+
+ /* convert the path to a device */
+ bdev = lookup_bdev(path);
+ if (IS_ERR(bdev)) {
+ dev = name_to_dev_t(path);
+ if (!dev)
+ return -ENODEV;
+ } else {
+ dev = bdev->bd_dev;
+ bdput(bdev);
+ }
+
+ dd = find_device(&t->devices, dev);
+ if (!dd) {
+ dd = kmalloc(sizeof(*dd), GFP_KERNEL);
+ if (!dd)
+ return -ENOMEM;
+
+ if ((r = dm_get_table_device(t->md, dev, mode, &dd->dm_dev))) {
+ kfree(dd);
+ return r;
+ }
+
+ atomic_set(&dd->count, 0);
+ list_add(&dd->list, &t->devices);
+
+ } else if (dd->dm_dev->mode != (mode | dd->dm_dev->mode)) {
+ r = upgrade_mode(dd, mode, t->md);
+ if (r)
+ return r;
+ }
+ atomic_inc(&dd->count);
+
+ *result = dd->dm_dev;
+ return 0;
+}
+EXPORT_SYMBOL(dm_get_device);
+
+static int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev,
+ sector_t start, sector_t len, void *data)
+{
+ struct queue_limits *limits = data;
+ struct block_device *bdev = dev->bdev;
+ struct request_queue *q = bdev_get_queue(bdev);
+ char b[BDEVNAME_SIZE];
+
+ if (unlikely(!q)) {
+ DMWARN("%s: Cannot set limits for nonexistent device %s",
+ dm_device_name(ti->table->md), bdevname(bdev, b));
+ return 0;
+ }
+
+ if (bdev_stack_limits(limits, bdev, start) < 0)
+ DMWARN("%s: adding target device %s caused an alignment inconsistency: "
+ "physical_block_size=%u, logical_block_size=%u, "
+ "alignment_offset=%u, start=%llu",
+ dm_device_name(ti->table->md), bdevname(bdev, b),
+ q->limits.physical_block_size,
+ q->limits.logical_block_size,
+ q->limits.alignment_offset,
+ (unsigned long long) start << SECTOR_SHIFT);
+
+ /*
+ * Check if merge fn is supported.
+ * If not we'll force DM to use PAGE_SIZE or
+ * smaller I/O, just to be safe.
+ */
+ if (dm_queue_merge_is_compulsory(q) && !ti->type->merge)
+ blk_limits_max_hw_sectors(limits,
+ (unsigned int) (PAGE_SIZE >> 9));
+ return 0;
+}
+
+/*
+ * Decrement a device's use count and remove it if necessary.
+ */
+void dm_put_device(struct dm_target *ti, struct dm_dev *d)
+{
+ int found = 0;
+ struct list_head *devices = &ti->table->devices;
+ struct dm_dev_internal *dd;
+
+ list_for_each_entry(dd, devices, list) {
+ if (dd->dm_dev == d) {
+ found = 1;
+ break;
+ }
+ }
+ if (!found) {
+ DMWARN("%s: device %s not in table devices list",
+ dm_device_name(ti->table->md), d->name);
+ return;
+ }
+ if (atomic_dec_and_test(&dd->count)) {
+ dm_put_table_device(ti->table->md, d);
+ list_del(&dd->list);
+ kfree(dd);
+ }
+}
+EXPORT_SYMBOL(dm_put_device);
+
+/*
+ * Checks to see if the target joins onto the end of the table.
+ */
+static int adjoin(struct dm_table *table, struct dm_target *ti)
+{
+ struct dm_target *prev;
+
+ if (!table->num_targets)
+ return !ti->begin;
+
+ prev = &table->targets[table->num_targets - 1];
+ return (ti->begin == (prev->begin + prev->len));
+}
+
+/*
+ * Used to dynamically allocate the arg array.
+ *
+ * We do first allocation with GFP_NOIO because dm-mpath and dm-thin must
+ * process messages even if some device is suspended. These messages have a
+ * small fixed number of arguments.
+ *
+ * On the other hand, dm-switch needs to process bulk data using messages and
+ * excessive use of GFP_NOIO could cause trouble.
+ */
+static char **realloc_argv(unsigned *array_size, char **old_argv)
+{
+ char **argv;
+ unsigned new_size;
+ gfp_t gfp;
+
+ if (*array_size) {
+ new_size = *array_size * 2;
+ gfp = GFP_KERNEL;
+ } else {
+ new_size = 8;
+ gfp = GFP_NOIO;
+ }
+ argv = kmalloc(new_size * sizeof(*argv), gfp);
+ if (argv) {
+ memcpy(argv, old_argv, *array_size * sizeof(*argv));
+ *array_size = new_size;
+ }
+
+ kfree(old_argv);
+ return argv;
+}
+
+/*
+ * Destructively splits up the argument list to pass to ctr.
+ */
+int dm_split_args(int *argc, char ***argvp, char *input)
+{
+ char *start, *end = input, *out, **argv = NULL;
+ unsigned array_size = 0;
+
+ *argc = 0;
+
+ if (!input) {
+ *argvp = NULL;
+ return 0;
+ }
+
+ argv = realloc_argv(&array_size, argv);
+ if (!argv)
+ return -ENOMEM;
+
+ while (1) {
+ /* Skip whitespace */
+ start = skip_spaces(end);
+
+ if (!*start)
+ break; /* success, we hit the end */
+
+ /* 'out' is used to remove any back-quotes */
+ end = out = start;
+ while (*end) {
+ /* Everything apart from '\0' can be quoted */
+ if (*end == '\\' && *(end + 1)) {
+ *out++ = *(end + 1);
+ end += 2;
+ continue;
+ }
+
+ if (isspace(*end))
+ break; /* end of token */
+
+ *out++ = *end++;
+ }
+
+ /* have we already filled the array ? */
+ if ((*argc + 1) > array_size) {
+ argv = realloc_argv(&array_size, argv);
+ if (!argv)
+ return -ENOMEM;
+ }
+
+ /* we know this is whitespace */
+ if (*end)
+ end++;
+
+ /* terminate the string and put it in the array */
+ *out = '\0';
+ argv[*argc] = start;
+ (*argc)++;
+ }
+
+ *argvp = argv;
+ return 0;
+}
+
+/*
+ * Impose necessary and sufficient conditions on a devices's table such
+ * that any incoming bio which respects its logical_block_size can be
+ * processed successfully. If it falls across the boundary between
+ * two or more targets, the size of each piece it gets split into must
+ * be compatible with the logical_block_size of the target processing it.
+ */
+static int validate_hardware_logical_block_alignment(struct dm_table *table,
+ struct queue_limits *limits)
+{
+ /*
+ * This function uses arithmetic modulo the logical_block_size
+ * (in units of 512-byte sectors).
+ */
+ unsigned short device_logical_block_size_sects =
+ limits->logical_block_size >> SECTOR_SHIFT;
+
+ /*
+ * Offset of the start of the next table entry, mod logical_block_size.
+ */
+ unsigned short next_target_start = 0;
+
+ /*
+ * Given an aligned bio that extends beyond the end of a
+ * target, how many sectors must the next target handle?
+ */
+ unsigned short remaining = 0;
+
+ struct dm_target *uninitialized_var(ti);
+ struct queue_limits ti_limits;
+ unsigned i = 0;
+
+ /*
+ * Check each entry in the table in turn.
+ */
+ while (i < dm_table_get_num_targets(table)) {
+ ti = dm_table_get_target(table, i++);
+
+ blk_set_stacking_limits(&ti_limits);
+
+ /* combine all target devices' limits */
+ if (ti->type->iterate_devices)
+ ti->type->iterate_devices(ti, dm_set_device_limits,
+ &ti_limits);
+
+ /*
+ * If the remaining sectors fall entirely within this
+ * table entry are they compatible with its logical_block_size?
+ */
+ if (remaining < ti->len &&
+ remaining & ((ti_limits.logical_block_size >>
+ SECTOR_SHIFT) - 1))
+ break; /* Error */
+
+ next_target_start =
+ (unsigned short) ((next_target_start + ti->len) &
+ (device_logical_block_size_sects - 1));
+ remaining = next_target_start ?
+ device_logical_block_size_sects - next_target_start : 0;
+ }
+
+ if (remaining) {
+ DMWARN("%s: table line %u (start sect %llu len %llu) "
+ "not aligned to h/w logical block size %u",
+ dm_device_name(table->md), i,
+ (unsigned long long) ti->begin,
+ (unsigned long long) ti->len,
+ limits->logical_block_size);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+int dm_table_add_target(struct dm_table *t, const char *type,
+ sector_t start, sector_t len, char *params)
+{
+ int r = -EINVAL, argc;
+ char **argv;
+ struct dm_target *tgt;
+
+ if (t->singleton) {
+ DMERR("%s: target type %s must appear alone in table",
+ dm_device_name(t->md), t->targets->type->name);
+ return -EINVAL;
+ }
+
+ BUG_ON(t->num_targets >= t->num_allocated);
+
+ tgt = t->targets + t->num_targets;
+ memset(tgt, 0, sizeof(*tgt));
+
+ if (!len) {
+ DMERR("%s: zero-length target", dm_device_name(t->md));
+ return -EINVAL;
+ }
+
+ tgt->type = dm_get_target_type(type);
+ if (!tgt->type) {
+ DMERR("%s: %s: unknown target type", dm_device_name(t->md),
+ type);
+ return -EINVAL;
+ }
+
+ if (dm_target_needs_singleton(tgt->type)) {
+ if (t->num_targets) {
+ DMERR("%s: target type %s must appear alone in table",
+ dm_device_name(t->md), type);
+ return -EINVAL;
+ }
+ t->singleton = 1;
+ }
+
+ if (dm_target_always_writeable(tgt->type) && !(t->mode & FMODE_WRITE)) {
+ DMERR("%s: target type %s may not be included in read-only tables",
+ dm_device_name(t->md), type);
+ return -EINVAL;
+ }
+
+ if (t->immutable_target_type) {
+ if (t->immutable_target_type != tgt->type) {
+ DMERR("%s: immutable target type %s cannot be mixed with other target types",
+ dm_device_name(t->md), t->immutable_target_type->name);
+ return -EINVAL;
+ }
+ } else if (dm_target_is_immutable(tgt->type)) {
+ if (t->num_targets) {
+ DMERR("%s: immutable target type %s cannot be mixed with other target types",
+ dm_device_name(t->md), tgt->type->name);
+ return -EINVAL;
+ }
+ t->immutable_target_type = tgt->type;
+ }
+
+ tgt->table = t;
+ tgt->begin = start;
+ tgt->len = len;
+ tgt->error = "Unknown error";
+
+ /*
+ * Does this target adjoin the previous one ?
+ */
+ if (!adjoin(t, tgt)) {
+ tgt->error = "Gap in table";
+ r = -EINVAL;
+ goto bad;
+ }
+
+ r = dm_split_args(&argc, &argv, params);
+ if (r) {
+ tgt->error = "couldn't split parameters (insufficient memory)";
+ goto bad;
+ }
+
+ r = tgt->type->ctr(tgt, argc, argv);
+ kfree(argv);
+ if (r)
+ goto bad;
+
+ t->highs[t->num_targets++] = tgt->begin + tgt->len - 1;
+
+ if (!tgt->num_discard_bios && tgt->discards_supported)
+ DMWARN("%s: %s: ignoring discards_supported because num_discard_bios is zero.",
+ dm_device_name(t->md), type);
+
+ return 0;
+
+ bad:
+ DMERR("%s: %s: %s", dm_device_name(t->md), type, tgt->error);
+ dm_put_target_type(tgt->type);
+ return r;
+}
+
+/*
+ * Target argument parsing helpers.
+ */
+static int validate_next_arg(struct dm_arg *arg, struct dm_arg_set *arg_set,
+ unsigned *value, char **error, unsigned grouped)
+{
+ const char *arg_str = dm_shift_arg(arg_set);
+ char dummy;
+
+ if (!arg_str ||
+ (sscanf(arg_str, "%u%c", value, &dummy) != 1) ||
+ (*value < arg->min) ||
+ (*value > arg->max) ||
+ (grouped && arg_set->argc < *value)) {
+ *error = arg->error;
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+int dm_read_arg(struct dm_arg *arg, struct dm_arg_set *arg_set,
+ unsigned *value, char **error)
+{
+ return validate_next_arg(arg, arg_set, value, error, 0);
+}
+EXPORT_SYMBOL(dm_read_arg);
+
+int dm_read_arg_group(struct dm_arg *arg, struct dm_arg_set *arg_set,
+ unsigned *value, char **error)
+{
+ return validate_next_arg(arg, arg_set, value, error, 1);
+}
+EXPORT_SYMBOL(dm_read_arg_group);
+
+const char *dm_shift_arg(struct dm_arg_set *as)
+{
+ char *r;
+
+ if (as->argc) {
+ as->argc--;
+ r = *as->argv;
+ as->argv++;
+ return r;
+ }
+
+ return NULL;
+}
+EXPORT_SYMBOL(dm_shift_arg);
+
+void dm_consume_args(struct dm_arg_set *as, unsigned num_args)
+{
+ BUG_ON(as->argc < num_args);
+ as->argc -= num_args;
+ as->argv += num_args;
+}
+EXPORT_SYMBOL(dm_consume_args);
+
+static bool __table_type_request_based(unsigned table_type)
+{
+ return (table_type == DM_TYPE_REQUEST_BASED ||
+ table_type == DM_TYPE_MQ_REQUEST_BASED);
+}
+
+static int dm_table_set_type(struct dm_table *t)
+{
+ unsigned i;
+ unsigned bio_based = 0, request_based = 0, hybrid = 0;
+ bool use_blk_mq = false;
+ struct dm_target *tgt;
+ struct dm_dev_internal *dd;
+ struct list_head *devices;
+ unsigned live_md_type = dm_get_md_type(t->md);
+
+ for (i = 0; i < t->num_targets; i++) {
+ tgt = t->targets + i;
+ if (dm_target_hybrid(tgt))
+ hybrid = 1;
+ else if (dm_target_request_based(tgt))
+ request_based = 1;
+ else
+ bio_based = 1;
+
+ if (bio_based && request_based) {
+ DMWARN("Inconsistent table: different target types"
+ " can't be mixed up");
+ return -EINVAL;
+ }
+ }
+
+ if (hybrid && !bio_based && !request_based) {
+ /*
+ * The targets can work either way.
+ * Determine the type from the live device.
+ * Default to bio-based if device is new.
+ */
+ if (__table_type_request_based(live_md_type))
+ request_based = 1;
+ else
+ bio_based = 1;
+ }
+
+ if (bio_based) {
+ /* We must use this table as bio-based */
+ t->type = DM_TYPE_BIO_BASED;
+ return 0;
+ }
+
+ BUG_ON(!request_based); /* No targets in this table */
+
+ /*
+ * Request-based dm supports only tables that have a single target now.
+ * To support multiple targets, request splitting support is needed,
+ * and that needs lots of changes in the block-layer.
+ * (e.g. request completion process for partial completion.)
+ */
+ if (t->num_targets > 1) {
+ DMWARN("Request-based dm doesn't support multiple targets yet");
+ return -EINVAL;
+ }
+
+ /* Non-request-stackable devices can't be used for request-based dm */
+ devices = dm_table_get_devices(t);
+ list_for_each_entry(dd, devices, list) {
+ struct request_queue *q = bdev_get_queue(dd->dm_dev->bdev);
+
+ if (!blk_queue_stackable(q)) {
+ DMERR("table load rejected: including"
+ " non-request-stackable devices");
+ return -EINVAL;
+ }
+
+ if (q->mq_ops)
+ use_blk_mq = true;
+ }
+
+ if (use_blk_mq) {
+ /* verify _all_ devices in the table are blk-mq devices */
+ list_for_each_entry(dd, devices, list)
+ if (!bdev_get_queue(dd->dm_dev->bdev)->mq_ops) {
+ DMERR("table load rejected: not all devices"
+ " are blk-mq request-stackable");
+ return -EINVAL;
+ }
+ t->type = DM_TYPE_MQ_REQUEST_BASED;
+
+ } else if (list_empty(devices) && __table_type_request_based(live_md_type)) {
+ /* inherit live MD type */
+ t->type = live_md_type;
+
+ } else
+ t->type = DM_TYPE_REQUEST_BASED;
+
+ return 0;
+}
+
+unsigned dm_table_get_type(struct dm_table *t)
+{
+ return t->type;
+}
+
+struct target_type *dm_table_get_immutable_target_type(struct dm_table *t)
+{
+ return t->immutable_target_type;
+}
+
+bool dm_table_request_based(struct dm_table *t)
+{
+ return __table_type_request_based(dm_table_get_type(t));
+}
+
+bool dm_table_mq_request_based(struct dm_table *t)
+{
+ return dm_table_get_type(t) == DM_TYPE_MQ_REQUEST_BASED;
+}
+
+static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device *md)
+{
+ unsigned type = dm_table_get_type(t);
+ unsigned per_bio_data_size = 0;
+ struct dm_target *tgt;
+ unsigned i;
+
+ if (unlikely(type == DM_TYPE_NONE)) {
+ DMWARN("no table type is set, can't allocate mempools");
+ return -EINVAL;
+ }
+
+ if (type == DM_TYPE_BIO_BASED)
+ for (i = 0; i < t->num_targets; i++) {
+ tgt = t->targets + i;
+ per_bio_data_size = max(per_bio_data_size, tgt->per_bio_data_size);
+ }
+
+ t->mempools = dm_alloc_md_mempools(md, type, t->integrity_supported, per_bio_data_size);
+ if (!t->mempools)
+ return -ENOMEM;
+
+ return 0;
+}
+
+void dm_table_free_md_mempools(struct dm_table *t)
+{
+ dm_free_md_mempools(t->mempools);
+ t->mempools = NULL;
+}
+
+struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t)
+{
+ return t->mempools;
+}
+
+static int setup_indexes(struct dm_table *t)
+{
+ int i;
+ unsigned int total = 0;
+ sector_t *indexes;
+
+ /* allocate the space for *all* the indexes */
+ for (i = t->depth - 2; i >= 0; i--) {
+ t->counts[i] = dm_div_up(t->counts[i + 1], CHILDREN_PER_NODE);
+ total += t->counts[i];
+ }
+
+ indexes = (sector_t *) dm_vcalloc(total, (unsigned long) NODE_SIZE);
+ if (!indexes)
+ return -ENOMEM;
+
+ /* set up internal nodes, bottom-up */
+ for (i = t->depth - 2; i >= 0; i--) {
+ t->index[i] = indexes;
+ indexes += (KEYS_PER_NODE * t->counts[i]);
+ setup_btree_index(i, t);
+ }
+
+ return 0;
+}
+
+/*
+ * Builds the btree to index the map.
+ */
+static int dm_table_build_index(struct dm_table *t)
+{
+ int r = 0;
+ unsigned int leaf_nodes;
+
+ /* how many indexes will the btree have ? */
+ leaf_nodes = dm_div_up(t->num_targets, KEYS_PER_NODE);
+ t->depth = 1 + int_log(leaf_nodes, CHILDREN_PER_NODE);
+
+ /* leaf layer has already been set up */
+ t->counts[t->depth - 1] = leaf_nodes;
+ t->index[t->depth - 1] = t->highs;
+
+ if (t->depth >= 2)
+ r = setup_indexes(t);
+
+ return r;
+}
+
+/*
+ * Get a disk whose integrity profile reflects the table's profile.
+ * If %match_all is true, all devices' profiles must match.
+ * If %match_all is false, all devices must at least have an
+ * allocated integrity profile; but uninitialized is ok.
+ * Returns NULL if integrity support was inconsistent or unavailable.
+ */
+static struct gendisk * dm_table_get_integrity_disk(struct dm_table *t,
+ bool match_all)
+{
+ struct list_head *devices = dm_table_get_devices(t);
+ struct dm_dev_internal *dd = NULL;
+ struct gendisk *prev_disk = NULL, *template_disk = NULL;
+
+ list_for_each_entry(dd, devices, list) {
+ template_disk = dd->dm_dev->bdev->bd_disk;
+ if (!blk_get_integrity(template_disk))
+ goto no_integrity;
+ if (!match_all && !blk_integrity_is_initialized(template_disk))
+ continue; /* skip uninitialized profiles */
+ else if (prev_disk &&
+ blk_integrity_compare(prev_disk, template_disk) < 0)
+ goto no_integrity;
+ prev_disk = template_disk;
+ }
+
+ return template_disk;
+
+no_integrity:
+ if (prev_disk)
+ DMWARN("%s: integrity not set: %s and %s profile mismatch",
+ dm_device_name(t->md),
+ prev_disk->disk_name,
+ template_disk->disk_name);
+ return NULL;
+}
+
+/*
+ * Register the mapped device for blk_integrity support if
+ * the underlying devices have an integrity profile. But all devices
+ * may not have matching profiles (checking all devices isn't reliable
+ * during table load because this table may use other DM device(s) which
+ * must be resumed before they will have an initialized integity profile).
+ * Stacked DM devices force a 2 stage integrity profile validation:
+ * 1 - during load, validate all initialized integrity profiles match
+ * 2 - during resume, validate all integrity profiles match
+ */
+static int dm_table_prealloc_integrity(struct dm_table *t, struct mapped_device *md)
+{
+ struct gendisk *template_disk = NULL;
+
+ template_disk = dm_table_get_integrity_disk(t, false);
+ if (!template_disk)
+ return 0;
+
+ if (!blk_integrity_is_initialized(dm_disk(md))) {
+ t->integrity_supported = 1;
+ return blk_integrity_register(dm_disk(md), NULL);
+ }
+
+ /*
+ * If DM device already has an initalized integrity
+ * profile the new profile should not conflict.
+ */
+ if (blk_integrity_is_initialized(template_disk) &&
+ blk_integrity_compare(dm_disk(md), template_disk) < 0) {
+ DMWARN("%s: conflict with existing integrity profile: "
+ "%s profile mismatch",
+ dm_device_name(t->md),
+ template_disk->disk_name);
+ return 1;
+ }
+
+ /* Preserve existing initialized integrity profile */
+ t->integrity_supported = 1;
+ return 0;
+}
+
+/*
+ * Prepares the table for use by building the indices,
+ * setting the type, and allocating mempools.
+ */
+int dm_table_complete(struct dm_table *t)
+{
+ int r;
+
+ r = dm_table_set_type(t);
+ if (r) {
+ DMERR("unable to set table type");
+ return r;
+ }
+
+ r = dm_table_build_index(t);
+ if (r) {
+ DMERR("unable to build btrees");
+ return r;
+ }
+
+ r = dm_table_prealloc_integrity(t, t->md);
+ if (r) {
+ DMERR("could not register integrity profile.");
+ return r;
+ }
+
+ r = dm_table_alloc_md_mempools(t, t->md);
+ if (r)
+ DMERR("unable to allocate mempools");
+
+ return r;
+}
+
+static DEFINE_MUTEX(_event_lock);
+void dm_table_event_callback(struct dm_table *t,
+ void (*fn)(void *), void *context)
+{
+ mutex_lock(&_event_lock);
+ t->event_fn = fn;
+ t->event_context = context;
+ mutex_unlock(&_event_lock);
+}
+
+void dm_table_event(struct dm_table *t)
+{
+ /*
+ * You can no longer call dm_table_event() from interrupt
+ * context, use a bottom half instead.
+ */
+ BUG_ON(in_interrupt());
+
+ mutex_lock(&_event_lock);
+ if (t->event_fn)
+ t->event_fn(t->event_context);
+ mutex_unlock(&_event_lock);
+}
+EXPORT_SYMBOL(dm_table_event);
+
+sector_t dm_table_get_size(struct dm_table *t)
+{
+ return t->num_targets ? (t->highs[t->num_targets - 1] + 1) : 0;
+}
+EXPORT_SYMBOL(dm_table_get_size);
+
+struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index)
+{
+ if (index >= t->num_targets)
+ return NULL;
+
+ return t->targets + index;
+}
+
+/*
+ * Search the btree for the correct target.
+ *
+ * Caller should check returned pointer with dm_target_is_valid()
+ * to trap I/O beyond end of device.
+ */
+struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector)
+{
+ unsigned int l, n = 0, k = 0;
+ sector_t *node;
+
+ for (l = 0; l < t->depth; l++) {
+ n = get_child(n, k);
+ node = get_node(t, l, n);
+
+ for (k = 0; k < KEYS_PER_NODE; k++)
+ if (node[k] >= sector)
+ break;
+ }
+
+ return &t->targets[(KEYS_PER_NODE * n) + k];
+}
+
+static int count_device(struct dm_target *ti, struct dm_dev *dev,
+ sector_t start, sector_t len, void *data)
+{
+ unsigned *num_devices = data;
+
+ (*num_devices)++;
+
+ return 0;
+}
+
+/*
+ * Check whether a table has no data devices attached using each
+ * target's iterate_devices method.
+ * Returns false if the result is unknown because a target doesn't
+ * support iterate_devices.
+ */
+bool dm_table_has_no_data_devices(struct dm_table *table)
+{
+ struct dm_target *uninitialized_var(ti);
+ unsigned i = 0, num_devices = 0;
+
+ while (i < dm_table_get_num_targets(table)) {
+ ti = dm_table_get_target(table, i++);
+
+ if (!ti->type->iterate_devices)
+ return false;
+
+ ti->type->iterate_devices(ti, count_device, &num_devices);
+ if (num_devices)
+ return false;
+ }
+
+ return true;
+}
+
+/*
+ * Establish the new table's queue_limits and validate them.
+ */
+int dm_calculate_queue_limits(struct dm_table *table,
+ struct queue_limits *limits)
+{
+ struct dm_target *uninitialized_var(ti);
+ struct queue_limits ti_limits;
+ unsigned i = 0;
+
+ blk_set_stacking_limits(limits);
+
+ while (i < dm_table_get_num_targets(table)) {
+ blk_set_stacking_limits(&ti_limits);
+
+ ti = dm_table_get_target(table, i++);
+
+ if (!ti->type->iterate_devices)
+ goto combine_limits;
+
+ /*
+ * Combine queue limits of all the devices this target uses.
+ */
+ ti->type->iterate_devices(ti, dm_set_device_limits,
+ &ti_limits);
+
+ /* Set I/O hints portion of queue limits */
+ if (ti->type->io_hints)
+ ti->type->io_hints(ti, &ti_limits);
+
+ /*
+ * Check each device area is consistent with the target's
+ * overall queue limits.
+ */
+ if (ti->type->iterate_devices(ti, device_area_is_invalid,
+ &ti_limits))
+ return -EINVAL;
+
+combine_limits:
+ /*
+ * Merge this target's queue limits into the overall limits
+ * for the table.
+ */
+ if (blk_stack_limits(limits, &ti_limits, 0) < 0)
+ DMWARN("%s: adding target device "
+ "(start sect %llu len %llu) "
+ "caused an alignment inconsistency",
+ dm_device_name(table->md),
+ (unsigned long long) ti->begin,
+ (unsigned long long) ti->len);
+ }
+
+ return validate_hardware_logical_block_alignment(table, limits);
+}
+
+/*
+ * Set the integrity profile for this device if all devices used have
+ * matching profiles. We're quite deep in the resume path but still
+ * don't know if all devices (particularly DM devices this device
+ * may be stacked on) have matching profiles. Even if the profiles
+ * don't match we have no way to fail (to resume) at this point.
+ */
+static void dm_table_set_integrity(struct dm_table *t)
+{
+ struct gendisk *template_disk = NULL;
+
+ if (!blk_get_integrity(dm_disk(t->md)))
+ return;
+
+ template_disk = dm_table_get_integrity_disk(t, true);
+ if (template_disk)
+ blk_integrity_register(dm_disk(t->md),
+ blk_get_integrity(template_disk));
+ else if (blk_integrity_is_initialized(dm_disk(t->md)))
+ DMWARN("%s: device no longer has a valid integrity profile",
+ dm_device_name(t->md));
+ else
+ DMWARN("%s: unable to establish an integrity profile",
+ dm_device_name(t->md));
+}
+
+static int device_flush_capable(struct dm_target *ti, struct dm_dev *dev,
+ sector_t start, sector_t len, void *data)
+{
+ unsigned flush = (*(unsigned *)data);
+ struct request_queue *q = bdev_get_queue(dev->bdev);
+
+ return q && (q->flush_flags & flush);
+}
+
+static bool dm_table_supports_flush(struct dm_table *t, unsigned flush)
+{
+ struct dm_target *ti;
+ unsigned i = 0;
+
+ /*
+ * Require at least one underlying device to support flushes.
+ * t->devices includes internal dm devices such as mirror logs
+ * so we need to use iterate_devices here, which targets
+ * supporting flushes must provide.
+ */
+ while (i < dm_table_get_num_targets(t)) {
+ ti = dm_table_get_target(t, i++);
+
+ if (!ti->num_flush_bios)
+ continue;
+
+ if (ti->flush_supported)
+ return true;
+
+ if (ti->type->iterate_devices &&
+ ti->type->iterate_devices(ti, device_flush_capable, &flush))
+ return true;
+ }
+
+ return false;
+}
+
+static bool dm_table_discard_zeroes_data(struct dm_table *t)
+{
+ struct dm_target *ti;
+ unsigned i = 0;
+
+ /* Ensure that all targets supports discard_zeroes_data. */
+ while (i < dm_table_get_num_targets(t)) {
+ ti = dm_table_get_target(t, i++);
+
+ if (ti->discard_zeroes_data_unsupported)
+ return false;
+ }
+
+ return true;
+}
+
+static int device_is_nonrot(struct dm_target *ti, struct dm_dev *dev,
+ sector_t start, sector_t len, void *data)
+{
+ struct request_queue *q = bdev_get_queue(dev->bdev);
+
+ return q && blk_queue_nonrot(q);
+}
+
+static int device_is_not_random(struct dm_target *ti, struct dm_dev *dev,
+ sector_t start, sector_t len, void *data)
+{
+ struct request_queue *q = bdev_get_queue(dev->bdev);
+
+ return q && !blk_queue_add_random(q);
+}
+
+static int queue_supports_sg_merge(struct dm_target *ti, struct dm_dev *dev,
+ sector_t start, sector_t len, void *data)
+{
+ struct request_queue *q = bdev_get_queue(dev->bdev);
+
+ return q && !test_bit(QUEUE_FLAG_NO_SG_MERGE, &q->queue_flags);
+}
+
+static int queue_supports_sg_gaps(struct dm_target *ti, struct dm_dev *dev,
+ sector_t start, sector_t len, void *data)
+{
+ struct request_queue *q = bdev_get_queue(dev->bdev);
+
+ return q && !test_bit(QUEUE_FLAG_SG_GAPS, &q->queue_flags);
+}
+
+static bool dm_table_all_devices_attribute(struct dm_table *t,
+ iterate_devices_callout_fn func)
+{
+ struct dm_target *ti;
+ unsigned i = 0;
+
+ while (i < dm_table_get_num_targets(t)) {
+ ti = dm_table_get_target(t, i++);
+
+ if (!ti->type->iterate_devices ||
+ !ti->type->iterate_devices(ti, func, NULL))
+ return false;
+ }
+
+ return true;
+}
+
+static int device_not_write_same_capable(struct dm_target *ti, struct dm_dev *dev,
+ sector_t start, sector_t len, void *data)
+{
+ struct request_queue *q = bdev_get_queue(dev->bdev);
+
+ return q && !q->limits.max_write_same_sectors;
+}
+
+static bool dm_table_supports_write_same(struct dm_table *t)
+{
+ struct dm_target *ti;
+ unsigned i = 0;
+
+ while (i < dm_table_get_num_targets(t)) {
+ ti = dm_table_get_target(t, i++);
+
+ if (!ti->num_write_same_bios)
+ return false;
+
+ if (!ti->type->iterate_devices ||
+ ti->type->iterate_devices(ti, device_not_write_same_capable, NULL))
+ return false;
+ }
+
+ return true;
+}
+
+static int device_discard_capable(struct dm_target *ti, struct dm_dev *dev,
+ sector_t start, sector_t len, void *data)
+{
+ struct request_queue *q = bdev_get_queue(dev->bdev);
+
+ return q && blk_queue_discard(q);
+}
+
+static bool dm_table_supports_discards(struct dm_table *t)
+{
+ struct dm_target *ti;
+ unsigned i = 0;
+
+ /*
+ * Unless any target used by the table set discards_supported,
+ * require at least one underlying device to support discards.
+ * t->devices includes internal dm devices such as mirror logs
+ * so we need to use iterate_devices here, which targets
+ * supporting discard selectively must provide.
+ */
+ while (i < dm_table_get_num_targets(t)) {
+ ti = dm_table_get_target(t, i++);
+
+ if (!ti->num_discard_bios)
+ continue;
+
+ if (ti->discards_supported)
+ return true;
+
+ if (ti->type->iterate_devices &&
+ ti->type->iterate_devices(ti, device_discard_capable, NULL))
+ return true;
+ }
+
+ return false;
+}
+
+void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
+ struct queue_limits *limits)
+{
+ unsigned flush = 0;
+
+ /*
+ * Copy table's limits to the DM device's request_queue
+ */
+ q->limits = *limits;
+
+ if (!dm_table_supports_discards(t))
+ queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, q);
+ else
+ queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
+
+ if (dm_table_supports_flush(t, REQ_FLUSH)) {
+ flush |= REQ_FLUSH;
+ if (dm_table_supports_flush(t, REQ_FUA))
+ flush |= REQ_FUA;
+ }
+ blk_queue_flush(q, flush);
+
+ if (!dm_table_discard_zeroes_data(t))
+ q->limits.discard_zeroes_data = 0;
+
+ /* Ensure that all underlying devices are non-rotational. */
+ if (dm_table_all_devices_attribute(t, device_is_nonrot))
+ queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q);
+ else
+ queue_flag_clear_unlocked(QUEUE_FLAG_NONROT, q);
+
+ if (!dm_table_supports_write_same(t))
+ q->limits.max_write_same_sectors = 0;
+
+ if (dm_table_all_devices_attribute(t, queue_supports_sg_merge))
+ queue_flag_clear_unlocked(QUEUE_FLAG_NO_SG_MERGE, q);
+ else
+ queue_flag_set_unlocked(QUEUE_FLAG_NO_SG_MERGE, q);
+
+ if (dm_table_all_devices_attribute(t, queue_supports_sg_gaps))
+ queue_flag_clear_unlocked(QUEUE_FLAG_SG_GAPS, q);
+ else
+ queue_flag_set_unlocked(QUEUE_FLAG_SG_GAPS, q);
+
+ dm_table_set_integrity(t);
+
+ /*
+ * Determine whether or not this queue's I/O timings contribute
+ * to the entropy pool, Only request-based targets use this.
+ * Clear QUEUE_FLAG_ADD_RANDOM if any underlying device does not
+ * have it set.
+ */
+ if (blk_queue_add_random(q) && dm_table_all_devices_attribute(t, device_is_not_random))
+ queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, q);
+
+ /*
+ * QUEUE_FLAG_STACKABLE must be set after all queue settings are
+ * visible to other CPUs because, once the flag is set, incoming bios
+ * are processed by request-based dm, which refers to the queue
+ * settings.
+ * Until the flag set, bios are passed to bio-based dm and queued to
+ * md->deferred where queue settings are not needed yet.
+ * Those bios are passed to request-based dm at the resume time.
+ */
+ smp_mb();
+ if (dm_table_request_based(t))
+ queue_flag_set_unlocked(QUEUE_FLAG_STACKABLE, q);
+}
+
+unsigned int dm_table_get_num_targets(struct dm_table *t)
+{
+ return t->num_targets;
+}
+
+struct list_head *dm_table_get_devices(struct dm_table *t)
+{
+ return &t->devices;
+}
+
+fmode_t dm_table_get_mode(struct dm_table *t)
+{
+ return t->mode;
+}
+EXPORT_SYMBOL(dm_table_get_mode);
+
+enum suspend_mode {
+ PRESUSPEND,
+ PRESUSPEND_UNDO,
+ POSTSUSPEND,
+};
+
+static void suspend_targets(struct dm_table *t, enum suspend_mode mode)
+{
+ int i = t->num_targets;
+ struct dm_target *ti = t->targets;
+
+ while (i--) {
+ switch (mode) {
+ case PRESUSPEND:
+ if (ti->type->presuspend)
+ ti->type->presuspend(ti);
+ break;
+ case PRESUSPEND_UNDO:
+ if (ti->type->presuspend_undo)
+ ti->type->presuspend_undo(ti);
+ break;
+ case POSTSUSPEND:
+ if (ti->type->postsuspend)
+ ti->type->postsuspend(ti);
+ break;
+ }
+ ti++;
+ }
+}
+
+void dm_table_presuspend_targets(struct dm_table *t)
+{
+ if (!t)
+ return;
+
+ suspend_targets(t, PRESUSPEND);
+}
+
+void dm_table_presuspend_undo_targets(struct dm_table *t)
+{
+ if (!t)
+ return;
+
+ suspend_targets(t, PRESUSPEND_UNDO);
+}
+
+void dm_table_postsuspend_targets(struct dm_table *t)
+{
+ if (!t)
+ return;
+
+ suspend_targets(t, POSTSUSPEND);
+}
+
+int dm_table_resume_targets(struct dm_table *t)
+{
+ int i, r = 0;
+
+ for (i = 0; i < t->num_targets; i++) {
+ struct dm_target *ti = t->targets + i;
+
+ if (!ti->type->preresume)
+ continue;
+
+ r = ti->type->preresume(ti);
+ if (r) {
+ DMERR("%s: %s: preresume failed, error = %d",
+ dm_device_name(t->md), ti->type->name, r);
+ return r;
+ }
+ }
+
+ for (i = 0; i < t->num_targets; i++) {
+ struct dm_target *ti = t->targets + i;
+
+ if (ti->type->resume)
+ ti->type->resume(ti);
+ }
+
+ return 0;
+}
+
+void dm_table_add_target_callbacks(struct dm_table *t, struct dm_target_callbacks *cb)
+{
+ list_add(&cb->list, &t->target_callbacks);
+}
+EXPORT_SYMBOL_GPL(dm_table_add_target_callbacks);
+
+int dm_table_any_congested(struct dm_table *t, int bdi_bits)
+{
+ struct dm_dev_internal *dd;
+ struct list_head *devices = dm_table_get_devices(t);
+ struct dm_target_callbacks *cb;
+ int r = 0;
+
+ list_for_each_entry(dd, devices, list) {
+ struct request_queue *q = bdev_get_queue(dd->dm_dev->bdev);
+ char b[BDEVNAME_SIZE];
+
+ if (likely(q))
+ r |= bdi_congested(&q->backing_dev_info, bdi_bits);
+ else
+ DMWARN_LIMIT("%s: any_congested: nonexistent device %s",
+ dm_device_name(t->md),
+ bdevname(dd->dm_dev->bdev, b));
+ }
+
+ list_for_each_entry(cb, &t->target_callbacks, list)
+ if (cb->congested_fn)
+ r |= cb->congested_fn(cb, bdi_bits);
+
+ return r;
+}
+
+struct mapped_device *dm_table_get_md(struct dm_table *t)
+{
+ return t->md;
+}
+EXPORT_SYMBOL(dm_table_get_md);
+
+void dm_table_run_md_queue_async(struct dm_table *t)
+{
+ struct mapped_device *md;
+ struct request_queue *queue;
+ unsigned long flags;
+
+ if (!dm_table_request_based(t))
+ return;
+
+ md = dm_table_get_md(t);
+ queue = dm_get_md_queue(md);
+ if (queue) {
+ if (queue->mq_ops)
+ blk_mq_run_hw_queues(queue, true);
+ else {
+ spin_lock_irqsave(queue->queue_lock, flags);
+ blk_run_queue_async(queue);
+ spin_unlock_irqrestore(queue->queue_lock, flags);
+ }
+ }
+}
+EXPORT_SYMBOL(dm_table_run_md_queue_async);
+
diff --git a/drivers/md/dm-target.c b/drivers/md/dm-target.c
new file mode 100644
index 000000000..925ec1b15
--- /dev/null
+++ b/drivers/md/dm-target.c
@@ -0,0 +1,173 @@
+/*
+ * Copyright (C) 2001 Sistina Software (UK) Limited
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm.h"
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kmod.h>
+#include <linux/bio.h>
+
+#define DM_MSG_PREFIX "target"
+
+static LIST_HEAD(_targets);
+static DECLARE_RWSEM(_lock);
+
+#define DM_MOD_NAME_SIZE 32
+
+static inline struct target_type *__find_target_type(const char *name)
+{
+ struct target_type *tt;
+
+ list_for_each_entry(tt, &_targets, list)
+ if (!strcmp(name, tt->name))
+ return tt;
+
+ return NULL;
+}
+
+static struct target_type *get_target_type(const char *name)
+{
+ struct target_type *tt;
+
+ down_read(&_lock);
+
+ tt = __find_target_type(name);
+ if (tt && !try_module_get(tt->module))
+ tt = NULL;
+
+ up_read(&_lock);
+ return tt;
+}
+
+static void load_module(const char *name)
+{
+ request_module("dm-%s", name);
+}
+
+struct target_type *dm_get_target_type(const char *name)
+{
+ struct target_type *tt = get_target_type(name);
+
+ if (!tt) {
+ load_module(name);
+ tt = get_target_type(name);
+ }
+
+ return tt;
+}
+
+void dm_put_target_type(struct target_type *tt)
+{
+ down_read(&_lock);
+ module_put(tt->module);
+ up_read(&_lock);
+}
+
+int dm_target_iterate(void (*iter_func)(struct target_type *tt,
+ void *param), void *param)
+{
+ struct target_type *tt;
+
+ down_read(&_lock);
+ list_for_each_entry(tt, &_targets, list)
+ iter_func(tt, param);
+ up_read(&_lock);
+
+ return 0;
+}
+
+int dm_register_target(struct target_type *tt)
+{
+ int rv = 0;
+
+ down_write(&_lock);
+ if (__find_target_type(tt->name))
+ rv = -EEXIST;
+ else
+ list_add(&tt->list, &_targets);
+
+ up_write(&_lock);
+ return rv;
+}
+
+void dm_unregister_target(struct target_type *tt)
+{
+ down_write(&_lock);
+ if (!__find_target_type(tt->name)) {
+ DMCRIT("Unregistering unrecognised target: %s", tt->name);
+ BUG();
+ }
+
+ list_del(&tt->list);
+
+ up_write(&_lock);
+}
+
+/*
+ * io-err: always fails an io, useful for bringing
+ * up LVs that have holes in them.
+ */
+static int io_err_ctr(struct dm_target *tt, unsigned int argc, char **args)
+{
+ /*
+ * Return error for discards instead of -EOPNOTSUPP
+ */
+ tt->num_discard_bios = 1;
+
+ return 0;
+}
+
+static void io_err_dtr(struct dm_target *tt)
+{
+ /* empty */
+}
+
+static int io_err_map(struct dm_target *tt, struct bio *bio)
+{
+ return -EIO;
+}
+
+static int io_err_map_rq(struct dm_target *ti, struct request *clone,
+ union map_info *map_context)
+{
+ return -EIO;
+}
+
+static int io_err_clone_and_map_rq(struct dm_target *ti, struct request *rq,
+ union map_info *map_context,
+ struct request **clone)
+{
+ return -EIO;
+}
+
+static void io_err_release_clone_rq(struct request *clone)
+{
+}
+
+static struct target_type error_target = {
+ .name = "error",
+ .version = {1, 3, 0},
+ .ctr = io_err_ctr,
+ .dtr = io_err_dtr,
+ .map = io_err_map,
+ .map_rq = io_err_map_rq,
+ .clone_and_map_rq = io_err_clone_and_map_rq,
+ .release_clone_rq = io_err_release_clone_rq,
+};
+
+int __init dm_target_init(void)
+{
+ return dm_register_target(&error_target);
+}
+
+void dm_target_exit(void)
+{
+ dm_unregister_target(&error_target);
+}
+
+EXPORT_SYMBOL(dm_register_target);
+EXPORT_SYMBOL(dm_unregister_target);
diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c
new file mode 100644
index 000000000..79f694120
--- /dev/null
+++ b/drivers/md/dm-thin-metadata.c
@@ -0,0 +1,1807 @@
+/*
+ * Copyright (C) 2011-2012 Red Hat, Inc.
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm-thin-metadata.h"
+#include "persistent-data/dm-btree.h"
+#include "persistent-data/dm-space-map.h"
+#include "persistent-data/dm-space-map-disk.h"
+#include "persistent-data/dm-transaction-manager.h"
+
+#include <linux/list.h>
+#include <linux/device-mapper.h>
+#include <linux/workqueue.h>
+
+/*--------------------------------------------------------------------------
+ * As far as the metadata goes, there is:
+ *
+ * - A superblock in block zero, taking up fewer than 512 bytes for
+ * atomic writes.
+ *
+ * - A space map managing the metadata blocks.
+ *
+ * - A space map managing the data blocks.
+ *
+ * - A btree mapping our internal thin dev ids onto struct disk_device_details.
+ *
+ * - A hierarchical btree, with 2 levels which effectively maps (thin
+ * dev id, virtual block) -> block_time. Block time is a 64-bit
+ * field holding the time in the low 24 bits, and block in the top 48
+ * bits.
+ *
+ * BTrees consist solely of btree_nodes, that fill a block. Some are
+ * internal nodes, as such their values are a __le64 pointing to other
+ * nodes. Leaf nodes can store data of any reasonable size (ie. much
+ * smaller than the block size). The nodes consist of the header,
+ * followed by an array of keys, followed by an array of values. We have
+ * to binary search on the keys so they're all held together to help the
+ * cpu cache.
+ *
+ * Space maps have 2 btrees:
+ *
+ * - One maps a uint64_t onto a struct index_entry. Which points to a
+ * bitmap block, and has some details about how many free entries there
+ * are etc.
+ *
+ * - The bitmap blocks have a header (for the checksum). Then the rest
+ * of the block is pairs of bits. With the meaning being:
+ *
+ * 0 - ref count is 0
+ * 1 - ref count is 1
+ * 2 - ref count is 2
+ * 3 - ref count is higher than 2
+ *
+ * - If the count is higher than 2 then the ref count is entered in a
+ * second btree that directly maps the block_address to a uint32_t ref
+ * count.
+ *
+ * The space map metadata variant doesn't have a bitmaps btree. Instead
+ * it has one single blocks worth of index_entries. This avoids
+ * recursive issues with the bitmap btree needing to allocate space in
+ * order to insert. With a small data block size such as 64k the
+ * metadata support data devices that are hundreds of terrabytes.
+ *
+ * The space maps allocate space linearly from front to back. Space that
+ * is freed in a transaction is never recycled within that transaction.
+ * To try and avoid fragmenting _free_ space the allocator always goes
+ * back and fills in gaps.
+ *
+ * All metadata io is in THIN_METADATA_BLOCK_SIZE sized/aligned chunks
+ * from the block manager.
+ *--------------------------------------------------------------------------*/
+
+#define DM_MSG_PREFIX "thin metadata"
+
+#define THIN_SUPERBLOCK_MAGIC 27022010
+#define THIN_SUPERBLOCK_LOCATION 0
+#define THIN_VERSION 2
+#define THIN_METADATA_CACHE_SIZE 64
+#define SECTOR_TO_BLOCK_SHIFT 3
+
+/*
+ * 3 for btree insert +
+ * 2 for btree lookup used within space map
+ */
+#define THIN_MAX_CONCURRENT_LOCKS 5
+
+/* This should be plenty */
+#define SPACE_MAP_ROOT_SIZE 128
+
+/*
+ * Little endian on-disk superblock and device details.
+ */
+struct thin_disk_superblock {
+ __le32 csum; /* Checksum of superblock except for this field. */
+ __le32 flags;
+ __le64 blocknr; /* This block number, dm_block_t. */
+
+ __u8 uuid[16];
+ __le64 magic;
+ __le32 version;
+ __le32 time;
+
+ __le64 trans_id;
+
+ /*
+ * Root held by userspace transactions.
+ */
+ __le64 held_root;
+
+ __u8 data_space_map_root[SPACE_MAP_ROOT_SIZE];
+ __u8 metadata_space_map_root[SPACE_MAP_ROOT_SIZE];
+
+ /*
+ * 2-level btree mapping (dev_id, (dev block, time)) -> data block
+ */
+ __le64 data_mapping_root;
+
+ /*
+ * Device detail root mapping dev_id -> device_details
+ */
+ __le64 device_details_root;
+
+ __le32 data_block_size; /* In 512-byte sectors. */
+
+ __le32 metadata_block_size; /* In 512-byte sectors. */
+ __le64 metadata_nr_blocks;
+
+ __le32 compat_flags;
+ __le32 compat_ro_flags;
+ __le32 incompat_flags;
+} __packed;
+
+struct disk_device_details {
+ __le64 mapped_blocks;
+ __le64 transaction_id; /* When created. */
+ __le32 creation_time;
+ __le32 snapshotted_time;
+} __packed;
+
+struct dm_pool_metadata {
+ struct hlist_node hash;
+
+ struct block_device *bdev;
+ struct dm_block_manager *bm;
+ struct dm_space_map *metadata_sm;
+ struct dm_space_map *data_sm;
+ struct dm_transaction_manager *tm;
+ struct dm_transaction_manager *nb_tm;
+
+ /*
+ * Two-level btree.
+ * First level holds thin_dev_t.
+ * Second level holds mappings.
+ */
+ struct dm_btree_info info;
+
+ /*
+ * Non-blocking version of the above.
+ */
+ struct dm_btree_info nb_info;
+
+ /*
+ * Just the top level for deleting whole devices.
+ */
+ struct dm_btree_info tl_info;
+
+ /*
+ * Just the bottom level for creating new devices.
+ */
+ struct dm_btree_info bl_info;
+
+ /*
+ * Describes the device details btree.
+ */
+ struct dm_btree_info details_info;
+
+ struct rw_semaphore root_lock;
+ uint32_t time;
+ dm_block_t root;
+ dm_block_t details_root;
+ struct list_head thin_devices;
+ uint64_t trans_id;
+ unsigned long flags;
+ sector_t data_block_size;
+ bool read_only:1;
+
+ /*
+ * Set if a transaction has to be aborted but the attempt to roll back
+ * to the previous (good) transaction failed. The only pool metadata
+ * operation possible in this state is the closing of the device.
+ */
+ bool fail_io:1;
+
+ /*
+ * Reading the space map roots can fail, so we read it into these
+ * buffers before the superblock is locked and updated.
+ */
+ __u8 data_space_map_root[SPACE_MAP_ROOT_SIZE];
+ __u8 metadata_space_map_root[SPACE_MAP_ROOT_SIZE];
+};
+
+struct dm_thin_device {
+ struct list_head list;
+ struct dm_pool_metadata *pmd;
+ dm_thin_id id;
+
+ int open_count;
+ bool changed:1;
+ bool aborted_with_changes:1;
+ uint64_t mapped_blocks;
+ uint64_t transaction_id;
+ uint32_t creation_time;
+ uint32_t snapshotted_time;
+};
+
+/*----------------------------------------------------------------
+ * superblock validator
+ *--------------------------------------------------------------*/
+
+#define SUPERBLOCK_CSUM_XOR 160774
+
+static void sb_prepare_for_write(struct dm_block_validator *v,
+ struct dm_block *b,
+ size_t block_size)
+{
+ struct thin_disk_superblock *disk_super = dm_block_data(b);
+
+ disk_super->blocknr = cpu_to_le64(dm_block_location(b));
+ disk_super->csum = cpu_to_le32(dm_bm_checksum(&disk_super->flags,
+ block_size - sizeof(__le32),
+ SUPERBLOCK_CSUM_XOR));
+}
+
+static int sb_check(struct dm_block_validator *v,
+ struct dm_block *b,
+ size_t block_size)
+{
+ struct thin_disk_superblock *disk_super = dm_block_data(b);
+ __le32 csum_le;
+
+ if (dm_block_location(b) != le64_to_cpu(disk_super->blocknr)) {
+ DMERR("sb_check failed: blocknr %llu: "
+ "wanted %llu", le64_to_cpu(disk_super->blocknr),
+ (unsigned long long)dm_block_location(b));
+ return -ENOTBLK;
+ }
+
+ if (le64_to_cpu(disk_super->magic) != THIN_SUPERBLOCK_MAGIC) {
+ DMERR("sb_check failed: magic %llu: "
+ "wanted %llu", le64_to_cpu(disk_super->magic),
+ (unsigned long long)THIN_SUPERBLOCK_MAGIC);
+ return -EILSEQ;
+ }
+
+ csum_le = cpu_to_le32(dm_bm_checksum(&disk_super->flags,
+ block_size - sizeof(__le32),
+ SUPERBLOCK_CSUM_XOR));
+ if (csum_le != disk_super->csum) {
+ DMERR("sb_check failed: csum %u: wanted %u",
+ le32_to_cpu(csum_le), le32_to_cpu(disk_super->csum));
+ return -EILSEQ;
+ }
+
+ return 0;
+}
+
+static struct dm_block_validator sb_validator = {
+ .name = "superblock",
+ .prepare_for_write = sb_prepare_for_write,
+ .check = sb_check
+};
+
+/*----------------------------------------------------------------
+ * Methods for the btree value types
+ *--------------------------------------------------------------*/
+
+static uint64_t pack_block_time(dm_block_t b, uint32_t t)
+{
+ return (b << 24) | t;
+}
+
+static void unpack_block_time(uint64_t v, dm_block_t *b, uint32_t *t)
+{
+ *b = v >> 24;
+ *t = v & ((1 << 24) - 1);
+}
+
+static void data_block_inc(void *context, const void *value_le)
+{
+ struct dm_space_map *sm = context;
+ __le64 v_le;
+ uint64_t b;
+ uint32_t t;
+
+ memcpy(&v_le, value_le, sizeof(v_le));
+ unpack_block_time(le64_to_cpu(v_le), &b, &t);
+ dm_sm_inc_block(sm, b);
+}
+
+static void data_block_dec(void *context, const void *value_le)
+{
+ struct dm_space_map *sm = context;
+ __le64 v_le;
+ uint64_t b;
+ uint32_t t;
+
+ memcpy(&v_le, value_le, sizeof(v_le));
+ unpack_block_time(le64_to_cpu(v_le), &b, &t);
+ dm_sm_dec_block(sm, b);
+}
+
+static int data_block_equal(void *context, const void *value1_le, const void *value2_le)
+{
+ __le64 v1_le, v2_le;
+ uint64_t b1, b2;
+ uint32_t t;
+
+ memcpy(&v1_le, value1_le, sizeof(v1_le));
+ memcpy(&v2_le, value2_le, sizeof(v2_le));
+ unpack_block_time(le64_to_cpu(v1_le), &b1, &t);
+ unpack_block_time(le64_to_cpu(v2_le), &b2, &t);
+
+ return b1 == b2;
+}
+
+static void subtree_inc(void *context, const void *value)
+{
+ struct dm_btree_info *info = context;
+ __le64 root_le;
+ uint64_t root;
+
+ memcpy(&root_le, value, sizeof(root_le));
+ root = le64_to_cpu(root_le);
+ dm_tm_inc(info->tm, root);
+}
+
+static void subtree_dec(void *context, const void *value)
+{
+ struct dm_btree_info *info = context;
+ __le64 root_le;
+ uint64_t root;
+
+ memcpy(&root_le, value, sizeof(root_le));
+ root = le64_to_cpu(root_le);
+ if (dm_btree_del(info, root))
+ DMERR("btree delete failed\n");
+}
+
+static int subtree_equal(void *context, const void *value1_le, const void *value2_le)
+{
+ __le64 v1_le, v2_le;
+ memcpy(&v1_le, value1_le, sizeof(v1_le));
+ memcpy(&v2_le, value2_le, sizeof(v2_le));
+
+ return v1_le == v2_le;
+}
+
+/*----------------------------------------------------------------*/
+
+static int superblock_lock_zero(struct dm_pool_metadata *pmd,
+ struct dm_block **sblock)
+{
+ return dm_bm_write_lock_zero(pmd->bm, THIN_SUPERBLOCK_LOCATION,
+ &sb_validator, sblock);
+}
+
+static int superblock_lock(struct dm_pool_metadata *pmd,
+ struct dm_block **sblock)
+{
+ return dm_bm_write_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION,
+ &sb_validator, sblock);
+}
+
+static int __superblock_all_zeroes(struct dm_block_manager *bm, int *result)
+{
+ int r;
+ unsigned i;
+ struct dm_block *b;
+ __le64 *data_le, zero = cpu_to_le64(0);
+ unsigned block_size = dm_bm_block_size(bm) / sizeof(__le64);
+
+ /*
+ * We can't use a validator here - it may be all zeroes.
+ */
+ r = dm_bm_read_lock(bm, THIN_SUPERBLOCK_LOCATION, NULL, &b);
+ if (r)
+ return r;
+
+ data_le = dm_block_data(b);
+ *result = 1;
+ for (i = 0; i < block_size; i++) {
+ if (data_le[i] != zero) {
+ *result = 0;
+ break;
+ }
+ }
+
+ return dm_bm_unlock(b);
+}
+
+static void __setup_btree_details(struct dm_pool_metadata *pmd)
+{
+ pmd->info.tm = pmd->tm;
+ pmd->info.levels = 2;
+ pmd->info.value_type.context = pmd->data_sm;
+ pmd->info.value_type.size = sizeof(__le64);
+ pmd->info.value_type.inc = data_block_inc;
+ pmd->info.value_type.dec = data_block_dec;
+ pmd->info.value_type.equal = data_block_equal;
+
+ memcpy(&pmd->nb_info, &pmd->info, sizeof(pmd->nb_info));
+ pmd->nb_info.tm = pmd->nb_tm;
+
+ pmd->tl_info.tm = pmd->tm;
+ pmd->tl_info.levels = 1;
+ pmd->tl_info.value_type.context = &pmd->bl_info;
+ pmd->tl_info.value_type.size = sizeof(__le64);
+ pmd->tl_info.value_type.inc = subtree_inc;
+ pmd->tl_info.value_type.dec = subtree_dec;
+ pmd->tl_info.value_type.equal = subtree_equal;
+
+ pmd->bl_info.tm = pmd->tm;
+ pmd->bl_info.levels = 1;
+ pmd->bl_info.value_type.context = pmd->data_sm;
+ pmd->bl_info.value_type.size = sizeof(__le64);
+ pmd->bl_info.value_type.inc = data_block_inc;
+ pmd->bl_info.value_type.dec = data_block_dec;
+ pmd->bl_info.value_type.equal = data_block_equal;
+
+ pmd->details_info.tm = pmd->tm;
+ pmd->details_info.levels = 1;
+ pmd->details_info.value_type.context = NULL;
+ pmd->details_info.value_type.size = sizeof(struct disk_device_details);
+ pmd->details_info.value_type.inc = NULL;
+ pmd->details_info.value_type.dec = NULL;
+ pmd->details_info.value_type.equal = NULL;
+}
+
+static int save_sm_roots(struct dm_pool_metadata *pmd)
+{
+ int r;
+ size_t len;
+
+ r = dm_sm_root_size(pmd->metadata_sm, &len);
+ if (r < 0)
+ return r;
+
+ r = dm_sm_copy_root(pmd->metadata_sm, &pmd->metadata_space_map_root, len);
+ if (r < 0)
+ return r;
+
+ r = dm_sm_root_size(pmd->data_sm, &len);
+ if (r < 0)
+ return r;
+
+ return dm_sm_copy_root(pmd->data_sm, &pmd->data_space_map_root, len);
+}
+
+static void copy_sm_roots(struct dm_pool_metadata *pmd,
+ struct thin_disk_superblock *disk)
+{
+ memcpy(&disk->metadata_space_map_root,
+ &pmd->metadata_space_map_root,
+ sizeof(pmd->metadata_space_map_root));
+
+ memcpy(&disk->data_space_map_root,
+ &pmd->data_space_map_root,
+ sizeof(pmd->data_space_map_root));
+}
+
+static int __write_initial_superblock(struct dm_pool_metadata *pmd)
+{
+ int r;
+ struct dm_block *sblock;
+ struct thin_disk_superblock *disk_super;
+ sector_t bdev_size = i_size_read(pmd->bdev->bd_inode) >> SECTOR_SHIFT;
+
+ if (bdev_size > THIN_METADATA_MAX_SECTORS)
+ bdev_size = THIN_METADATA_MAX_SECTORS;
+
+ r = dm_sm_commit(pmd->data_sm);
+ if (r < 0)
+ return r;
+
+ r = save_sm_roots(pmd);
+ if (r < 0)
+ return r;
+
+ r = dm_tm_pre_commit(pmd->tm);
+ if (r < 0)
+ return r;
+
+ r = superblock_lock_zero(pmd, &sblock);
+ if (r)
+ return r;
+
+ disk_super = dm_block_data(sblock);
+ disk_super->flags = 0;
+ memset(disk_super->uuid, 0, sizeof(disk_super->uuid));
+ disk_super->magic = cpu_to_le64(THIN_SUPERBLOCK_MAGIC);
+ disk_super->version = cpu_to_le32(THIN_VERSION);
+ disk_super->time = 0;
+ disk_super->trans_id = 0;
+ disk_super->held_root = 0;
+
+ copy_sm_roots(pmd, disk_super);
+
+ disk_super->data_mapping_root = cpu_to_le64(pmd->root);
+ disk_super->device_details_root = cpu_to_le64(pmd->details_root);
+ disk_super->metadata_block_size = cpu_to_le32(THIN_METADATA_BLOCK_SIZE);
+ disk_super->metadata_nr_blocks = cpu_to_le64(bdev_size >> SECTOR_TO_BLOCK_SHIFT);
+ disk_super->data_block_size = cpu_to_le32(pmd->data_block_size);
+
+ return dm_tm_commit(pmd->tm, sblock);
+}
+
+static int __format_metadata(struct dm_pool_metadata *pmd)
+{
+ int r;
+
+ r = dm_tm_create_with_sm(pmd->bm, THIN_SUPERBLOCK_LOCATION,
+ &pmd->tm, &pmd->metadata_sm);
+ if (r < 0) {
+ DMERR("tm_create_with_sm failed");
+ return r;
+ }
+
+ pmd->data_sm = dm_sm_disk_create(pmd->tm, 0);
+ if (IS_ERR(pmd->data_sm)) {
+ DMERR("sm_disk_create failed");
+ r = PTR_ERR(pmd->data_sm);
+ goto bad_cleanup_tm;
+ }
+
+ pmd->nb_tm = dm_tm_create_non_blocking_clone(pmd->tm);
+ if (!pmd->nb_tm) {
+ DMERR("could not create non-blocking clone tm");
+ r = -ENOMEM;
+ goto bad_cleanup_data_sm;
+ }
+
+ __setup_btree_details(pmd);
+
+ r = dm_btree_empty(&pmd->info, &pmd->root);
+ if (r < 0)
+ goto bad_cleanup_nb_tm;
+
+ r = dm_btree_empty(&pmd->details_info, &pmd->details_root);
+ if (r < 0) {
+ DMERR("couldn't create devices root");
+ goto bad_cleanup_nb_tm;
+ }
+
+ r = __write_initial_superblock(pmd);
+ if (r)
+ goto bad_cleanup_nb_tm;
+
+ return 0;
+
+bad_cleanup_nb_tm:
+ dm_tm_destroy(pmd->nb_tm);
+bad_cleanup_data_sm:
+ dm_sm_destroy(pmd->data_sm);
+bad_cleanup_tm:
+ dm_tm_destroy(pmd->tm);
+ dm_sm_destroy(pmd->metadata_sm);
+
+ return r;
+}
+
+static int __check_incompat_features(struct thin_disk_superblock *disk_super,
+ struct dm_pool_metadata *pmd)
+{
+ uint32_t features;
+
+ features = le32_to_cpu(disk_super->incompat_flags) & ~THIN_FEATURE_INCOMPAT_SUPP;
+ if (features) {
+ DMERR("could not access metadata due to unsupported optional features (%lx).",
+ (unsigned long)features);
+ return -EINVAL;
+ }
+
+ /*
+ * Check for read-only metadata to skip the following RDWR checks.
+ */
+ if (get_disk_ro(pmd->bdev->bd_disk))
+ return 0;
+
+ features = le32_to_cpu(disk_super->compat_ro_flags) & ~THIN_FEATURE_COMPAT_RO_SUPP;
+ if (features) {
+ DMERR("could not access metadata RDWR due to unsupported optional features (%lx).",
+ (unsigned long)features);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int __open_metadata(struct dm_pool_metadata *pmd)
+{
+ int r;
+ struct dm_block *sblock;
+ struct thin_disk_superblock *disk_super;
+
+ r = dm_bm_read_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION,
+ &sb_validator, &sblock);
+ if (r < 0) {
+ DMERR("couldn't read superblock");
+ return r;
+ }
+
+ disk_super = dm_block_data(sblock);
+
+ /* Verify the data block size hasn't changed */
+ if (le32_to_cpu(disk_super->data_block_size) != pmd->data_block_size) {
+ DMERR("changing the data block size (from %u to %llu) is not supported",
+ le32_to_cpu(disk_super->data_block_size),
+ (unsigned long long)pmd->data_block_size);
+ r = -EINVAL;
+ goto bad_unlock_sblock;
+ }
+
+ r = __check_incompat_features(disk_super, pmd);
+ if (r < 0)
+ goto bad_unlock_sblock;
+
+ r = dm_tm_open_with_sm(pmd->bm, THIN_SUPERBLOCK_LOCATION,
+ disk_super->metadata_space_map_root,
+ sizeof(disk_super->metadata_space_map_root),
+ &pmd->tm, &pmd->metadata_sm);
+ if (r < 0) {
+ DMERR("tm_open_with_sm failed");
+ goto bad_unlock_sblock;
+ }
+
+ pmd->data_sm = dm_sm_disk_open(pmd->tm, disk_super->data_space_map_root,
+ sizeof(disk_super->data_space_map_root));
+ if (IS_ERR(pmd->data_sm)) {
+ DMERR("sm_disk_open failed");
+ r = PTR_ERR(pmd->data_sm);
+ goto bad_cleanup_tm;
+ }
+
+ pmd->nb_tm = dm_tm_create_non_blocking_clone(pmd->tm);
+ if (!pmd->nb_tm) {
+ DMERR("could not create non-blocking clone tm");
+ r = -ENOMEM;
+ goto bad_cleanup_data_sm;
+ }
+
+ __setup_btree_details(pmd);
+ return dm_bm_unlock(sblock);
+
+bad_cleanup_data_sm:
+ dm_sm_destroy(pmd->data_sm);
+bad_cleanup_tm:
+ dm_tm_destroy(pmd->tm);
+ dm_sm_destroy(pmd->metadata_sm);
+bad_unlock_sblock:
+ dm_bm_unlock(sblock);
+
+ return r;
+}
+
+static int __open_or_format_metadata(struct dm_pool_metadata *pmd, bool format_device)
+{
+ int r, unformatted;
+
+ r = __superblock_all_zeroes(pmd->bm, &unformatted);
+ if (r)
+ return r;
+
+ if (unformatted)
+ return format_device ? __format_metadata(pmd) : -EPERM;
+
+ return __open_metadata(pmd);
+}
+
+static int __create_persistent_data_objects(struct dm_pool_metadata *pmd, bool format_device)
+{
+ int r;
+
+ pmd->bm = dm_block_manager_create(pmd->bdev, THIN_METADATA_BLOCK_SIZE << SECTOR_SHIFT,
+ THIN_METADATA_CACHE_SIZE,
+ THIN_MAX_CONCURRENT_LOCKS);
+ if (IS_ERR(pmd->bm)) {
+ DMERR("could not create block manager");
+ return PTR_ERR(pmd->bm);
+ }
+
+ r = __open_or_format_metadata(pmd, format_device);
+ if (r)
+ dm_block_manager_destroy(pmd->bm);
+
+ return r;
+}
+
+static void __destroy_persistent_data_objects(struct dm_pool_metadata *pmd)
+{
+ dm_sm_destroy(pmd->data_sm);
+ dm_sm_destroy(pmd->metadata_sm);
+ dm_tm_destroy(pmd->nb_tm);
+ dm_tm_destroy(pmd->tm);
+ dm_block_manager_destroy(pmd->bm);
+}
+
+static int __begin_transaction(struct dm_pool_metadata *pmd)
+{
+ int r;
+ struct thin_disk_superblock *disk_super;
+ struct dm_block *sblock;
+
+ /*
+ * We re-read the superblock every time. Shouldn't need to do this
+ * really.
+ */
+ r = dm_bm_read_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION,
+ &sb_validator, &sblock);
+ if (r)
+ return r;
+
+ disk_super = dm_block_data(sblock);
+ pmd->time = le32_to_cpu(disk_super->time);
+ pmd->root = le64_to_cpu(disk_super->data_mapping_root);
+ pmd->details_root = le64_to_cpu(disk_super->device_details_root);
+ pmd->trans_id = le64_to_cpu(disk_super->trans_id);
+ pmd->flags = le32_to_cpu(disk_super->flags);
+ pmd->data_block_size = le32_to_cpu(disk_super->data_block_size);
+
+ dm_bm_unlock(sblock);
+ return 0;
+}
+
+static int __write_changed_details(struct dm_pool_metadata *pmd)
+{
+ int r;
+ struct dm_thin_device *td, *tmp;
+ struct disk_device_details details;
+ uint64_t key;
+
+ list_for_each_entry_safe(td, tmp, &pmd->thin_devices, list) {
+ if (!td->changed)
+ continue;
+
+ key = td->id;
+
+ details.mapped_blocks = cpu_to_le64(td->mapped_blocks);
+ details.transaction_id = cpu_to_le64(td->transaction_id);
+ details.creation_time = cpu_to_le32(td->creation_time);
+ details.snapshotted_time = cpu_to_le32(td->snapshotted_time);
+ __dm_bless_for_disk(&details);
+
+ r = dm_btree_insert(&pmd->details_info, pmd->details_root,
+ &key, &details, &pmd->details_root);
+ if (r)
+ return r;
+
+ if (td->open_count)
+ td->changed = 0;
+ else {
+ list_del(&td->list);
+ kfree(td);
+ }
+ }
+
+ return 0;
+}
+
+static int __commit_transaction(struct dm_pool_metadata *pmd)
+{
+ int r;
+ size_t metadata_len, data_len;
+ struct thin_disk_superblock *disk_super;
+ struct dm_block *sblock;
+
+ /*
+ * We need to know if the thin_disk_superblock exceeds a 512-byte sector.
+ */
+ BUILD_BUG_ON(sizeof(struct thin_disk_superblock) > 512);
+
+ r = __write_changed_details(pmd);
+ if (r < 0)
+ return r;
+
+ r = dm_sm_commit(pmd->data_sm);
+ if (r < 0)
+ return r;
+
+ r = dm_tm_pre_commit(pmd->tm);
+ if (r < 0)
+ return r;
+
+ r = dm_sm_root_size(pmd->metadata_sm, &metadata_len);
+ if (r < 0)
+ return r;
+
+ r = dm_sm_root_size(pmd->data_sm, &data_len);
+ if (r < 0)
+ return r;
+
+ r = save_sm_roots(pmd);
+ if (r < 0)
+ return r;
+
+ r = superblock_lock(pmd, &sblock);
+ if (r)
+ return r;
+
+ disk_super = dm_block_data(sblock);
+ disk_super->time = cpu_to_le32(pmd->time);
+ disk_super->data_mapping_root = cpu_to_le64(pmd->root);
+ disk_super->device_details_root = cpu_to_le64(pmd->details_root);
+ disk_super->trans_id = cpu_to_le64(pmd->trans_id);
+ disk_super->flags = cpu_to_le32(pmd->flags);
+
+ copy_sm_roots(pmd, disk_super);
+
+ return dm_tm_commit(pmd->tm, sblock);
+}
+
+struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev,
+ sector_t data_block_size,
+ bool format_device)
+{
+ int r;
+ struct dm_pool_metadata *pmd;
+
+ pmd = kmalloc(sizeof(*pmd), GFP_KERNEL);
+ if (!pmd) {
+ DMERR("could not allocate metadata struct");
+ return ERR_PTR(-ENOMEM);
+ }
+
+ init_rwsem(&pmd->root_lock);
+ pmd->time = 0;
+ INIT_LIST_HEAD(&pmd->thin_devices);
+ pmd->read_only = false;
+ pmd->fail_io = false;
+ pmd->bdev = bdev;
+ pmd->data_block_size = data_block_size;
+
+ r = __create_persistent_data_objects(pmd, format_device);
+ if (r) {
+ kfree(pmd);
+ return ERR_PTR(r);
+ }
+
+ r = __begin_transaction(pmd);
+ if (r < 0) {
+ if (dm_pool_metadata_close(pmd) < 0)
+ DMWARN("%s: dm_pool_metadata_close() failed.", __func__);
+ return ERR_PTR(r);
+ }
+
+ return pmd;
+}
+
+int dm_pool_metadata_close(struct dm_pool_metadata *pmd)
+{
+ int r;
+ unsigned open_devices = 0;
+ struct dm_thin_device *td, *tmp;
+
+ down_read(&pmd->root_lock);
+ list_for_each_entry_safe(td, tmp, &pmd->thin_devices, list) {
+ if (td->open_count)
+ open_devices++;
+ else {
+ list_del(&td->list);
+ kfree(td);
+ }
+ }
+ up_read(&pmd->root_lock);
+
+ if (open_devices) {
+ DMERR("attempt to close pmd when %u device(s) are still open",
+ open_devices);
+ return -EBUSY;
+ }
+
+ if (!pmd->read_only && !pmd->fail_io) {
+ r = __commit_transaction(pmd);
+ if (r < 0)
+ DMWARN("%s: __commit_transaction() failed, error = %d",
+ __func__, r);
+ }
+
+ if (!pmd->fail_io)
+ __destroy_persistent_data_objects(pmd);
+
+ kfree(pmd);
+ return 0;
+}
+
+/*
+ * __open_device: Returns @td corresponding to device with id @dev,
+ * creating it if @create is set and incrementing @td->open_count.
+ * On failure, @td is undefined.
+ */
+static int __open_device(struct dm_pool_metadata *pmd,
+ dm_thin_id dev, int create,
+ struct dm_thin_device **td)
+{
+ int r, changed = 0;
+ struct dm_thin_device *td2;
+ uint64_t key = dev;
+ struct disk_device_details details_le;
+
+ /*
+ * If the device is already open, return it.
+ */
+ list_for_each_entry(td2, &pmd->thin_devices, list)
+ if (td2->id == dev) {
+ /*
+ * May not create an already-open device.
+ */
+ if (create)
+ return -EEXIST;
+
+ td2->open_count++;
+ *td = td2;
+ return 0;
+ }
+
+ /*
+ * Check the device exists.
+ */
+ r = dm_btree_lookup(&pmd->details_info, pmd->details_root,
+ &key, &details_le);
+ if (r) {
+ if (r != -ENODATA || !create)
+ return r;
+
+ /*
+ * Create new device.
+ */
+ changed = 1;
+ details_le.mapped_blocks = 0;
+ details_le.transaction_id = cpu_to_le64(pmd->trans_id);
+ details_le.creation_time = cpu_to_le32(pmd->time);
+ details_le.snapshotted_time = cpu_to_le32(pmd->time);
+ }
+
+ *td = kmalloc(sizeof(**td), GFP_NOIO);
+ if (!*td)
+ return -ENOMEM;
+
+ (*td)->pmd = pmd;
+ (*td)->id = dev;
+ (*td)->open_count = 1;
+ (*td)->changed = changed;
+ (*td)->aborted_with_changes = false;
+ (*td)->mapped_blocks = le64_to_cpu(details_le.mapped_blocks);
+ (*td)->transaction_id = le64_to_cpu(details_le.transaction_id);
+ (*td)->creation_time = le32_to_cpu(details_le.creation_time);
+ (*td)->snapshotted_time = le32_to_cpu(details_le.snapshotted_time);
+
+ list_add(&(*td)->list, &pmd->thin_devices);
+
+ return 0;
+}
+
+static void __close_device(struct dm_thin_device *td)
+{
+ --td->open_count;
+}
+
+static int __create_thin(struct dm_pool_metadata *pmd,
+ dm_thin_id dev)
+{
+ int r;
+ dm_block_t dev_root;
+ uint64_t key = dev;
+ struct disk_device_details details_le;
+ struct dm_thin_device *td;
+ __le64 value;
+
+ r = dm_btree_lookup(&pmd->details_info, pmd->details_root,
+ &key, &details_le);
+ if (!r)
+ return -EEXIST;
+
+ /*
+ * Create an empty btree for the mappings.
+ */
+ r = dm_btree_empty(&pmd->bl_info, &dev_root);
+ if (r)
+ return r;
+
+ /*
+ * Insert it into the main mapping tree.
+ */
+ value = cpu_to_le64(dev_root);
+ __dm_bless_for_disk(&value);
+ r = dm_btree_insert(&pmd->tl_info, pmd->root, &key, &value, &pmd->root);
+ if (r) {
+ dm_btree_del(&pmd->bl_info, dev_root);
+ return r;
+ }
+
+ r = __open_device(pmd, dev, 1, &td);
+ if (r) {
+ dm_btree_remove(&pmd->tl_info, pmd->root, &key, &pmd->root);
+ dm_btree_del(&pmd->bl_info, dev_root);
+ return r;
+ }
+ __close_device(td);
+
+ return r;
+}
+
+int dm_pool_create_thin(struct dm_pool_metadata *pmd, dm_thin_id dev)
+{
+ int r = -EINVAL;
+
+ down_write(&pmd->root_lock);
+ if (!pmd->fail_io)
+ r = __create_thin(pmd, dev);
+ up_write(&pmd->root_lock);
+
+ return r;
+}
+
+static int __set_snapshot_details(struct dm_pool_metadata *pmd,
+ struct dm_thin_device *snap,
+ dm_thin_id origin, uint32_t time)
+{
+ int r;
+ struct dm_thin_device *td;
+
+ r = __open_device(pmd, origin, 0, &td);
+ if (r)
+ return r;
+
+ td->changed = 1;
+ td->snapshotted_time = time;
+
+ snap->mapped_blocks = td->mapped_blocks;
+ snap->snapshotted_time = time;
+ __close_device(td);
+
+ return 0;
+}
+
+static int __create_snap(struct dm_pool_metadata *pmd,
+ dm_thin_id dev, dm_thin_id origin)
+{
+ int r;
+ dm_block_t origin_root;
+ uint64_t key = origin, dev_key = dev;
+ struct dm_thin_device *td;
+ struct disk_device_details details_le;
+ __le64 value;
+
+ /* check this device is unused */
+ r = dm_btree_lookup(&pmd->details_info, pmd->details_root,
+ &dev_key, &details_le);
+ if (!r)
+ return -EEXIST;
+
+ /* find the mapping tree for the origin */
+ r = dm_btree_lookup(&pmd->tl_info, pmd->root, &key, &value);
+ if (r)
+ return r;
+ origin_root = le64_to_cpu(value);
+
+ /* clone the origin, an inc will do */
+ dm_tm_inc(pmd->tm, origin_root);
+
+ /* insert into the main mapping tree */
+ value = cpu_to_le64(origin_root);
+ __dm_bless_for_disk(&value);
+ key = dev;
+ r = dm_btree_insert(&pmd->tl_info, pmd->root, &key, &value, &pmd->root);
+ if (r) {
+ dm_tm_dec(pmd->tm, origin_root);
+ return r;
+ }
+
+ pmd->time++;
+
+ r = __open_device(pmd, dev, 1, &td);
+ if (r)
+ goto bad;
+
+ r = __set_snapshot_details(pmd, td, origin, pmd->time);
+ __close_device(td);
+
+ if (r)
+ goto bad;
+
+ return 0;
+
+bad:
+ dm_btree_remove(&pmd->tl_info, pmd->root, &key, &pmd->root);
+ dm_btree_remove(&pmd->details_info, pmd->details_root,
+ &key, &pmd->details_root);
+ return r;
+}
+
+int dm_pool_create_snap(struct dm_pool_metadata *pmd,
+ dm_thin_id dev,
+ dm_thin_id origin)
+{
+ int r = -EINVAL;
+
+ down_write(&pmd->root_lock);
+ if (!pmd->fail_io)
+ r = __create_snap(pmd, dev, origin);
+ up_write(&pmd->root_lock);
+
+ return r;
+}
+
+static int __delete_device(struct dm_pool_metadata *pmd, dm_thin_id dev)
+{
+ int r;
+ uint64_t key = dev;
+ struct dm_thin_device *td;
+
+ /* TODO: failure should mark the transaction invalid */
+ r = __open_device(pmd, dev, 0, &td);
+ if (r)
+ return r;
+
+ if (td->open_count > 1) {
+ __close_device(td);
+ return -EBUSY;
+ }
+
+ list_del(&td->list);
+ kfree(td);
+ r = dm_btree_remove(&pmd->details_info, pmd->details_root,
+ &key, &pmd->details_root);
+ if (r)
+ return r;
+
+ r = dm_btree_remove(&pmd->tl_info, pmd->root, &key, &pmd->root);
+ if (r)
+ return r;
+
+ return 0;
+}
+
+int dm_pool_delete_thin_device(struct dm_pool_metadata *pmd,
+ dm_thin_id dev)
+{
+ int r = -EINVAL;
+
+ down_write(&pmd->root_lock);
+ if (!pmd->fail_io)
+ r = __delete_device(pmd, dev);
+ up_write(&pmd->root_lock);
+
+ return r;
+}
+
+int dm_pool_set_metadata_transaction_id(struct dm_pool_metadata *pmd,
+ uint64_t current_id,
+ uint64_t new_id)
+{
+ int r = -EINVAL;
+
+ down_write(&pmd->root_lock);
+
+ if (pmd->fail_io)
+ goto out;
+
+ if (pmd->trans_id != current_id) {
+ DMERR("mismatched transaction id");
+ goto out;
+ }
+
+ pmd->trans_id = new_id;
+ r = 0;
+
+out:
+ up_write(&pmd->root_lock);
+
+ return r;
+}
+
+int dm_pool_get_metadata_transaction_id(struct dm_pool_metadata *pmd,
+ uint64_t *result)
+{
+ int r = -EINVAL;
+
+ down_read(&pmd->root_lock);
+ if (!pmd->fail_io) {
+ *result = pmd->trans_id;
+ r = 0;
+ }
+ up_read(&pmd->root_lock);
+
+ return r;
+}
+
+static int __reserve_metadata_snap(struct dm_pool_metadata *pmd)
+{
+ int r, inc;
+ struct thin_disk_superblock *disk_super;
+ struct dm_block *copy, *sblock;
+ dm_block_t held_root;
+
+ /*
+ * Copy the superblock.
+ */
+ dm_sm_inc_block(pmd->metadata_sm, THIN_SUPERBLOCK_LOCATION);
+ r = dm_tm_shadow_block(pmd->tm, THIN_SUPERBLOCK_LOCATION,
+ &sb_validator, &copy, &inc);
+ if (r)
+ return r;
+
+ BUG_ON(!inc);
+
+ held_root = dm_block_location(copy);
+ disk_super = dm_block_data(copy);
+
+ if (le64_to_cpu(disk_super->held_root)) {
+ DMWARN("Pool metadata snapshot already exists: release this before taking another.");
+
+ dm_tm_dec(pmd->tm, held_root);
+ dm_tm_unlock(pmd->tm, copy);
+ return -EBUSY;
+ }
+
+ /*
+ * Wipe the spacemap since we're not publishing this.
+ */
+ memset(&disk_super->data_space_map_root, 0,
+ sizeof(disk_super->data_space_map_root));
+ memset(&disk_super->metadata_space_map_root, 0,
+ sizeof(disk_super->metadata_space_map_root));
+
+ /*
+ * Increment the data structures that need to be preserved.
+ */
+ dm_tm_inc(pmd->tm, le64_to_cpu(disk_super->data_mapping_root));
+ dm_tm_inc(pmd->tm, le64_to_cpu(disk_super->device_details_root));
+ dm_tm_unlock(pmd->tm, copy);
+
+ /*
+ * Write the held root into the superblock.
+ */
+ r = superblock_lock(pmd, &sblock);
+ if (r) {
+ dm_tm_dec(pmd->tm, held_root);
+ return r;
+ }
+
+ disk_super = dm_block_data(sblock);
+ disk_super->held_root = cpu_to_le64(held_root);
+ dm_bm_unlock(sblock);
+ return 0;
+}
+
+int dm_pool_reserve_metadata_snap(struct dm_pool_metadata *pmd)
+{
+ int r = -EINVAL;
+
+ down_write(&pmd->root_lock);
+ if (!pmd->fail_io)
+ r = __reserve_metadata_snap(pmd);
+ up_write(&pmd->root_lock);
+
+ return r;
+}
+
+static int __release_metadata_snap(struct dm_pool_metadata *pmd)
+{
+ int r;
+ struct thin_disk_superblock *disk_super;
+ struct dm_block *sblock, *copy;
+ dm_block_t held_root;
+
+ r = superblock_lock(pmd, &sblock);
+ if (r)
+ return r;
+
+ disk_super = dm_block_data(sblock);
+ held_root = le64_to_cpu(disk_super->held_root);
+ disk_super->held_root = cpu_to_le64(0);
+
+ dm_bm_unlock(sblock);
+
+ if (!held_root) {
+ DMWARN("No pool metadata snapshot found: nothing to release.");
+ return -EINVAL;
+ }
+
+ r = dm_tm_read_lock(pmd->tm, held_root, &sb_validator, &copy);
+ if (r)
+ return r;
+
+ disk_super = dm_block_data(copy);
+ dm_sm_dec_block(pmd->metadata_sm, le64_to_cpu(disk_super->data_mapping_root));
+ dm_sm_dec_block(pmd->metadata_sm, le64_to_cpu(disk_super->device_details_root));
+ dm_sm_dec_block(pmd->metadata_sm, held_root);
+
+ return dm_tm_unlock(pmd->tm, copy);
+}
+
+int dm_pool_release_metadata_snap(struct dm_pool_metadata *pmd)
+{
+ int r = -EINVAL;
+
+ down_write(&pmd->root_lock);
+ if (!pmd->fail_io)
+ r = __release_metadata_snap(pmd);
+ up_write(&pmd->root_lock);
+
+ return r;
+}
+
+static int __get_metadata_snap(struct dm_pool_metadata *pmd,
+ dm_block_t *result)
+{
+ int r;
+ struct thin_disk_superblock *disk_super;
+ struct dm_block *sblock;
+
+ r = dm_bm_read_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION,
+ &sb_validator, &sblock);
+ if (r)
+ return r;
+
+ disk_super = dm_block_data(sblock);
+ *result = le64_to_cpu(disk_super->held_root);
+
+ return dm_bm_unlock(sblock);
+}
+
+int dm_pool_get_metadata_snap(struct dm_pool_metadata *pmd,
+ dm_block_t *result)
+{
+ int r = -EINVAL;
+
+ down_read(&pmd->root_lock);
+ if (!pmd->fail_io)
+ r = __get_metadata_snap(pmd, result);
+ up_read(&pmd->root_lock);
+
+ return r;
+}
+
+int dm_pool_open_thin_device(struct dm_pool_metadata *pmd, dm_thin_id dev,
+ struct dm_thin_device **td)
+{
+ int r = -EINVAL;
+
+ down_write(&pmd->root_lock);
+ if (!pmd->fail_io)
+ r = __open_device(pmd, dev, 0, td);
+ up_write(&pmd->root_lock);
+
+ return r;
+}
+
+int dm_pool_close_thin_device(struct dm_thin_device *td)
+{
+ down_write(&td->pmd->root_lock);
+ __close_device(td);
+ up_write(&td->pmd->root_lock);
+
+ return 0;
+}
+
+dm_thin_id dm_thin_dev_id(struct dm_thin_device *td)
+{
+ return td->id;
+}
+
+/*
+ * Check whether @time (of block creation) is older than @td's last snapshot.
+ * If so then the associated block is shared with the last snapshot device.
+ * Any block on a device created *after* the device last got snapshotted is
+ * necessarily not shared.
+ */
+static bool __snapshotted_since(struct dm_thin_device *td, uint32_t time)
+{
+ return td->snapshotted_time > time;
+}
+
+int dm_thin_find_block(struct dm_thin_device *td, dm_block_t block,
+ int can_issue_io, struct dm_thin_lookup_result *result)
+{
+ int r;
+ __le64 value;
+ struct dm_pool_metadata *pmd = td->pmd;
+ dm_block_t keys[2] = { td->id, block };
+ struct dm_btree_info *info;
+
+ if (pmd->fail_io)
+ return -EINVAL;
+
+ down_read(&pmd->root_lock);
+
+ if (can_issue_io) {
+ info = &pmd->info;
+ } else
+ info = &pmd->nb_info;
+
+ r = dm_btree_lookup(info, pmd->root, keys, &value);
+ if (!r) {
+ uint64_t block_time = 0;
+ dm_block_t exception_block;
+ uint32_t exception_time;
+
+ block_time = le64_to_cpu(value);
+ unpack_block_time(block_time, &exception_block,
+ &exception_time);
+ result->block = exception_block;
+ result->shared = __snapshotted_since(td, exception_time);
+ }
+
+ up_read(&pmd->root_lock);
+ return r;
+}
+
+static int __insert(struct dm_thin_device *td, dm_block_t block,
+ dm_block_t data_block)
+{
+ int r, inserted;
+ __le64 value;
+ struct dm_pool_metadata *pmd = td->pmd;
+ dm_block_t keys[2] = { td->id, block };
+
+ value = cpu_to_le64(pack_block_time(data_block, pmd->time));
+ __dm_bless_for_disk(&value);
+
+ r = dm_btree_insert_notify(&pmd->info, pmd->root, keys, &value,
+ &pmd->root, &inserted);
+ if (r)
+ return r;
+
+ td->changed = 1;
+ if (inserted)
+ td->mapped_blocks++;
+
+ return 0;
+}
+
+int dm_thin_insert_block(struct dm_thin_device *td, dm_block_t block,
+ dm_block_t data_block)
+{
+ int r = -EINVAL;
+
+ down_write(&td->pmd->root_lock);
+ if (!td->pmd->fail_io)
+ r = __insert(td, block, data_block);
+ up_write(&td->pmd->root_lock);
+
+ return r;
+}
+
+static int __remove(struct dm_thin_device *td, dm_block_t block)
+{
+ int r;
+ struct dm_pool_metadata *pmd = td->pmd;
+ dm_block_t keys[2] = { td->id, block };
+
+ r = dm_btree_remove(&pmd->info, pmd->root, keys, &pmd->root);
+ if (r)
+ return r;
+
+ td->mapped_blocks--;
+ td->changed = 1;
+
+ return 0;
+}
+
+int dm_thin_remove_block(struct dm_thin_device *td, dm_block_t block)
+{
+ int r = -EINVAL;
+
+ down_write(&td->pmd->root_lock);
+ if (!td->pmd->fail_io)
+ r = __remove(td, block);
+ up_write(&td->pmd->root_lock);
+
+ return r;
+}
+
+int dm_pool_block_is_used(struct dm_pool_metadata *pmd, dm_block_t b, bool *result)
+{
+ int r;
+ uint32_t ref_count;
+
+ down_read(&pmd->root_lock);
+ r = dm_sm_get_count(pmd->data_sm, b, &ref_count);
+ if (!r)
+ *result = (ref_count != 0);
+ up_read(&pmd->root_lock);
+
+ return r;
+}
+
+bool dm_thin_changed_this_transaction(struct dm_thin_device *td)
+{
+ int r;
+
+ down_read(&td->pmd->root_lock);
+ r = td->changed;
+ up_read(&td->pmd->root_lock);
+
+ return r;
+}
+
+bool dm_pool_changed_this_transaction(struct dm_pool_metadata *pmd)
+{
+ bool r = false;
+ struct dm_thin_device *td, *tmp;
+
+ down_read(&pmd->root_lock);
+ list_for_each_entry_safe(td, tmp, &pmd->thin_devices, list) {
+ if (td->changed) {
+ r = td->changed;
+ break;
+ }
+ }
+ up_read(&pmd->root_lock);
+
+ return r;
+}
+
+bool dm_thin_aborted_changes(struct dm_thin_device *td)
+{
+ bool r;
+
+ down_read(&td->pmd->root_lock);
+ r = td->aborted_with_changes;
+ up_read(&td->pmd->root_lock);
+
+ return r;
+}
+
+int dm_pool_alloc_data_block(struct dm_pool_metadata *pmd, dm_block_t *result)
+{
+ int r = -EINVAL;
+
+ down_write(&pmd->root_lock);
+ if (!pmd->fail_io)
+ r = dm_sm_new_block(pmd->data_sm, result);
+ up_write(&pmd->root_lock);
+
+ return r;
+}
+
+int dm_pool_commit_metadata(struct dm_pool_metadata *pmd)
+{
+ int r = -EINVAL;
+
+ down_write(&pmd->root_lock);
+ if (pmd->fail_io)
+ goto out;
+
+ r = __commit_transaction(pmd);
+ if (r <= 0)
+ goto out;
+
+ /*
+ * Open the next transaction.
+ */
+ r = __begin_transaction(pmd);
+out:
+ up_write(&pmd->root_lock);
+ return r;
+}
+
+static void __set_abort_with_changes_flags(struct dm_pool_metadata *pmd)
+{
+ struct dm_thin_device *td;
+
+ list_for_each_entry(td, &pmd->thin_devices, list)
+ td->aborted_with_changes = td->changed;
+}
+
+int dm_pool_abort_metadata(struct dm_pool_metadata *pmd)
+{
+ int r = -EINVAL;
+
+ down_write(&pmd->root_lock);
+ if (pmd->fail_io)
+ goto out;
+
+ __set_abort_with_changes_flags(pmd);
+ __destroy_persistent_data_objects(pmd);
+ r = __create_persistent_data_objects(pmd, false);
+ if (r)
+ pmd->fail_io = true;
+
+out:
+ up_write(&pmd->root_lock);
+
+ return r;
+}
+
+int dm_pool_get_free_block_count(struct dm_pool_metadata *pmd, dm_block_t *result)
+{
+ int r = -EINVAL;
+
+ down_read(&pmd->root_lock);
+ if (!pmd->fail_io)
+ r = dm_sm_get_nr_free(pmd->data_sm, result);
+ up_read(&pmd->root_lock);
+
+ return r;
+}
+
+int dm_pool_get_free_metadata_block_count(struct dm_pool_metadata *pmd,
+ dm_block_t *result)
+{
+ int r = -EINVAL;
+
+ down_read(&pmd->root_lock);
+ if (!pmd->fail_io)
+ r = dm_sm_get_nr_free(pmd->metadata_sm, result);
+ up_read(&pmd->root_lock);
+
+ return r;
+}
+
+int dm_pool_get_metadata_dev_size(struct dm_pool_metadata *pmd,
+ dm_block_t *result)
+{
+ int r = -EINVAL;
+
+ down_read(&pmd->root_lock);
+ if (!pmd->fail_io)
+ r = dm_sm_get_nr_blocks(pmd->metadata_sm, result);
+ up_read(&pmd->root_lock);
+
+ return r;
+}
+
+int dm_pool_get_data_dev_size(struct dm_pool_metadata *pmd, dm_block_t *result)
+{
+ int r = -EINVAL;
+
+ down_read(&pmd->root_lock);
+ if (!pmd->fail_io)
+ r = dm_sm_get_nr_blocks(pmd->data_sm, result);
+ up_read(&pmd->root_lock);
+
+ return r;
+}
+
+int dm_thin_get_mapped_count(struct dm_thin_device *td, dm_block_t *result)
+{
+ int r = -EINVAL;
+ struct dm_pool_metadata *pmd = td->pmd;
+
+ down_read(&pmd->root_lock);
+ if (!pmd->fail_io) {
+ *result = td->mapped_blocks;
+ r = 0;
+ }
+ up_read(&pmd->root_lock);
+
+ return r;
+}
+
+static int __highest_block(struct dm_thin_device *td, dm_block_t *result)
+{
+ int r;
+ __le64 value_le;
+ dm_block_t thin_root;
+ struct dm_pool_metadata *pmd = td->pmd;
+
+ r = dm_btree_lookup(&pmd->tl_info, pmd->root, &td->id, &value_le);
+ if (r)
+ return r;
+
+ thin_root = le64_to_cpu(value_le);
+
+ return dm_btree_find_highest_key(&pmd->bl_info, thin_root, result);
+}
+
+int dm_thin_get_highest_mapped_block(struct dm_thin_device *td,
+ dm_block_t *result)
+{
+ int r = -EINVAL;
+ struct dm_pool_metadata *pmd = td->pmd;
+
+ down_read(&pmd->root_lock);
+ if (!pmd->fail_io)
+ r = __highest_block(td, result);
+ up_read(&pmd->root_lock);
+
+ return r;
+}
+
+static int __resize_space_map(struct dm_space_map *sm, dm_block_t new_count)
+{
+ int r;
+ dm_block_t old_count;
+
+ r = dm_sm_get_nr_blocks(sm, &old_count);
+ if (r)
+ return r;
+
+ if (new_count == old_count)
+ return 0;
+
+ if (new_count < old_count) {
+ DMERR("cannot reduce size of space map");
+ return -EINVAL;
+ }
+
+ return dm_sm_extend(sm, new_count - old_count);
+}
+
+int dm_pool_resize_data_dev(struct dm_pool_metadata *pmd, dm_block_t new_count)
+{
+ int r = -EINVAL;
+
+ down_write(&pmd->root_lock);
+ if (!pmd->fail_io)
+ r = __resize_space_map(pmd->data_sm, new_count);
+ up_write(&pmd->root_lock);
+
+ return r;
+}
+
+int dm_pool_resize_metadata_dev(struct dm_pool_metadata *pmd, dm_block_t new_count)
+{
+ int r = -EINVAL;
+
+ down_write(&pmd->root_lock);
+ if (!pmd->fail_io)
+ r = __resize_space_map(pmd->metadata_sm, new_count);
+ up_write(&pmd->root_lock);
+
+ return r;
+}
+
+void dm_pool_metadata_read_only(struct dm_pool_metadata *pmd)
+{
+ down_write(&pmd->root_lock);
+ pmd->read_only = true;
+ dm_bm_set_read_only(pmd->bm);
+ up_write(&pmd->root_lock);
+}
+
+void dm_pool_metadata_read_write(struct dm_pool_metadata *pmd)
+{
+ down_write(&pmd->root_lock);
+ pmd->read_only = false;
+ dm_bm_set_read_write(pmd->bm);
+ up_write(&pmd->root_lock);
+}
+
+int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd,
+ dm_block_t threshold,
+ dm_sm_threshold_fn fn,
+ void *context)
+{
+ int r;
+
+ down_write(&pmd->root_lock);
+ r = dm_sm_register_threshold_callback(pmd->metadata_sm, threshold, fn, context);
+ up_write(&pmd->root_lock);
+
+ return r;
+}
+
+int dm_pool_metadata_set_needs_check(struct dm_pool_metadata *pmd)
+{
+ int r;
+ struct dm_block *sblock;
+ struct thin_disk_superblock *disk_super;
+
+ down_write(&pmd->root_lock);
+ pmd->flags |= THIN_METADATA_NEEDS_CHECK_FLAG;
+
+ r = superblock_lock(pmd, &sblock);
+ if (r) {
+ DMERR("couldn't read superblock");
+ goto out;
+ }
+
+ disk_super = dm_block_data(sblock);
+ disk_super->flags = cpu_to_le32(pmd->flags);
+
+ dm_bm_unlock(sblock);
+out:
+ up_write(&pmd->root_lock);
+ return r;
+}
+
+bool dm_pool_metadata_needs_check(struct dm_pool_metadata *pmd)
+{
+ bool needs_check;
+
+ down_read(&pmd->root_lock);
+ needs_check = pmd->flags & THIN_METADATA_NEEDS_CHECK_FLAG;
+ up_read(&pmd->root_lock);
+
+ return needs_check;
+}
+
+void dm_pool_issue_prefetches(struct dm_pool_metadata *pmd)
+{
+ dm_tm_issue_prefetches(pmd->tm);
+}
diff --git a/drivers/md/dm-thin-metadata.h b/drivers/md/dm-thin-metadata.h
new file mode 100644
index 000000000..fac01a96d
--- /dev/null
+++ b/drivers/md/dm-thin-metadata.h
@@ -0,0 +1,221 @@
+/*
+ * Copyright (C) 2010-2011 Red Hat, Inc.
+ *
+ * This file is released under the GPL.
+ */
+
+#ifndef DM_THIN_METADATA_H
+#define DM_THIN_METADATA_H
+
+#include "persistent-data/dm-block-manager.h"
+#include "persistent-data/dm-space-map.h"
+#include "persistent-data/dm-space-map-metadata.h"
+
+#define THIN_METADATA_BLOCK_SIZE DM_SM_METADATA_BLOCK_SIZE
+
+/*
+ * The metadata device is currently limited in size.
+ */
+#define THIN_METADATA_MAX_SECTORS DM_SM_METADATA_MAX_SECTORS
+
+/*
+ * A metadata device larger than 16GB triggers a warning.
+ */
+#define THIN_METADATA_MAX_SECTORS_WARNING (16 * (1024 * 1024 * 1024 >> SECTOR_SHIFT))
+
+/*----------------------------------------------------------------*/
+
+/*
+ * Thin metadata superblock flags.
+ */
+#define THIN_METADATA_NEEDS_CHECK_FLAG (1 << 0)
+
+struct dm_pool_metadata;
+struct dm_thin_device;
+
+/*
+ * Device identifier
+ */
+typedef uint64_t dm_thin_id;
+
+/*
+ * Reopens or creates a new, empty metadata volume.
+ */
+struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev,
+ sector_t data_block_size,
+ bool format_device);
+
+int dm_pool_metadata_close(struct dm_pool_metadata *pmd);
+
+/*
+ * Compat feature flags. Any incompat flags beyond the ones
+ * specified below will prevent use of the thin metadata.
+ */
+#define THIN_FEATURE_COMPAT_SUPP 0UL
+#define THIN_FEATURE_COMPAT_RO_SUPP 0UL
+#define THIN_FEATURE_INCOMPAT_SUPP 0UL
+
+/*
+ * Device creation/deletion.
+ */
+int dm_pool_create_thin(struct dm_pool_metadata *pmd, dm_thin_id dev);
+
+/*
+ * An internal snapshot.
+ *
+ * You can only snapshot a quiesced origin i.e. one that is either
+ * suspended or not instanced at all.
+ */
+int dm_pool_create_snap(struct dm_pool_metadata *pmd, dm_thin_id dev,
+ dm_thin_id origin);
+
+/*
+ * Deletes a virtual device from the metadata. It _is_ safe to call this
+ * when that device is open. Operations on that device will just start
+ * failing. You still need to call close() on the device.
+ */
+int dm_pool_delete_thin_device(struct dm_pool_metadata *pmd,
+ dm_thin_id dev);
+
+/*
+ * Commits _all_ metadata changes: device creation, deletion, mapping
+ * updates.
+ */
+int dm_pool_commit_metadata(struct dm_pool_metadata *pmd);
+
+/*
+ * Discards all uncommitted changes. Rereads the superblock, rolling back
+ * to the last good transaction. Thin devices remain open.
+ * dm_thin_aborted_changes() tells you if they had uncommitted changes.
+ *
+ * If this call fails it's only useful to call dm_pool_metadata_close().
+ * All other methods will fail with -EINVAL.
+ */
+int dm_pool_abort_metadata(struct dm_pool_metadata *pmd);
+
+/*
+ * Set/get userspace transaction id.
+ */
+int dm_pool_set_metadata_transaction_id(struct dm_pool_metadata *pmd,
+ uint64_t current_id,
+ uint64_t new_id);
+
+int dm_pool_get_metadata_transaction_id(struct dm_pool_metadata *pmd,
+ uint64_t *result);
+
+/*
+ * Hold/get root for userspace transaction.
+ *
+ * The metadata snapshot is a copy of the current superblock (minus the
+ * space maps). Userland can access the data structures for READ
+ * operations only. A small performance hit is incurred by providing this
+ * copy of the metadata to userland due to extra copy-on-write operations
+ * on the metadata nodes. Release this as soon as you finish with it.
+ */
+int dm_pool_reserve_metadata_snap(struct dm_pool_metadata *pmd);
+int dm_pool_release_metadata_snap(struct dm_pool_metadata *pmd);
+
+int dm_pool_get_metadata_snap(struct dm_pool_metadata *pmd,
+ dm_block_t *result);
+
+/*
+ * Actions on a single virtual device.
+ */
+
+/*
+ * Opening the same device more than once will fail with -EBUSY.
+ */
+int dm_pool_open_thin_device(struct dm_pool_metadata *pmd, dm_thin_id dev,
+ struct dm_thin_device **td);
+
+int dm_pool_close_thin_device(struct dm_thin_device *td);
+
+dm_thin_id dm_thin_dev_id(struct dm_thin_device *td);
+
+struct dm_thin_lookup_result {
+ dm_block_t block;
+ bool shared:1;
+};
+
+/*
+ * Returns:
+ * -EWOULDBLOCK iff @can_issue_io is set and would issue IO
+ * -ENODATA iff that mapping is not present.
+ * 0 success
+ */
+int dm_thin_find_block(struct dm_thin_device *td, dm_block_t block,
+ int can_issue_io, struct dm_thin_lookup_result *result);
+
+/*
+ * Obtain an unused block.
+ */
+int dm_pool_alloc_data_block(struct dm_pool_metadata *pmd, dm_block_t *result);
+
+/*
+ * Insert or remove block.
+ */
+int dm_thin_insert_block(struct dm_thin_device *td, dm_block_t block,
+ dm_block_t data_block);
+
+int dm_thin_remove_block(struct dm_thin_device *td, dm_block_t block);
+
+/*
+ * Queries.
+ */
+bool dm_thin_changed_this_transaction(struct dm_thin_device *td);
+
+bool dm_pool_changed_this_transaction(struct dm_pool_metadata *pmd);
+
+bool dm_thin_aborted_changes(struct dm_thin_device *td);
+
+int dm_thin_get_highest_mapped_block(struct dm_thin_device *td,
+ dm_block_t *highest_mapped);
+
+int dm_thin_get_mapped_count(struct dm_thin_device *td, dm_block_t *result);
+
+int dm_pool_get_free_block_count(struct dm_pool_metadata *pmd,
+ dm_block_t *result);
+
+int dm_pool_get_free_metadata_block_count(struct dm_pool_metadata *pmd,
+ dm_block_t *result);
+
+int dm_pool_get_metadata_dev_size(struct dm_pool_metadata *pmd,
+ dm_block_t *result);
+
+int dm_pool_get_data_dev_size(struct dm_pool_metadata *pmd, dm_block_t *result);
+
+int dm_pool_block_is_used(struct dm_pool_metadata *pmd, dm_block_t b, bool *result);
+
+/*
+ * Returns -ENOSPC if the new size is too small and already allocated
+ * blocks would be lost.
+ */
+int dm_pool_resize_data_dev(struct dm_pool_metadata *pmd, dm_block_t new_size);
+int dm_pool_resize_metadata_dev(struct dm_pool_metadata *pmd, dm_block_t new_size);
+
+/*
+ * Flicks the underlying block manager into read only mode, so you know
+ * that nothing is changing.
+ */
+void dm_pool_metadata_read_only(struct dm_pool_metadata *pmd);
+void dm_pool_metadata_read_write(struct dm_pool_metadata *pmd);
+
+int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd,
+ dm_block_t threshold,
+ dm_sm_threshold_fn fn,
+ void *context);
+
+/*
+ * Updates the superblock immediately.
+ */
+int dm_pool_metadata_set_needs_check(struct dm_pool_metadata *pmd);
+bool dm_pool_metadata_needs_check(struct dm_pool_metadata *pmd);
+
+/*
+ * Issue any prefetches that may be useful.
+ */
+void dm_pool_issue_prefetches(struct dm_pool_metadata *pmd);
+
+/*----------------------------------------------------------------*/
+
+#endif
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
new file mode 100644
index 000000000..e22e6c892
--- /dev/null
+++ b/drivers/md/dm-thin.c
@@ -0,0 +1,4107 @@
+/*
+ * Copyright (C) 2011-2012 Red Hat UK.
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm-thin-metadata.h"
+#include "dm-bio-prison.h"
+#include "dm.h"
+
+#include <linux/device-mapper.h>
+#include <linux/dm-io.h>
+#include <linux/dm-kcopyd.h>
+#include <linux/jiffies.h>
+#include <linux/log2.h>
+#include <linux/list.h>
+#include <linux/rculist.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/sort.h>
+#include <linux/rbtree.h>
+
+#define DM_MSG_PREFIX "thin"
+
+/*
+ * Tunable constants
+ */
+#define ENDIO_HOOK_POOL_SIZE 1024
+#define MAPPING_POOL_SIZE 1024
+#define COMMIT_PERIOD HZ
+#define NO_SPACE_TIMEOUT_SECS 60
+
+static unsigned no_space_timeout_secs = NO_SPACE_TIMEOUT_SECS;
+
+DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(snapshot_copy_throttle,
+ "A percentage of time allocated for copy on write");
+
+/*
+ * The block size of the device holding pool data must be
+ * between 64KB and 1GB.
+ */
+#define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (64 * 1024 >> SECTOR_SHIFT)
+#define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)
+
+/*
+ * Device id is restricted to 24 bits.
+ */
+#define MAX_DEV_ID ((1 << 24) - 1)
+
+/*
+ * How do we handle breaking sharing of data blocks?
+ * =================================================
+ *
+ * We use a standard copy-on-write btree to store the mappings for the
+ * devices (note I'm talking about copy-on-write of the metadata here, not
+ * the data). When you take an internal snapshot you clone the root node
+ * of the origin btree. After this there is no concept of an origin or a
+ * snapshot. They are just two device trees that happen to point to the
+ * same data blocks.
+ *
+ * When we get a write in we decide if it's to a shared data block using
+ * some timestamp magic. If it is, we have to break sharing.
+ *
+ * Let's say we write to a shared block in what was the origin. The
+ * steps are:
+ *
+ * i) plug io further to this physical block. (see bio_prison code).
+ *
+ * ii) quiesce any read io to that shared data block. Obviously
+ * including all devices that share this block. (see dm_deferred_set code)
+ *
+ * iii) copy the data block to a newly allocate block. This step can be
+ * missed out if the io covers the block. (schedule_copy).
+ *
+ * iv) insert the new mapping into the origin's btree
+ * (process_prepared_mapping). This act of inserting breaks some
+ * sharing of btree nodes between the two devices. Breaking sharing only
+ * effects the btree of that specific device. Btrees for the other
+ * devices that share the block never change. The btree for the origin
+ * device as it was after the last commit is untouched, ie. we're using
+ * persistent data structures in the functional programming sense.
+ *
+ * v) unplug io to this physical block, including the io that triggered
+ * the breaking of sharing.
+ *
+ * Steps (ii) and (iii) occur in parallel.
+ *
+ * The metadata _doesn't_ need to be committed before the io continues. We
+ * get away with this because the io is always written to a _new_ block.
+ * If there's a crash, then:
+ *
+ * - The origin mapping will point to the old origin block (the shared
+ * one). This will contain the data as it was before the io that triggered
+ * the breaking of sharing came in.
+ *
+ * - The snap mapping still points to the old block. As it would after
+ * the commit.
+ *
+ * The downside of this scheme is the timestamp magic isn't perfect, and
+ * will continue to think that data block in the snapshot device is shared
+ * even after the write to the origin has broken sharing. I suspect data
+ * blocks will typically be shared by many different devices, so we're
+ * breaking sharing n + 1 times, rather than n, where n is the number of
+ * devices that reference this data block. At the moment I think the
+ * benefits far, far outweigh the disadvantages.
+ */
+
+/*----------------------------------------------------------------*/
+
+/*
+ * Key building.
+ */
+static void build_data_key(struct dm_thin_device *td,
+ dm_block_t b, struct dm_cell_key *key)
+{
+ key->virtual = 0;
+ key->dev = dm_thin_dev_id(td);
+ key->block_begin = b;
+ key->block_end = b + 1ULL;
+}
+
+static void build_virtual_key(struct dm_thin_device *td, dm_block_t b,
+ struct dm_cell_key *key)
+{
+ key->virtual = 1;
+ key->dev = dm_thin_dev_id(td);
+ key->block_begin = b;
+ key->block_end = b + 1ULL;
+}
+
+/*----------------------------------------------------------------*/
+
+#define THROTTLE_THRESHOLD (1 * HZ)
+
+struct throttle {
+ struct rw_semaphore lock;
+ unsigned long threshold;
+ bool throttle_applied;
+};
+
+static void throttle_init(struct throttle *t)
+{
+ init_rwsem(&t->lock);
+ t->throttle_applied = false;
+}
+
+static void throttle_work_start(struct throttle *t)
+{
+ t->threshold = jiffies + THROTTLE_THRESHOLD;
+}
+
+static void throttle_work_update(struct throttle *t)
+{
+ if (!t->throttle_applied && jiffies > t->threshold) {
+ down_write(&t->lock);
+ t->throttle_applied = true;
+ }
+}
+
+static void throttle_work_complete(struct throttle *t)
+{
+ if (t->throttle_applied) {
+ t->throttle_applied = false;
+ up_write(&t->lock);
+ }
+}
+
+static void throttle_lock(struct throttle *t)
+{
+ down_read(&t->lock);
+}
+
+static void throttle_unlock(struct throttle *t)
+{
+ up_read(&t->lock);
+}
+
+/*----------------------------------------------------------------*/
+
+/*
+ * A pool device ties together a metadata device and a data device. It
+ * also provides the interface for creating and destroying internal
+ * devices.
+ */
+struct dm_thin_new_mapping;
+
+/*
+ * The pool runs in 4 modes. Ordered in degraded order for comparisons.
+ */
+enum pool_mode {
+ PM_WRITE, /* metadata may be changed */
+ PM_OUT_OF_DATA_SPACE, /* metadata may be changed, though data may not be allocated */
+ PM_READ_ONLY, /* metadata may not be changed */
+ PM_FAIL, /* all I/O fails */
+};
+
+struct pool_features {
+ enum pool_mode mode;
+
+ bool zero_new_blocks:1;
+ bool discard_enabled:1;
+ bool discard_passdown:1;
+ bool error_if_no_space:1;
+};
+
+struct thin_c;
+typedef void (*process_bio_fn)(struct thin_c *tc, struct bio *bio);
+typedef void (*process_cell_fn)(struct thin_c *tc, struct dm_bio_prison_cell *cell);
+typedef void (*process_mapping_fn)(struct dm_thin_new_mapping *m);
+
+#define CELL_SORT_ARRAY_SIZE 8192
+
+struct pool {
+ struct list_head list;
+ struct dm_target *ti; /* Only set if a pool target is bound */
+
+ struct mapped_device *pool_md;
+ struct block_device *md_dev;
+ struct dm_pool_metadata *pmd;
+
+ dm_block_t low_water_blocks;
+ uint32_t sectors_per_block;
+ int sectors_per_block_shift;
+
+ struct pool_features pf;
+ bool low_water_triggered:1; /* A dm event has been sent */
+ bool suspended:1;
+
+ struct dm_bio_prison *prison;
+ struct dm_kcopyd_client *copier;
+
+ struct workqueue_struct *wq;
+ struct throttle throttle;
+ struct work_struct worker;
+ struct delayed_work waker;
+ struct delayed_work no_space_timeout;
+
+ unsigned long last_commit_jiffies;
+ unsigned ref_count;
+
+ spinlock_t lock;
+ struct bio_list deferred_flush_bios;
+ struct list_head prepared_mappings;
+ struct list_head prepared_discards;
+ struct list_head active_thins;
+
+ struct dm_deferred_set *shared_read_ds;
+ struct dm_deferred_set *all_io_ds;
+
+ struct dm_thin_new_mapping *next_mapping;
+ mempool_t *mapping_pool;
+
+ process_bio_fn process_bio;
+ process_bio_fn process_discard;
+
+ process_cell_fn process_cell;
+ process_cell_fn process_discard_cell;
+
+ process_mapping_fn process_prepared_mapping;
+ process_mapping_fn process_prepared_discard;
+
+ struct dm_bio_prison_cell **cell_sort_array;
+};
+
+static enum pool_mode get_pool_mode(struct pool *pool);
+static void metadata_operation_failed(struct pool *pool, const char *op, int r);
+
+/*
+ * Target context for a pool.
+ */
+struct pool_c {
+ struct dm_target *ti;
+ struct pool *pool;
+ struct dm_dev *data_dev;
+ struct dm_dev *metadata_dev;
+ struct dm_target_callbacks callbacks;
+
+ dm_block_t low_water_blocks;
+ struct pool_features requested_pf; /* Features requested during table load */
+ struct pool_features adjusted_pf; /* Features used after adjusting for constituent devices */
+};
+
+/*
+ * Target context for a thin.
+ */
+struct thin_c {
+ struct list_head list;
+ struct dm_dev *pool_dev;
+ struct dm_dev *origin_dev;
+ sector_t origin_size;
+ dm_thin_id dev_id;
+
+ struct pool *pool;
+ struct dm_thin_device *td;
+ struct mapped_device *thin_md;
+
+ bool requeue_mode:1;
+ spinlock_t lock;
+ struct list_head deferred_cells;
+ struct bio_list deferred_bio_list;
+ struct bio_list retry_on_resume_list;
+ struct rb_root sort_bio_list; /* sorted list of deferred bios */
+
+ /*
+ * Ensures the thin is not destroyed until the worker has finished
+ * iterating the active_thins list.
+ */
+ atomic_t refcount;
+ struct completion can_destroy;
+};
+
+/*----------------------------------------------------------------*/
+
+/*
+ * wake_worker() is used when new work is queued and when pool_resume is
+ * ready to continue deferred IO processing.
+ */
+static void wake_worker(struct pool *pool)
+{
+ queue_work(pool->wq, &pool->worker);
+}
+
+/*----------------------------------------------------------------*/
+
+static int bio_detain(struct pool *pool, struct dm_cell_key *key, struct bio *bio,
+ struct dm_bio_prison_cell **cell_result)
+{
+ int r;
+ struct dm_bio_prison_cell *cell_prealloc;
+
+ /*
+ * Allocate a cell from the prison's mempool.
+ * This might block but it can't fail.
+ */
+ cell_prealloc = dm_bio_prison_alloc_cell(pool->prison, GFP_NOIO);
+
+ r = dm_bio_detain(pool->prison, key, bio, cell_prealloc, cell_result);
+ if (r)
+ /*
+ * We reused an old cell; we can get rid of
+ * the new one.
+ */
+ dm_bio_prison_free_cell(pool->prison, cell_prealloc);
+
+ return r;
+}
+
+static void cell_release(struct pool *pool,
+ struct dm_bio_prison_cell *cell,
+ struct bio_list *bios)
+{
+ dm_cell_release(pool->prison, cell, bios);
+ dm_bio_prison_free_cell(pool->prison, cell);
+}
+
+static void cell_visit_release(struct pool *pool,
+ void (*fn)(void *, struct dm_bio_prison_cell *),
+ void *context,
+ struct dm_bio_prison_cell *cell)
+{
+ dm_cell_visit_release(pool->prison, fn, context, cell);
+ dm_bio_prison_free_cell(pool->prison, cell);
+}
+
+static void cell_release_no_holder(struct pool *pool,
+ struct dm_bio_prison_cell *cell,
+ struct bio_list *bios)
+{
+ dm_cell_release_no_holder(pool->prison, cell, bios);
+ dm_bio_prison_free_cell(pool->prison, cell);
+}
+
+static void cell_error_with_code(struct pool *pool,
+ struct dm_bio_prison_cell *cell, int error_code)
+{
+ dm_cell_error(pool->prison, cell, error_code);
+ dm_bio_prison_free_cell(pool->prison, cell);
+}
+
+static void cell_error(struct pool *pool, struct dm_bio_prison_cell *cell)
+{
+ cell_error_with_code(pool, cell, -EIO);
+}
+
+static void cell_success(struct pool *pool, struct dm_bio_prison_cell *cell)
+{
+ cell_error_with_code(pool, cell, 0);
+}
+
+static void cell_requeue(struct pool *pool, struct dm_bio_prison_cell *cell)
+{
+ cell_error_with_code(pool, cell, DM_ENDIO_REQUEUE);
+}
+
+/*----------------------------------------------------------------*/
+
+/*
+ * A global list of pools that uses a struct mapped_device as a key.
+ */
+static struct dm_thin_pool_table {
+ struct mutex mutex;
+ struct list_head pools;
+} dm_thin_pool_table;
+
+static void pool_table_init(void)
+{
+ mutex_init(&dm_thin_pool_table.mutex);
+ INIT_LIST_HEAD(&dm_thin_pool_table.pools);
+}
+
+static void __pool_table_insert(struct pool *pool)
+{
+ BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
+ list_add(&pool->list, &dm_thin_pool_table.pools);
+}
+
+static void __pool_table_remove(struct pool *pool)
+{
+ BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
+ list_del(&pool->list);
+}
+
+static struct pool *__pool_table_lookup(struct mapped_device *md)
+{
+ struct pool *pool = NULL, *tmp;
+
+ BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
+
+ list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) {
+ if (tmp->pool_md == md) {
+ pool = tmp;
+ break;
+ }
+ }
+
+ return pool;
+}
+
+static struct pool *__pool_table_lookup_metadata_dev(struct block_device *md_dev)
+{
+ struct pool *pool = NULL, *tmp;
+
+ BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
+
+ list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) {
+ if (tmp->md_dev == md_dev) {
+ pool = tmp;
+ break;
+ }
+ }
+
+ return pool;
+}
+
+/*----------------------------------------------------------------*/
+
+struct dm_thin_endio_hook {
+ struct thin_c *tc;
+ struct dm_deferred_entry *shared_read_entry;
+ struct dm_deferred_entry *all_io_entry;
+ struct dm_thin_new_mapping *overwrite_mapping;
+ struct rb_node rb_node;
+};
+
+static void __merge_bio_list(struct bio_list *bios, struct bio_list *master)
+{
+ bio_list_merge(bios, master);
+ bio_list_init(master);
+}
+
+static void error_bio_list(struct bio_list *bios, int error)
+{
+ struct bio *bio;
+
+ while ((bio = bio_list_pop(bios)))
+ bio_endio(bio, error);
+}
+
+static void error_thin_bio_list(struct thin_c *tc, struct bio_list *master, int error)
+{
+ struct bio_list bios;
+ unsigned long flags;
+
+ bio_list_init(&bios);
+
+ spin_lock_irqsave(&tc->lock, flags);
+ __merge_bio_list(&bios, master);
+ spin_unlock_irqrestore(&tc->lock, flags);
+
+ error_bio_list(&bios, error);
+}
+
+static void requeue_deferred_cells(struct thin_c *tc)
+{
+ struct pool *pool = tc->pool;
+ unsigned long flags;
+ struct list_head cells;
+ struct dm_bio_prison_cell *cell, *tmp;
+
+ INIT_LIST_HEAD(&cells);
+
+ spin_lock_irqsave(&tc->lock, flags);
+ list_splice_init(&tc->deferred_cells, &cells);
+ spin_unlock_irqrestore(&tc->lock, flags);
+
+ list_for_each_entry_safe(cell, tmp, &cells, user_list)
+ cell_requeue(pool, cell);
+}
+
+static void requeue_io(struct thin_c *tc)
+{
+ struct bio_list bios;
+ unsigned long flags;
+
+ bio_list_init(&bios);
+
+ spin_lock_irqsave(&tc->lock, flags);
+ __merge_bio_list(&bios, &tc->deferred_bio_list);
+ __merge_bio_list(&bios, &tc->retry_on_resume_list);
+ spin_unlock_irqrestore(&tc->lock, flags);
+
+ error_bio_list(&bios, DM_ENDIO_REQUEUE);
+ requeue_deferred_cells(tc);
+}
+
+static void error_retry_list(struct pool *pool)
+{
+ struct thin_c *tc;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(tc, &pool->active_thins, list)
+ error_thin_bio_list(tc, &tc->retry_on_resume_list, -EIO);
+ rcu_read_unlock();
+}
+
+/*
+ * This section of code contains the logic for processing a thin device's IO.
+ * Much of the code depends on pool object resources (lists, workqueues, etc)
+ * but most is exclusively called from the thin target rather than the thin-pool
+ * target.
+ */
+
+static bool block_size_is_power_of_two(struct pool *pool)
+{
+ return pool->sectors_per_block_shift >= 0;
+}
+
+static dm_block_t get_bio_block(struct thin_c *tc, struct bio *bio)
+{
+ struct pool *pool = tc->pool;
+ sector_t block_nr = bio->bi_iter.bi_sector;
+
+ if (block_size_is_power_of_two(pool))
+ block_nr >>= pool->sectors_per_block_shift;
+ else
+ (void) sector_div(block_nr, pool->sectors_per_block);
+
+ return block_nr;
+}
+
+static void remap(struct thin_c *tc, struct bio *bio, dm_block_t block)
+{
+ struct pool *pool = tc->pool;
+ sector_t bi_sector = bio->bi_iter.bi_sector;
+
+ bio->bi_bdev = tc->pool_dev->bdev;
+ if (block_size_is_power_of_two(pool))
+ bio->bi_iter.bi_sector =
+ (block << pool->sectors_per_block_shift) |
+ (bi_sector & (pool->sectors_per_block - 1));
+ else
+ bio->bi_iter.bi_sector = (block * pool->sectors_per_block) +
+ sector_div(bi_sector, pool->sectors_per_block);
+}
+
+static void remap_to_origin(struct thin_c *tc, struct bio *bio)
+{
+ bio->bi_bdev = tc->origin_dev->bdev;
+}
+
+static int bio_triggers_commit(struct thin_c *tc, struct bio *bio)
+{
+ return (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) &&
+ dm_thin_changed_this_transaction(tc->td);
+}
+
+static void inc_all_io_entry(struct pool *pool, struct bio *bio)
+{
+ struct dm_thin_endio_hook *h;
+
+ if (bio->bi_rw & REQ_DISCARD)
+ return;
+
+ h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
+ h->all_io_entry = dm_deferred_entry_inc(pool->all_io_ds);
+}
+
+static void issue(struct thin_c *tc, struct bio *bio)
+{
+ struct pool *pool = tc->pool;
+ unsigned long flags;
+
+ if (!bio_triggers_commit(tc, bio)) {
+ generic_make_request(bio);
+ return;
+ }
+
+ /*
+ * Complete bio with an error if earlier I/O caused changes to
+ * the metadata that can't be committed e.g, due to I/O errors
+ * on the metadata device.
+ */
+ if (dm_thin_aborted_changes(tc->td)) {
+ bio_io_error(bio);
+ return;
+ }
+
+ /*
+ * Batch together any bios that trigger commits and then issue a
+ * single commit for them in process_deferred_bios().
+ */
+ spin_lock_irqsave(&pool->lock, flags);
+ bio_list_add(&pool->deferred_flush_bios, bio);
+ spin_unlock_irqrestore(&pool->lock, flags);
+}
+
+static void remap_to_origin_and_issue(struct thin_c *tc, struct bio *bio)
+{
+ remap_to_origin(tc, bio);
+ issue(tc, bio);
+}
+
+static void remap_and_issue(struct thin_c *tc, struct bio *bio,
+ dm_block_t block)
+{
+ remap(tc, bio, block);
+ issue(tc, bio);
+}
+
+/*----------------------------------------------------------------*/
+
+/*
+ * Bio endio functions.
+ */
+struct dm_thin_new_mapping {
+ struct list_head list;
+
+ bool pass_discard:1;
+ bool definitely_not_shared:1;
+
+ /*
+ * Track quiescing, copying and zeroing preparation actions. When this
+ * counter hits zero the block is prepared and can be inserted into the
+ * btree.
+ */
+ atomic_t prepare_actions;
+
+ int err;
+ struct thin_c *tc;
+ dm_block_t virt_block;
+ dm_block_t data_block;
+ struct dm_bio_prison_cell *cell, *cell2;
+
+ /*
+ * If the bio covers the whole area of a block then we can avoid
+ * zeroing or copying. Instead this bio is hooked. The bio will
+ * still be in the cell, so care has to be taken to avoid issuing
+ * the bio twice.
+ */
+ struct bio *bio;
+ bio_end_io_t *saved_bi_end_io;
+};
+
+static void __complete_mapping_preparation(struct dm_thin_new_mapping *m)
+{
+ struct pool *pool = m->tc->pool;
+
+ if (atomic_dec_and_test(&m->prepare_actions)) {
+ list_add_tail(&m->list, &pool->prepared_mappings);
+ wake_worker(pool);
+ }
+}
+
+static void complete_mapping_preparation(struct dm_thin_new_mapping *m)
+{
+ unsigned long flags;
+ struct pool *pool = m->tc->pool;
+
+ spin_lock_irqsave(&pool->lock, flags);
+ __complete_mapping_preparation(m);
+ spin_unlock_irqrestore(&pool->lock, flags);
+}
+
+static void copy_complete(int read_err, unsigned long write_err, void *context)
+{
+ struct dm_thin_new_mapping *m = context;
+
+ m->err = read_err || write_err ? -EIO : 0;
+ complete_mapping_preparation(m);
+}
+
+static void overwrite_endio(struct bio *bio, int err)
+{
+ struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
+ struct dm_thin_new_mapping *m = h->overwrite_mapping;
+
+ m->err = err;
+ complete_mapping_preparation(m);
+}
+
+/*----------------------------------------------------------------*/
+
+/*
+ * Workqueue.
+ */
+
+/*
+ * Prepared mapping jobs.
+ */
+
+/*
+ * This sends the bios in the cell, except the original holder, back
+ * to the deferred_bios list.
+ */
+static void cell_defer_no_holder(struct thin_c *tc, struct dm_bio_prison_cell *cell)
+{
+ struct pool *pool = tc->pool;
+ unsigned long flags;
+
+ spin_lock_irqsave(&tc->lock, flags);
+ cell_release_no_holder(pool, cell, &tc->deferred_bio_list);
+ spin_unlock_irqrestore(&tc->lock, flags);
+
+ wake_worker(pool);
+}
+
+static void thin_defer_bio(struct thin_c *tc, struct bio *bio);
+
+struct remap_info {
+ struct thin_c *tc;
+ struct bio_list defer_bios;
+ struct bio_list issue_bios;
+};
+
+static void __inc_remap_and_issue_cell(void *context,
+ struct dm_bio_prison_cell *cell)
+{
+ struct remap_info *info = context;
+ struct bio *bio;
+
+ while ((bio = bio_list_pop(&cell->bios))) {
+ if (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA))
+ bio_list_add(&info->defer_bios, bio);
+ else {
+ inc_all_io_entry(info->tc->pool, bio);
+
+ /*
+ * We can't issue the bios with the bio prison lock
+ * held, so we add them to a list to issue on
+ * return from this function.
+ */
+ bio_list_add(&info->issue_bios, bio);
+ }
+ }
+}
+
+static void inc_remap_and_issue_cell(struct thin_c *tc,
+ struct dm_bio_prison_cell *cell,
+ dm_block_t block)
+{
+ struct bio *bio;
+ struct remap_info info;
+
+ info.tc = tc;
+ bio_list_init(&info.defer_bios);
+ bio_list_init(&info.issue_bios);
+
+ /*
+ * We have to be careful to inc any bios we're about to issue
+ * before the cell is released, and avoid a race with new bios
+ * being added to the cell.
+ */
+ cell_visit_release(tc->pool, __inc_remap_and_issue_cell,
+ &info, cell);
+
+ while ((bio = bio_list_pop(&info.defer_bios)))
+ thin_defer_bio(tc, bio);
+
+ while ((bio = bio_list_pop(&info.issue_bios)))
+ remap_and_issue(info.tc, bio, block);
+}
+
+static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m)
+{
+ if (m->bio) {
+ m->bio->bi_end_io = m->saved_bi_end_io;
+ atomic_inc(&m->bio->bi_remaining);
+ }
+ cell_error(m->tc->pool, m->cell);
+ list_del(&m->list);
+ mempool_free(m, m->tc->pool->mapping_pool);
+}
+
+static void process_prepared_mapping(struct dm_thin_new_mapping *m)
+{
+ struct thin_c *tc = m->tc;
+ struct pool *pool = tc->pool;
+ struct bio *bio;
+ int r;
+
+ bio = m->bio;
+ if (bio) {
+ bio->bi_end_io = m->saved_bi_end_io;
+ atomic_inc(&bio->bi_remaining);
+ }
+
+ if (m->err) {
+ cell_error(pool, m->cell);
+ goto out;
+ }
+
+ /*
+ * Commit the prepared block into the mapping btree.
+ * Any I/O for this block arriving after this point will get
+ * remapped to it directly.
+ */
+ r = dm_thin_insert_block(tc->td, m->virt_block, m->data_block);
+ if (r) {
+ metadata_operation_failed(pool, "dm_thin_insert_block", r);
+ cell_error(pool, m->cell);
+ goto out;
+ }
+
+ /*
+ * Release any bios held while the block was being provisioned.
+ * If we are processing a write bio that completely covers the block,
+ * we already processed it so can ignore it now when processing
+ * the bios in the cell.
+ */
+ if (bio) {
+ inc_remap_and_issue_cell(tc, m->cell, m->data_block);
+ bio_endio(bio, 0);
+ } else {
+ inc_all_io_entry(tc->pool, m->cell->holder);
+ remap_and_issue(tc, m->cell->holder, m->data_block);
+ inc_remap_and_issue_cell(tc, m->cell, m->data_block);
+ }
+
+out:
+ list_del(&m->list);
+ mempool_free(m, pool->mapping_pool);
+}
+
+static void process_prepared_discard_fail(struct dm_thin_new_mapping *m)
+{
+ struct thin_c *tc = m->tc;
+
+ bio_io_error(m->bio);
+ cell_defer_no_holder(tc, m->cell);
+ cell_defer_no_holder(tc, m->cell2);
+ mempool_free(m, tc->pool->mapping_pool);
+}
+
+static void process_prepared_discard_passdown(struct dm_thin_new_mapping *m)
+{
+ struct thin_c *tc = m->tc;
+
+ inc_all_io_entry(tc->pool, m->bio);
+ cell_defer_no_holder(tc, m->cell);
+ cell_defer_no_holder(tc, m->cell2);
+
+ if (m->pass_discard)
+ if (m->definitely_not_shared)
+ remap_and_issue(tc, m->bio, m->data_block);
+ else {
+ bool used = false;
+ if (dm_pool_block_is_used(tc->pool->pmd, m->data_block, &used) || used)
+ bio_endio(m->bio, 0);
+ else
+ remap_and_issue(tc, m->bio, m->data_block);
+ }
+ else
+ bio_endio(m->bio, 0);
+
+ mempool_free(m, tc->pool->mapping_pool);
+}
+
+static void process_prepared_discard(struct dm_thin_new_mapping *m)
+{
+ int r;
+ struct thin_c *tc = m->tc;
+
+ r = dm_thin_remove_block(tc->td, m->virt_block);
+ if (r)
+ DMERR_LIMIT("dm_thin_remove_block() failed");
+
+ process_prepared_discard_passdown(m);
+}
+
+static void process_prepared(struct pool *pool, struct list_head *head,
+ process_mapping_fn *fn)
+{
+ unsigned long flags;
+ struct list_head maps;
+ struct dm_thin_new_mapping *m, *tmp;
+
+ INIT_LIST_HEAD(&maps);
+ spin_lock_irqsave(&pool->lock, flags);
+ list_splice_init(head, &maps);
+ spin_unlock_irqrestore(&pool->lock, flags);
+
+ list_for_each_entry_safe(m, tmp, &maps, list)
+ (*fn)(m);
+}
+
+/*
+ * Deferred bio jobs.
+ */
+static int io_overlaps_block(struct pool *pool, struct bio *bio)
+{
+ return bio->bi_iter.bi_size ==
+ (pool->sectors_per_block << SECTOR_SHIFT);
+}
+
+static int io_overwrites_block(struct pool *pool, struct bio *bio)
+{
+ return (bio_data_dir(bio) == WRITE) &&
+ io_overlaps_block(pool, bio);
+}
+
+static void save_and_set_endio(struct bio *bio, bio_end_io_t **save,
+ bio_end_io_t *fn)
+{
+ *save = bio->bi_end_io;
+ bio->bi_end_io = fn;
+}
+
+static int ensure_next_mapping(struct pool *pool)
+{
+ if (pool->next_mapping)
+ return 0;
+
+ pool->next_mapping = mempool_alloc(pool->mapping_pool, GFP_ATOMIC);
+
+ return pool->next_mapping ? 0 : -ENOMEM;
+}
+
+static struct dm_thin_new_mapping *get_next_mapping(struct pool *pool)
+{
+ struct dm_thin_new_mapping *m = pool->next_mapping;
+
+ BUG_ON(!pool->next_mapping);
+
+ memset(m, 0, sizeof(struct dm_thin_new_mapping));
+ INIT_LIST_HEAD(&m->list);
+ m->bio = NULL;
+
+ pool->next_mapping = NULL;
+
+ return m;
+}
+
+static void ll_zero(struct thin_c *tc, struct dm_thin_new_mapping *m,
+ sector_t begin, sector_t end)
+{
+ int r;
+ struct dm_io_region to;
+
+ to.bdev = tc->pool_dev->bdev;
+ to.sector = begin;
+ to.count = end - begin;
+
+ r = dm_kcopyd_zero(tc->pool->copier, 1, &to, 0, copy_complete, m);
+ if (r < 0) {
+ DMERR_LIMIT("dm_kcopyd_zero() failed");
+ copy_complete(1, 1, m);
+ }
+}
+
+static void remap_and_issue_overwrite(struct thin_c *tc, struct bio *bio,
+ dm_block_t data_block,
+ struct dm_thin_new_mapping *m)
+{
+ struct pool *pool = tc->pool;
+ struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
+
+ h->overwrite_mapping = m;
+ m->bio = bio;
+ save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
+ inc_all_io_entry(pool, bio);
+ remap_and_issue(tc, bio, data_block);
+}
+
+/*
+ * A partial copy also needs to zero the uncopied region.
+ */
+static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
+ struct dm_dev *origin, dm_block_t data_origin,
+ dm_block_t data_dest,
+ struct dm_bio_prison_cell *cell, struct bio *bio,
+ sector_t len)
+{
+ int r;
+ struct pool *pool = tc->pool;
+ struct dm_thin_new_mapping *m = get_next_mapping(pool);
+
+ m->tc = tc;
+ m->virt_block = virt_block;
+ m->data_block = data_dest;
+ m->cell = cell;
+
+ /*
+ * quiesce action + copy action + an extra reference held for the
+ * duration of this function (we may need to inc later for a
+ * partial zero).
+ */
+ atomic_set(&m->prepare_actions, 3);
+
+ if (!dm_deferred_set_add_work(pool->shared_read_ds, &m->list))
+ complete_mapping_preparation(m); /* already quiesced */
+
+ /*
+ * IO to pool_dev remaps to the pool target's data_dev.
+ *
+ * If the whole block of data is being overwritten, we can issue the
+ * bio immediately. Otherwise we use kcopyd to clone the data first.
+ */
+ if (io_overwrites_block(pool, bio))
+ remap_and_issue_overwrite(tc, bio, data_dest, m);
+ else {
+ struct dm_io_region from, to;
+
+ from.bdev = origin->bdev;
+ from.sector = data_origin * pool->sectors_per_block;
+ from.count = len;
+
+ to.bdev = tc->pool_dev->bdev;
+ to.sector = data_dest * pool->sectors_per_block;
+ to.count = len;
+
+ r = dm_kcopyd_copy(pool->copier, &from, 1, &to,
+ 0, copy_complete, m);
+ if (r < 0) {
+ DMERR_LIMIT("dm_kcopyd_copy() failed");
+ copy_complete(1, 1, m);
+
+ /*
+ * We allow the zero to be issued, to simplify the
+ * error path. Otherwise we'd need to start
+ * worrying about decrementing the prepare_actions
+ * counter.
+ */
+ }
+
+ /*
+ * Do we need to zero a tail region?
+ */
+ if (len < pool->sectors_per_block && pool->pf.zero_new_blocks) {
+ atomic_inc(&m->prepare_actions);
+ ll_zero(tc, m,
+ data_dest * pool->sectors_per_block + len,
+ (data_dest + 1) * pool->sectors_per_block);
+ }
+ }
+
+ complete_mapping_preparation(m); /* drop our ref */
+}
+
+static void schedule_internal_copy(struct thin_c *tc, dm_block_t virt_block,
+ dm_block_t data_origin, dm_block_t data_dest,
+ struct dm_bio_prison_cell *cell, struct bio *bio)
+{
+ schedule_copy(tc, virt_block, tc->pool_dev,
+ data_origin, data_dest, cell, bio,
+ tc->pool->sectors_per_block);
+}
+
+static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
+ dm_block_t data_block, struct dm_bio_prison_cell *cell,
+ struct bio *bio)
+{
+ struct pool *pool = tc->pool;
+ struct dm_thin_new_mapping *m = get_next_mapping(pool);
+
+ atomic_set(&m->prepare_actions, 1); /* no need to quiesce */
+ m->tc = tc;
+ m->virt_block = virt_block;
+ m->data_block = data_block;
+ m->cell = cell;
+
+ /*
+ * If the whole block of data is being overwritten or we are not
+ * zeroing pre-existing data, we can issue the bio immediately.
+ * Otherwise we use kcopyd to zero the data first.
+ */
+ if (!pool->pf.zero_new_blocks)
+ process_prepared_mapping(m);
+
+ else if (io_overwrites_block(pool, bio))
+ remap_and_issue_overwrite(tc, bio, data_block, m);
+
+ else
+ ll_zero(tc, m,
+ data_block * pool->sectors_per_block,
+ (data_block + 1) * pool->sectors_per_block);
+}
+
+static void schedule_external_copy(struct thin_c *tc, dm_block_t virt_block,
+ dm_block_t data_dest,
+ struct dm_bio_prison_cell *cell, struct bio *bio)
+{
+ struct pool *pool = tc->pool;
+ sector_t virt_block_begin = virt_block * pool->sectors_per_block;
+ sector_t virt_block_end = (virt_block + 1) * pool->sectors_per_block;
+
+ if (virt_block_end <= tc->origin_size)
+ schedule_copy(tc, virt_block, tc->origin_dev,
+ virt_block, data_dest, cell, bio,
+ pool->sectors_per_block);
+
+ else if (virt_block_begin < tc->origin_size)
+ schedule_copy(tc, virt_block, tc->origin_dev,
+ virt_block, data_dest, cell, bio,
+ tc->origin_size - virt_block_begin);
+
+ else
+ schedule_zero(tc, virt_block, data_dest, cell, bio);
+}
+
+static void set_pool_mode(struct pool *pool, enum pool_mode new_mode);
+
+static void check_for_space(struct pool *pool)
+{
+ int r;
+ dm_block_t nr_free;
+
+ if (get_pool_mode(pool) != PM_OUT_OF_DATA_SPACE)
+ return;
+
+ r = dm_pool_get_free_block_count(pool->pmd, &nr_free);
+ if (r)
+ return;
+
+ if (nr_free)
+ set_pool_mode(pool, PM_WRITE);
+}
+
+/*
+ * A non-zero return indicates read_only or fail_io mode.
+ * Many callers don't care about the return value.
+ */
+static int commit(struct pool *pool)
+{
+ int r;
+
+ if (get_pool_mode(pool) >= PM_READ_ONLY)
+ return -EINVAL;
+
+ r = dm_pool_commit_metadata(pool->pmd);
+ if (r)
+ metadata_operation_failed(pool, "dm_pool_commit_metadata", r);
+ else
+ check_for_space(pool);
+
+ return r;
+}
+
+static void check_low_water_mark(struct pool *pool, dm_block_t free_blocks)
+{
+ unsigned long flags;
+
+ if (free_blocks <= pool->low_water_blocks && !pool->low_water_triggered) {
+ DMWARN("%s: reached low water mark for data device: sending event.",
+ dm_device_name(pool->pool_md));
+ spin_lock_irqsave(&pool->lock, flags);
+ pool->low_water_triggered = true;
+ spin_unlock_irqrestore(&pool->lock, flags);
+ dm_table_event(pool->ti->table);
+ }
+}
+
+static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
+{
+ int r;
+ dm_block_t free_blocks;
+ struct pool *pool = tc->pool;
+
+ if (WARN_ON(get_pool_mode(pool) != PM_WRITE))
+ return -EINVAL;
+
+ r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
+ if (r) {
+ metadata_operation_failed(pool, "dm_pool_get_free_block_count", r);
+ return r;
+ }
+
+ check_low_water_mark(pool, free_blocks);
+
+ if (!free_blocks) {
+ /*
+ * Try to commit to see if that will free up some
+ * more space.
+ */
+ r = commit(pool);
+ if (r)
+ return r;
+
+ r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
+ if (r) {
+ metadata_operation_failed(pool, "dm_pool_get_free_block_count", r);
+ return r;
+ }
+
+ if (!free_blocks) {
+ set_pool_mode(pool, PM_OUT_OF_DATA_SPACE);
+ return -ENOSPC;
+ }
+ }
+
+ r = dm_pool_alloc_data_block(pool->pmd, result);
+ if (r) {
+ metadata_operation_failed(pool, "dm_pool_alloc_data_block", r);
+ return r;
+ }
+
+ return 0;
+}
+
+/*
+ * If we have run out of space, queue bios until the device is
+ * resumed, presumably after having been reloaded with more space.
+ */
+static void retry_on_resume(struct bio *bio)
+{
+ struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
+ struct thin_c *tc = h->tc;
+ unsigned long flags;
+
+ spin_lock_irqsave(&tc->lock, flags);
+ bio_list_add(&tc->retry_on_resume_list, bio);
+ spin_unlock_irqrestore(&tc->lock, flags);
+}
+
+static int should_error_unserviceable_bio(struct pool *pool)
+{
+ enum pool_mode m = get_pool_mode(pool);
+
+ switch (m) {
+ case PM_WRITE:
+ /* Shouldn't get here */
+ DMERR_LIMIT("bio unserviceable, yet pool is in PM_WRITE mode");
+ return -EIO;
+
+ case PM_OUT_OF_DATA_SPACE:
+ return pool->pf.error_if_no_space ? -ENOSPC : 0;
+
+ case PM_READ_ONLY:
+ case PM_FAIL:
+ return -EIO;
+ default:
+ /* Shouldn't get here */
+ DMERR_LIMIT("bio unserviceable, yet pool has an unknown mode");
+ return -EIO;
+ }
+}
+
+static void handle_unserviceable_bio(struct pool *pool, struct bio *bio)
+{
+ int error = should_error_unserviceable_bio(pool);
+
+ if (error)
+ bio_endio(bio, error);
+ else
+ retry_on_resume(bio);
+}
+
+static void retry_bios_on_resume(struct pool *pool, struct dm_bio_prison_cell *cell)
+{
+ struct bio *bio;
+ struct bio_list bios;
+ int error;
+
+ error = should_error_unserviceable_bio(pool);
+ if (error) {
+ cell_error_with_code(pool, cell, error);
+ return;
+ }
+
+ bio_list_init(&bios);
+ cell_release(pool, cell, &bios);
+
+ while ((bio = bio_list_pop(&bios)))
+ retry_on_resume(bio);
+}
+
+static void process_discard_cell(struct thin_c *tc, struct dm_bio_prison_cell *cell)
+{
+ int r;
+ struct bio *bio = cell->holder;
+ struct pool *pool = tc->pool;
+ struct dm_bio_prison_cell *cell2;
+ struct dm_cell_key key2;
+ dm_block_t block = get_bio_block(tc, bio);
+ struct dm_thin_lookup_result lookup_result;
+ struct dm_thin_new_mapping *m;
+
+ if (tc->requeue_mode) {
+ cell_requeue(pool, cell);
+ return;
+ }
+
+ r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
+ switch (r) {
+ case 0:
+ /*
+ * Check nobody is fiddling with this pool block. This can
+ * happen if someone's in the process of breaking sharing
+ * on this block.
+ */
+ build_data_key(tc->td, lookup_result.block, &key2);
+ if (bio_detain(tc->pool, &key2, bio, &cell2)) {
+ cell_defer_no_holder(tc, cell);
+ break;
+ }
+
+ if (io_overlaps_block(pool, bio)) {
+ /*
+ * IO may still be going to the destination block. We must
+ * quiesce before we can do the removal.
+ */
+ m = get_next_mapping(pool);
+ m->tc = tc;
+ m->pass_discard = pool->pf.discard_passdown;
+ m->definitely_not_shared = !lookup_result.shared;
+ m->virt_block = block;
+ m->data_block = lookup_result.block;
+ m->cell = cell;
+ m->cell2 = cell2;
+ m->bio = bio;
+
+ if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list))
+ pool->process_prepared_discard(m);
+
+ } else {
+ inc_all_io_entry(pool, bio);
+ cell_defer_no_holder(tc, cell);
+ cell_defer_no_holder(tc, cell2);
+
+ /*
+ * The DM core makes sure that the discard doesn't span
+ * a block boundary. So we submit the discard of a
+ * partial block appropriately.
+ */
+ if ((!lookup_result.shared) && pool->pf.discard_passdown)
+ remap_and_issue(tc, bio, lookup_result.block);
+ else
+ bio_endio(bio, 0);
+ }
+ break;
+
+ case -ENODATA:
+ /*
+ * It isn't provisioned, just forget it.
+ */
+ cell_defer_no_holder(tc, cell);
+ bio_endio(bio, 0);
+ break;
+
+ default:
+ DMERR_LIMIT("%s: dm_thin_find_block() failed: error = %d",
+ __func__, r);
+ cell_defer_no_holder(tc, cell);
+ bio_io_error(bio);
+ break;
+ }
+}
+
+static void process_discard_bio(struct thin_c *tc, struct bio *bio)
+{
+ struct dm_bio_prison_cell *cell;
+ struct dm_cell_key key;
+ dm_block_t block = get_bio_block(tc, bio);
+
+ build_virtual_key(tc->td, block, &key);
+ if (bio_detain(tc->pool, &key, bio, &cell))
+ return;
+
+ process_discard_cell(tc, cell);
+}
+
+static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block,
+ struct dm_cell_key *key,
+ struct dm_thin_lookup_result *lookup_result,
+ struct dm_bio_prison_cell *cell)
+{
+ int r;
+ dm_block_t data_block;
+ struct pool *pool = tc->pool;
+
+ r = alloc_data_block(tc, &data_block);
+ switch (r) {
+ case 0:
+ schedule_internal_copy(tc, block, lookup_result->block,
+ data_block, cell, bio);
+ break;
+
+ case -ENOSPC:
+ retry_bios_on_resume(pool, cell);
+ break;
+
+ default:
+ DMERR_LIMIT("%s: alloc_data_block() failed: error = %d",
+ __func__, r);
+ cell_error(pool, cell);
+ break;
+ }
+}
+
+static void __remap_and_issue_shared_cell(void *context,
+ struct dm_bio_prison_cell *cell)
+{
+ struct remap_info *info = context;
+ struct bio *bio;
+
+ while ((bio = bio_list_pop(&cell->bios))) {
+ if ((bio_data_dir(bio) == WRITE) ||
+ (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA)))
+ bio_list_add(&info->defer_bios, bio);
+ else {
+ struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));;
+
+ h->shared_read_entry = dm_deferred_entry_inc(info->tc->pool->shared_read_ds);
+ inc_all_io_entry(info->tc->pool, bio);
+ bio_list_add(&info->issue_bios, bio);
+ }
+ }
+}
+
+static void remap_and_issue_shared_cell(struct thin_c *tc,
+ struct dm_bio_prison_cell *cell,
+ dm_block_t block)
+{
+ struct bio *bio;
+ struct remap_info info;
+
+ info.tc = tc;
+ bio_list_init(&info.defer_bios);
+ bio_list_init(&info.issue_bios);
+
+ cell_visit_release(tc->pool, __remap_and_issue_shared_cell,
+ &info, cell);
+
+ while ((bio = bio_list_pop(&info.defer_bios)))
+ thin_defer_bio(tc, bio);
+
+ while ((bio = bio_list_pop(&info.issue_bios)))
+ remap_and_issue(tc, bio, block);
+}
+
+static void process_shared_bio(struct thin_c *tc, struct bio *bio,
+ dm_block_t block,
+ struct dm_thin_lookup_result *lookup_result,
+ struct dm_bio_prison_cell *virt_cell)
+{
+ struct dm_bio_prison_cell *data_cell;
+ struct pool *pool = tc->pool;
+ struct dm_cell_key key;
+
+ /*
+ * If cell is already occupied, then sharing is already in the process
+ * of being broken so we have nothing further to do here.
+ */
+ build_data_key(tc->td, lookup_result->block, &key);
+ if (bio_detain(pool, &key, bio, &data_cell)) {
+ cell_defer_no_holder(tc, virt_cell);
+ return;
+ }
+
+ if (bio_data_dir(bio) == WRITE && bio->bi_iter.bi_size) {
+ break_sharing(tc, bio, block, &key, lookup_result, data_cell);
+ cell_defer_no_holder(tc, virt_cell);
+ } else {
+ struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
+
+ h->shared_read_entry = dm_deferred_entry_inc(pool->shared_read_ds);
+ inc_all_io_entry(pool, bio);
+ remap_and_issue(tc, bio, lookup_result->block);
+
+ remap_and_issue_shared_cell(tc, data_cell, lookup_result->block);
+ remap_and_issue_shared_cell(tc, virt_cell, lookup_result->block);
+ }
+}
+
+static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block,
+ struct dm_bio_prison_cell *cell)
+{
+ int r;
+ dm_block_t data_block;
+ struct pool *pool = tc->pool;
+
+ /*
+ * Remap empty bios (flushes) immediately, without provisioning.
+ */
+ if (!bio->bi_iter.bi_size) {
+ inc_all_io_entry(pool, bio);
+ cell_defer_no_holder(tc, cell);
+
+ remap_and_issue(tc, bio, 0);
+ return;
+ }
+
+ /*
+ * Fill read bios with zeroes and complete them immediately.
+ */
+ if (bio_data_dir(bio) == READ) {
+ zero_fill_bio(bio);
+ cell_defer_no_holder(tc, cell);
+ bio_endio(bio, 0);
+ return;
+ }
+
+ r = alloc_data_block(tc, &data_block);
+ switch (r) {
+ case 0:
+ if (tc->origin_dev)
+ schedule_external_copy(tc, block, data_block, cell, bio);
+ else
+ schedule_zero(tc, block, data_block, cell, bio);
+ break;
+
+ case -ENOSPC:
+ retry_bios_on_resume(pool, cell);
+ break;
+
+ default:
+ DMERR_LIMIT("%s: alloc_data_block() failed: error = %d",
+ __func__, r);
+ cell_error(pool, cell);
+ break;
+ }
+}
+
+static void process_cell(struct thin_c *tc, struct dm_bio_prison_cell *cell)
+{
+ int r;
+ struct pool *pool = tc->pool;
+ struct bio *bio = cell->holder;
+ dm_block_t block = get_bio_block(tc, bio);
+ struct dm_thin_lookup_result lookup_result;
+
+ if (tc->requeue_mode) {
+ cell_requeue(pool, cell);
+ return;
+ }
+
+ r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
+ switch (r) {
+ case 0:
+ if (lookup_result.shared)
+ process_shared_bio(tc, bio, block, &lookup_result, cell);
+ else {
+ inc_all_io_entry(pool, bio);
+ remap_and_issue(tc, bio, lookup_result.block);
+ inc_remap_and_issue_cell(tc, cell, lookup_result.block);
+ }
+ break;
+
+ case -ENODATA:
+ if (bio_data_dir(bio) == READ && tc->origin_dev) {
+ inc_all_io_entry(pool, bio);
+ cell_defer_no_holder(tc, cell);
+
+ if (bio_end_sector(bio) <= tc->origin_size)
+ remap_to_origin_and_issue(tc, bio);
+
+ else if (bio->bi_iter.bi_sector < tc->origin_size) {
+ zero_fill_bio(bio);
+ bio->bi_iter.bi_size = (tc->origin_size - bio->bi_iter.bi_sector) << SECTOR_SHIFT;
+ remap_to_origin_and_issue(tc, bio);
+
+ } else {
+ zero_fill_bio(bio);
+ bio_endio(bio, 0);
+ }
+ } else
+ provision_block(tc, bio, block, cell);
+ break;
+
+ default:
+ DMERR_LIMIT("%s: dm_thin_find_block() failed: error = %d",
+ __func__, r);
+ cell_defer_no_holder(tc, cell);
+ bio_io_error(bio);
+ break;
+ }
+}
+
+static void process_bio(struct thin_c *tc, struct bio *bio)
+{
+ struct pool *pool = tc->pool;
+ dm_block_t block = get_bio_block(tc, bio);
+ struct dm_bio_prison_cell *cell;
+ struct dm_cell_key key;
+
+ /*
+ * If cell is already occupied, then the block is already
+ * being provisioned so we have nothing further to do here.
+ */
+ build_virtual_key(tc->td, block, &key);
+ if (bio_detain(pool, &key, bio, &cell))
+ return;
+
+ process_cell(tc, cell);
+}
+
+static void __process_bio_read_only(struct thin_c *tc, struct bio *bio,
+ struct dm_bio_prison_cell *cell)
+{
+ int r;
+ int rw = bio_data_dir(bio);
+ dm_block_t block = get_bio_block(tc, bio);
+ struct dm_thin_lookup_result lookup_result;
+
+ r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
+ switch (r) {
+ case 0:
+ if (lookup_result.shared && (rw == WRITE) && bio->bi_iter.bi_size) {
+ handle_unserviceable_bio(tc->pool, bio);
+ if (cell)
+ cell_defer_no_holder(tc, cell);
+ } else {
+ inc_all_io_entry(tc->pool, bio);
+ remap_and_issue(tc, bio, lookup_result.block);
+ if (cell)
+ inc_remap_and_issue_cell(tc, cell, lookup_result.block);
+ }
+ break;
+
+ case -ENODATA:
+ if (cell)
+ cell_defer_no_holder(tc, cell);
+ if (rw != READ) {
+ handle_unserviceable_bio(tc->pool, bio);
+ break;
+ }
+
+ if (tc->origin_dev) {
+ inc_all_io_entry(tc->pool, bio);
+ remap_to_origin_and_issue(tc, bio);
+ break;
+ }
+
+ zero_fill_bio(bio);
+ bio_endio(bio, 0);
+ break;
+
+ default:
+ DMERR_LIMIT("%s: dm_thin_find_block() failed: error = %d",
+ __func__, r);
+ if (cell)
+ cell_defer_no_holder(tc, cell);
+ bio_io_error(bio);
+ break;
+ }
+}
+
+static void process_bio_read_only(struct thin_c *tc, struct bio *bio)
+{
+ __process_bio_read_only(tc, bio, NULL);
+}
+
+static void process_cell_read_only(struct thin_c *tc, struct dm_bio_prison_cell *cell)
+{
+ __process_bio_read_only(tc, cell->holder, cell);
+}
+
+static void process_bio_success(struct thin_c *tc, struct bio *bio)
+{
+ bio_endio(bio, 0);
+}
+
+static void process_bio_fail(struct thin_c *tc, struct bio *bio)
+{
+ bio_io_error(bio);
+}
+
+static void process_cell_success(struct thin_c *tc, struct dm_bio_prison_cell *cell)
+{
+ cell_success(tc->pool, cell);
+}
+
+static void process_cell_fail(struct thin_c *tc, struct dm_bio_prison_cell *cell)
+{
+ cell_error(tc->pool, cell);
+}
+
+/*
+ * FIXME: should we also commit due to size of transaction, measured in
+ * metadata blocks?
+ */
+static int need_commit_due_to_time(struct pool *pool)
+{
+ return !time_in_range(jiffies, pool->last_commit_jiffies,
+ pool->last_commit_jiffies + COMMIT_PERIOD);
+}
+
+#define thin_pbd(node) rb_entry((node), struct dm_thin_endio_hook, rb_node)
+#define thin_bio(pbd) dm_bio_from_per_bio_data((pbd), sizeof(struct dm_thin_endio_hook))
+
+static void __thin_bio_rb_add(struct thin_c *tc, struct bio *bio)
+{
+ struct rb_node **rbp, *parent;
+ struct dm_thin_endio_hook *pbd;
+ sector_t bi_sector = bio->bi_iter.bi_sector;
+
+ rbp = &tc->sort_bio_list.rb_node;
+ parent = NULL;
+ while (*rbp) {
+ parent = *rbp;
+ pbd = thin_pbd(parent);
+
+ if (bi_sector < thin_bio(pbd)->bi_iter.bi_sector)
+ rbp = &(*rbp)->rb_left;
+ else
+ rbp = &(*rbp)->rb_right;
+ }
+
+ pbd = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
+ rb_link_node(&pbd->rb_node, parent, rbp);
+ rb_insert_color(&pbd->rb_node, &tc->sort_bio_list);
+}
+
+static void __extract_sorted_bios(struct thin_c *tc)
+{
+ struct rb_node *node;
+ struct dm_thin_endio_hook *pbd;
+ struct bio *bio;
+
+ for (node = rb_first(&tc->sort_bio_list); node; node = rb_next(node)) {
+ pbd = thin_pbd(node);
+ bio = thin_bio(pbd);
+
+ bio_list_add(&tc->deferred_bio_list, bio);
+ rb_erase(&pbd->rb_node, &tc->sort_bio_list);
+ }
+
+ WARN_ON(!RB_EMPTY_ROOT(&tc->sort_bio_list));
+}
+
+static void __sort_thin_deferred_bios(struct thin_c *tc)
+{
+ struct bio *bio;
+ struct bio_list bios;
+
+ bio_list_init(&bios);
+ bio_list_merge(&bios, &tc->deferred_bio_list);
+ bio_list_init(&tc->deferred_bio_list);
+
+ /* Sort deferred_bio_list using rb-tree */
+ while ((bio = bio_list_pop(&bios)))
+ __thin_bio_rb_add(tc, bio);
+
+ /*
+ * Transfer the sorted bios in sort_bio_list back to
+ * deferred_bio_list to allow lockless submission of
+ * all bios.
+ */
+ __extract_sorted_bios(tc);
+}
+
+static void process_thin_deferred_bios(struct thin_c *tc)
+{
+ struct pool *pool = tc->pool;
+ unsigned long flags;
+ struct bio *bio;
+ struct bio_list bios;
+ struct blk_plug plug;
+ unsigned count = 0;
+
+ if (tc->requeue_mode) {
+ error_thin_bio_list(tc, &tc->deferred_bio_list, DM_ENDIO_REQUEUE);
+ return;
+ }
+
+ bio_list_init(&bios);
+
+ spin_lock_irqsave(&tc->lock, flags);
+
+ if (bio_list_empty(&tc->deferred_bio_list)) {
+ spin_unlock_irqrestore(&tc->lock, flags);
+ return;
+ }
+
+ __sort_thin_deferred_bios(tc);
+
+ bio_list_merge(&bios, &tc->deferred_bio_list);
+ bio_list_init(&tc->deferred_bio_list);
+
+ spin_unlock_irqrestore(&tc->lock, flags);
+
+ blk_start_plug(&plug);
+ while ((bio = bio_list_pop(&bios))) {
+ /*
+ * If we've got no free new_mapping structs, and processing
+ * this bio might require one, we pause until there are some
+ * prepared mappings to process.
+ */
+ if (ensure_next_mapping(pool)) {
+ spin_lock_irqsave(&tc->lock, flags);
+ bio_list_add(&tc->deferred_bio_list, bio);
+ bio_list_merge(&tc->deferred_bio_list, &bios);
+ spin_unlock_irqrestore(&tc->lock, flags);
+ break;
+ }
+
+ if (bio->bi_rw & REQ_DISCARD)
+ pool->process_discard(tc, bio);
+ else
+ pool->process_bio(tc, bio);
+
+ if ((count++ & 127) == 0) {
+ throttle_work_update(&pool->throttle);
+ dm_pool_issue_prefetches(pool->pmd);
+ }
+ }
+ blk_finish_plug(&plug);
+}
+
+static int cmp_cells(const void *lhs, const void *rhs)
+{
+ struct dm_bio_prison_cell *lhs_cell = *((struct dm_bio_prison_cell **) lhs);
+ struct dm_bio_prison_cell *rhs_cell = *((struct dm_bio_prison_cell **) rhs);
+
+ BUG_ON(!lhs_cell->holder);
+ BUG_ON(!rhs_cell->holder);
+
+ if (lhs_cell->holder->bi_iter.bi_sector < rhs_cell->holder->bi_iter.bi_sector)
+ return -1;
+
+ if (lhs_cell->holder->bi_iter.bi_sector > rhs_cell->holder->bi_iter.bi_sector)
+ return 1;
+
+ return 0;
+}
+
+static unsigned sort_cells(struct pool *pool, struct list_head *cells)
+{
+ unsigned count = 0;
+ struct dm_bio_prison_cell *cell, *tmp;
+
+ list_for_each_entry_safe(cell, tmp, cells, user_list) {
+ if (count >= CELL_SORT_ARRAY_SIZE)
+ break;
+
+ pool->cell_sort_array[count++] = cell;
+ list_del(&cell->user_list);
+ }
+
+ sort(pool->cell_sort_array, count, sizeof(cell), cmp_cells, NULL);
+
+ return count;
+}
+
+static void process_thin_deferred_cells(struct thin_c *tc)
+{
+ struct pool *pool = tc->pool;
+ unsigned long flags;
+ struct list_head cells;
+ struct dm_bio_prison_cell *cell;
+ unsigned i, j, count;
+
+ INIT_LIST_HEAD(&cells);
+
+ spin_lock_irqsave(&tc->lock, flags);
+ list_splice_init(&tc->deferred_cells, &cells);
+ spin_unlock_irqrestore(&tc->lock, flags);
+
+ if (list_empty(&cells))
+ return;
+
+ do {
+ count = sort_cells(tc->pool, &cells);
+
+ for (i = 0; i < count; i++) {
+ cell = pool->cell_sort_array[i];
+ BUG_ON(!cell->holder);
+
+ /*
+ * If we've got no free new_mapping structs, and processing
+ * this bio might require one, we pause until there are some
+ * prepared mappings to process.
+ */
+ if (ensure_next_mapping(pool)) {
+ for (j = i; j < count; j++)
+ list_add(&pool->cell_sort_array[j]->user_list, &cells);
+
+ spin_lock_irqsave(&tc->lock, flags);
+ list_splice(&cells, &tc->deferred_cells);
+ spin_unlock_irqrestore(&tc->lock, flags);
+ return;
+ }
+
+ if (cell->holder->bi_rw & REQ_DISCARD)
+ pool->process_discard_cell(tc, cell);
+ else
+ pool->process_cell(tc, cell);
+ }
+ } while (!list_empty(&cells));
+}
+
+static void thin_get(struct thin_c *tc);
+static void thin_put(struct thin_c *tc);
+
+/*
+ * We can't hold rcu_read_lock() around code that can block. So we
+ * find a thin with the rcu lock held; bump a refcount; then drop
+ * the lock.
+ */
+static struct thin_c *get_first_thin(struct pool *pool)
+{
+ struct thin_c *tc = NULL;
+
+ rcu_read_lock();
+ if (!list_empty(&pool->active_thins)) {
+ tc = list_entry_rcu(pool->active_thins.next, struct thin_c, list);
+ thin_get(tc);
+ }
+ rcu_read_unlock();
+
+ return tc;
+}
+
+static struct thin_c *get_next_thin(struct pool *pool, struct thin_c *tc)
+{
+ struct thin_c *old_tc = tc;
+
+ rcu_read_lock();
+ list_for_each_entry_continue_rcu(tc, &pool->active_thins, list) {
+ thin_get(tc);
+ thin_put(old_tc);
+ rcu_read_unlock();
+ return tc;
+ }
+ thin_put(old_tc);
+ rcu_read_unlock();
+
+ return NULL;
+}
+
+static void process_deferred_bios(struct pool *pool)
+{
+ unsigned long flags;
+ struct bio *bio;
+ struct bio_list bios;
+ struct thin_c *tc;
+
+ tc = get_first_thin(pool);
+ while (tc) {
+ process_thin_deferred_cells(tc);
+ process_thin_deferred_bios(tc);
+ tc = get_next_thin(pool, tc);
+ }
+
+ /*
+ * If there are any deferred flush bios, we must commit
+ * the metadata before issuing them.
+ */
+ bio_list_init(&bios);
+ spin_lock_irqsave(&pool->lock, flags);
+ bio_list_merge(&bios, &pool->deferred_flush_bios);
+ bio_list_init(&pool->deferred_flush_bios);
+ spin_unlock_irqrestore(&pool->lock, flags);
+
+ if (bio_list_empty(&bios) &&
+ !(dm_pool_changed_this_transaction(pool->pmd) && need_commit_due_to_time(pool)))
+ return;
+
+ if (commit(pool)) {
+ while ((bio = bio_list_pop(&bios)))
+ bio_io_error(bio);
+ return;
+ }
+ pool->last_commit_jiffies = jiffies;
+
+ while ((bio = bio_list_pop(&bios)))
+ generic_make_request(bio);
+}
+
+static void do_worker(struct work_struct *ws)
+{
+ struct pool *pool = container_of(ws, struct pool, worker);
+
+ throttle_work_start(&pool->throttle);
+ dm_pool_issue_prefetches(pool->pmd);
+ throttle_work_update(&pool->throttle);
+ process_prepared(pool, &pool->prepared_mappings, &pool->process_prepared_mapping);
+ throttle_work_update(&pool->throttle);
+ process_prepared(pool, &pool->prepared_discards, &pool->process_prepared_discard);
+ throttle_work_update(&pool->throttle);
+ process_deferred_bios(pool);
+ throttle_work_complete(&pool->throttle);
+}
+
+/*
+ * We want to commit periodically so that not too much
+ * unwritten data builds up.
+ */
+static void do_waker(struct work_struct *ws)
+{
+ struct pool *pool = container_of(to_delayed_work(ws), struct pool, waker);
+ wake_worker(pool);
+ queue_delayed_work(pool->wq, &pool->waker, COMMIT_PERIOD);
+}
+
+/*
+ * We're holding onto IO to allow userland time to react. After the
+ * timeout either the pool will have been resized (and thus back in
+ * PM_WRITE mode), or we degrade to PM_READ_ONLY and start erroring IO.
+ */
+static void do_no_space_timeout(struct work_struct *ws)
+{
+ struct pool *pool = container_of(to_delayed_work(ws), struct pool,
+ no_space_timeout);
+
+ if (get_pool_mode(pool) == PM_OUT_OF_DATA_SPACE && !pool->pf.error_if_no_space)
+ set_pool_mode(pool, PM_READ_ONLY);
+}
+
+/*----------------------------------------------------------------*/
+
+struct pool_work {
+ struct work_struct worker;
+ struct completion complete;
+};
+
+static struct pool_work *to_pool_work(struct work_struct *ws)
+{
+ return container_of(ws, struct pool_work, worker);
+}
+
+static void pool_work_complete(struct pool_work *pw)
+{
+ complete(&pw->complete);
+}
+
+static void pool_work_wait(struct pool_work *pw, struct pool *pool,
+ void (*fn)(struct work_struct *))
+{
+ INIT_WORK_ONSTACK(&pw->worker, fn);
+ init_completion(&pw->complete);
+ queue_work(pool->wq, &pw->worker);
+ wait_for_completion(&pw->complete);
+}
+
+/*----------------------------------------------------------------*/
+
+struct noflush_work {
+ struct pool_work pw;
+ struct thin_c *tc;
+};
+
+static struct noflush_work *to_noflush(struct work_struct *ws)
+{
+ return container_of(to_pool_work(ws), struct noflush_work, pw);
+}
+
+static void do_noflush_start(struct work_struct *ws)
+{
+ struct noflush_work *w = to_noflush(ws);
+ w->tc->requeue_mode = true;
+ requeue_io(w->tc);
+ pool_work_complete(&w->pw);
+}
+
+static void do_noflush_stop(struct work_struct *ws)
+{
+ struct noflush_work *w = to_noflush(ws);
+ w->tc->requeue_mode = false;
+ pool_work_complete(&w->pw);
+}
+
+static void noflush_work(struct thin_c *tc, void (*fn)(struct work_struct *))
+{
+ struct noflush_work w;
+
+ w.tc = tc;
+ pool_work_wait(&w.pw, tc->pool, fn);
+}
+
+/*----------------------------------------------------------------*/
+
+static enum pool_mode get_pool_mode(struct pool *pool)
+{
+ return pool->pf.mode;
+}
+
+static void notify_of_pool_mode_change(struct pool *pool, const char *new_mode)
+{
+ dm_table_event(pool->ti->table);
+ DMINFO("%s: switching pool to %s mode",
+ dm_device_name(pool->pool_md), new_mode);
+}
+
+static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
+{
+ struct pool_c *pt = pool->ti->private;
+ bool needs_check = dm_pool_metadata_needs_check(pool->pmd);
+ enum pool_mode old_mode = get_pool_mode(pool);
+ unsigned long no_space_timeout = ACCESS_ONCE(no_space_timeout_secs) * HZ;
+
+ /*
+ * Never allow the pool to transition to PM_WRITE mode if user
+ * intervention is required to verify metadata and data consistency.
+ */
+ if (new_mode == PM_WRITE && needs_check) {
+ DMERR("%s: unable to switch pool to write mode until repaired.",
+ dm_device_name(pool->pool_md));
+ if (old_mode != new_mode)
+ new_mode = old_mode;
+ else
+ new_mode = PM_READ_ONLY;
+ }
+ /*
+ * If we were in PM_FAIL mode, rollback of metadata failed. We're
+ * not going to recover without a thin_repair. So we never let the
+ * pool move out of the old mode.
+ */
+ if (old_mode == PM_FAIL)
+ new_mode = old_mode;
+
+ switch (new_mode) {
+ case PM_FAIL:
+ if (old_mode != new_mode)
+ notify_of_pool_mode_change(pool, "failure");
+ dm_pool_metadata_read_only(pool->pmd);
+ pool->process_bio = process_bio_fail;
+ pool->process_discard = process_bio_fail;
+ pool->process_cell = process_cell_fail;
+ pool->process_discard_cell = process_cell_fail;
+ pool->process_prepared_mapping = process_prepared_mapping_fail;
+ pool->process_prepared_discard = process_prepared_discard_fail;
+
+ error_retry_list(pool);
+ break;
+
+ case PM_READ_ONLY:
+ if (old_mode != new_mode)
+ notify_of_pool_mode_change(pool, "read-only");
+ dm_pool_metadata_read_only(pool->pmd);
+ pool->process_bio = process_bio_read_only;
+ pool->process_discard = process_bio_success;
+ pool->process_cell = process_cell_read_only;
+ pool->process_discard_cell = process_cell_success;
+ pool->process_prepared_mapping = process_prepared_mapping_fail;
+ pool->process_prepared_discard = process_prepared_discard_passdown;
+
+ error_retry_list(pool);
+ break;
+
+ case PM_OUT_OF_DATA_SPACE:
+ /*
+ * Ideally we'd never hit this state; the low water mark
+ * would trigger userland to extend the pool before we
+ * completely run out of data space. However, many small
+ * IOs to unprovisioned space can consume data space at an
+ * alarming rate. Adjust your low water mark if you're
+ * frequently seeing this mode.
+ */
+ if (old_mode != new_mode)
+ notify_of_pool_mode_change(pool, "out-of-data-space");
+ pool->process_bio = process_bio_read_only;
+ pool->process_discard = process_discard_bio;
+ pool->process_cell = process_cell_read_only;
+ pool->process_discard_cell = process_discard_cell;
+ pool->process_prepared_mapping = process_prepared_mapping;
+ pool->process_prepared_discard = process_prepared_discard;
+
+ if (!pool->pf.error_if_no_space && no_space_timeout)
+ queue_delayed_work(pool->wq, &pool->no_space_timeout, no_space_timeout);
+ break;
+
+ case PM_WRITE:
+ if (old_mode != new_mode)
+ notify_of_pool_mode_change(pool, "write");
+ dm_pool_metadata_read_write(pool->pmd);
+ pool->process_bio = process_bio;
+ pool->process_discard = process_discard_bio;
+ pool->process_cell = process_cell;
+ pool->process_discard_cell = process_discard_cell;
+ pool->process_prepared_mapping = process_prepared_mapping;
+ pool->process_prepared_discard = process_prepared_discard;
+ break;
+ }
+
+ pool->pf.mode = new_mode;
+ /*
+ * The pool mode may have changed, sync it so bind_control_target()
+ * doesn't cause an unexpected mode transition on resume.
+ */
+ pt->adjusted_pf.mode = new_mode;
+}
+
+static void abort_transaction(struct pool *pool)
+{
+ const char *dev_name = dm_device_name(pool->pool_md);
+
+ DMERR_LIMIT("%s: aborting current metadata transaction", dev_name);
+ if (dm_pool_abort_metadata(pool->pmd)) {
+ DMERR("%s: failed to abort metadata transaction", dev_name);
+ set_pool_mode(pool, PM_FAIL);
+ }
+
+ if (dm_pool_metadata_set_needs_check(pool->pmd)) {
+ DMERR("%s: failed to set 'needs_check' flag in metadata", dev_name);
+ set_pool_mode(pool, PM_FAIL);
+ }
+}
+
+static void metadata_operation_failed(struct pool *pool, const char *op, int r)
+{
+ DMERR_LIMIT("%s: metadata operation '%s' failed: error = %d",
+ dm_device_name(pool->pool_md), op, r);
+
+ abort_transaction(pool);
+ set_pool_mode(pool, PM_READ_ONLY);
+}
+
+/*----------------------------------------------------------------*/
+
+/*
+ * Mapping functions.
+ */
+
+/*
+ * Called only while mapping a thin bio to hand it over to the workqueue.
+ */
+static void thin_defer_bio(struct thin_c *tc, struct bio *bio)
+{
+ unsigned long flags;
+ struct pool *pool = tc->pool;
+
+ spin_lock_irqsave(&tc->lock, flags);
+ bio_list_add(&tc->deferred_bio_list, bio);
+ spin_unlock_irqrestore(&tc->lock, flags);
+
+ wake_worker(pool);
+}
+
+static void thin_defer_bio_with_throttle(struct thin_c *tc, struct bio *bio)
+{
+ struct pool *pool = tc->pool;
+
+ throttle_lock(&pool->throttle);
+ thin_defer_bio(tc, bio);
+ throttle_unlock(&pool->throttle);
+}
+
+static void thin_defer_cell(struct thin_c *tc, struct dm_bio_prison_cell *cell)
+{
+ unsigned long flags;
+ struct pool *pool = tc->pool;
+
+ throttle_lock(&pool->throttle);
+ spin_lock_irqsave(&tc->lock, flags);
+ list_add_tail(&cell->user_list, &tc->deferred_cells);
+ spin_unlock_irqrestore(&tc->lock, flags);
+ throttle_unlock(&pool->throttle);
+
+ wake_worker(pool);
+}
+
+static void thin_hook_bio(struct thin_c *tc, struct bio *bio)
+{
+ struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
+
+ h->tc = tc;
+ h->shared_read_entry = NULL;
+ h->all_io_entry = NULL;
+ h->overwrite_mapping = NULL;
+}
+
+/*
+ * Non-blocking function called from the thin target's map function.
+ */
+static int thin_bio_map(struct dm_target *ti, struct bio *bio)
+{
+ int r;
+ struct thin_c *tc = ti->private;
+ dm_block_t block = get_bio_block(tc, bio);
+ struct dm_thin_device *td = tc->td;
+ struct dm_thin_lookup_result result;
+ struct dm_bio_prison_cell *virt_cell, *data_cell;
+ struct dm_cell_key key;
+
+ thin_hook_bio(tc, bio);
+
+ if (tc->requeue_mode) {
+ bio_endio(bio, DM_ENDIO_REQUEUE);
+ return DM_MAPIO_SUBMITTED;
+ }
+
+ if (get_pool_mode(tc->pool) == PM_FAIL) {
+ bio_io_error(bio);
+ return DM_MAPIO_SUBMITTED;
+ }
+
+ if (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA)) {
+ thin_defer_bio_with_throttle(tc, bio);
+ return DM_MAPIO_SUBMITTED;
+ }
+
+ /*
+ * We must hold the virtual cell before doing the lookup, otherwise
+ * there's a race with discard.
+ */
+ build_virtual_key(tc->td, block, &key);
+ if (bio_detain(tc->pool, &key, bio, &virt_cell))
+ return DM_MAPIO_SUBMITTED;
+
+ r = dm_thin_find_block(td, block, 0, &result);
+
+ /*
+ * Note that we defer readahead too.
+ */
+ switch (r) {
+ case 0:
+ if (unlikely(result.shared)) {
+ /*
+ * We have a race condition here between the
+ * result.shared value returned by the lookup and
+ * snapshot creation, which may cause new
+ * sharing.
+ *
+ * To avoid this always quiesce the origin before
+ * taking the snap. You want to do this anyway to
+ * ensure a consistent application view
+ * (i.e. lockfs).
+ *
+ * More distant ancestors are irrelevant. The
+ * shared flag will be set in their case.
+ */
+ thin_defer_cell(tc, virt_cell);
+ return DM_MAPIO_SUBMITTED;
+ }
+
+ build_data_key(tc->td, result.block, &key);
+ if (bio_detain(tc->pool, &key, bio, &data_cell)) {
+ cell_defer_no_holder(tc, virt_cell);
+ return DM_MAPIO_SUBMITTED;
+ }
+
+ inc_all_io_entry(tc->pool, bio);
+ cell_defer_no_holder(tc, data_cell);
+ cell_defer_no_holder(tc, virt_cell);
+
+ remap(tc, bio, result.block);
+ return DM_MAPIO_REMAPPED;
+
+ case -ENODATA:
+ case -EWOULDBLOCK:
+ thin_defer_cell(tc, virt_cell);
+ return DM_MAPIO_SUBMITTED;
+
+ default:
+ /*
+ * Must always call bio_io_error on failure.
+ * dm_thin_find_block can fail with -EINVAL if the
+ * pool is switched to fail-io mode.
+ */
+ bio_io_error(bio);
+ cell_defer_no_holder(tc, virt_cell);
+ return DM_MAPIO_SUBMITTED;
+ }
+}
+
+static int pool_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
+{
+ struct pool_c *pt = container_of(cb, struct pool_c, callbacks);
+ struct request_queue *q;
+
+ if (get_pool_mode(pt->pool) == PM_OUT_OF_DATA_SPACE)
+ return 1;
+
+ q = bdev_get_queue(pt->data_dev->bdev);
+ return bdi_congested(&q->backing_dev_info, bdi_bits);
+}
+
+static void requeue_bios(struct pool *pool)
+{
+ unsigned long flags;
+ struct thin_c *tc;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(tc, &pool->active_thins, list) {
+ spin_lock_irqsave(&tc->lock, flags);
+ bio_list_merge(&tc->deferred_bio_list, &tc->retry_on_resume_list);
+ bio_list_init(&tc->retry_on_resume_list);
+ spin_unlock_irqrestore(&tc->lock, flags);
+ }
+ rcu_read_unlock();
+}
+
+/*----------------------------------------------------------------
+ * Binding of control targets to a pool object
+ *--------------------------------------------------------------*/
+static bool data_dev_supports_discard(struct pool_c *pt)
+{
+ struct request_queue *q = bdev_get_queue(pt->data_dev->bdev);
+
+ return q && blk_queue_discard(q);
+}
+
+static bool is_factor(sector_t block_size, uint32_t n)
+{
+ return !sector_div(block_size, n);
+}
+
+/*
+ * If discard_passdown was enabled verify that the data device
+ * supports discards. Disable discard_passdown if not.
+ */
+static void disable_passdown_if_not_supported(struct pool_c *pt)
+{
+ struct pool *pool = pt->pool;
+ struct block_device *data_bdev = pt->data_dev->bdev;
+ struct queue_limits *data_limits = &bdev_get_queue(data_bdev)->limits;
+ sector_t block_size = pool->sectors_per_block << SECTOR_SHIFT;
+ const char *reason = NULL;
+ char buf[BDEVNAME_SIZE];
+
+ if (!pt->adjusted_pf.discard_passdown)
+ return;
+
+ if (!data_dev_supports_discard(pt))
+ reason = "discard unsupported";
+
+ else if (data_limits->max_discard_sectors < pool->sectors_per_block)
+ reason = "max discard sectors smaller than a block";
+
+ else if (data_limits->discard_granularity > block_size)
+ reason = "discard granularity larger than a block";
+
+ else if (!is_factor(block_size, data_limits->discard_granularity))
+ reason = "discard granularity not a factor of block size";
+
+ if (reason) {
+ DMWARN("Data device (%s) %s: Disabling discard passdown.", bdevname(data_bdev, buf), reason);
+ pt->adjusted_pf.discard_passdown = false;
+ }
+}
+
+static int bind_control_target(struct pool *pool, struct dm_target *ti)
+{
+ struct pool_c *pt = ti->private;
+
+ /*
+ * We want to make sure that a pool in PM_FAIL mode is never upgraded.
+ */
+ enum pool_mode old_mode = get_pool_mode(pool);
+ enum pool_mode new_mode = pt->adjusted_pf.mode;
+
+ /*
+ * Don't change the pool's mode until set_pool_mode() below.
+ * Otherwise the pool's process_* function pointers may
+ * not match the desired pool mode.
+ */
+ pt->adjusted_pf.mode = old_mode;
+
+ pool->ti = ti;
+ pool->pf = pt->adjusted_pf;
+ pool->low_water_blocks = pt->low_water_blocks;
+
+ set_pool_mode(pool, new_mode);
+
+ return 0;
+}
+
+static void unbind_control_target(struct pool *pool, struct dm_target *ti)
+{
+ if (pool->ti == ti)
+ pool->ti = NULL;
+}
+
+/*----------------------------------------------------------------
+ * Pool creation
+ *--------------------------------------------------------------*/
+/* Initialize pool features. */
+static void pool_features_init(struct pool_features *pf)
+{
+ pf->mode = PM_WRITE;
+ pf->zero_new_blocks = true;
+ pf->discard_enabled = true;
+ pf->discard_passdown = true;
+ pf->error_if_no_space = false;
+}
+
+static void __pool_destroy(struct pool *pool)
+{
+ __pool_table_remove(pool);
+
+ vfree(pool->cell_sort_array);
+ if (dm_pool_metadata_close(pool->pmd) < 0)
+ DMWARN("%s: dm_pool_metadata_close() failed.", __func__);
+
+ dm_bio_prison_destroy(pool->prison);
+ dm_kcopyd_client_destroy(pool->copier);
+
+ if (pool->wq)
+ destroy_workqueue(pool->wq);
+
+ if (pool->next_mapping)
+ mempool_free(pool->next_mapping, pool->mapping_pool);
+ mempool_destroy(pool->mapping_pool);
+ dm_deferred_set_destroy(pool->shared_read_ds);
+ dm_deferred_set_destroy(pool->all_io_ds);
+ kfree(pool);
+}
+
+static struct kmem_cache *_new_mapping_cache;
+
+static struct pool *pool_create(struct mapped_device *pool_md,
+ struct block_device *metadata_dev,
+ unsigned long block_size,
+ int read_only, char **error)
+{
+ int r;
+ void *err_p;
+ struct pool *pool;
+ struct dm_pool_metadata *pmd;
+ bool format_device = read_only ? false : true;
+
+ pmd = dm_pool_metadata_open(metadata_dev, block_size, format_device);
+ if (IS_ERR(pmd)) {
+ *error = "Error creating metadata object";
+ return (struct pool *)pmd;
+ }
+
+ pool = kmalloc(sizeof(*pool), GFP_KERNEL);
+ if (!pool) {
+ *error = "Error allocating memory for pool";
+ err_p = ERR_PTR(-ENOMEM);
+ goto bad_pool;
+ }
+
+ pool->pmd = pmd;
+ pool->sectors_per_block = block_size;
+ if (block_size & (block_size - 1))
+ pool->sectors_per_block_shift = -1;
+ else
+ pool->sectors_per_block_shift = __ffs(block_size);
+ pool->low_water_blocks = 0;
+ pool_features_init(&pool->pf);
+ pool->prison = dm_bio_prison_create();
+ if (!pool->prison) {
+ *error = "Error creating pool's bio prison";
+ err_p = ERR_PTR(-ENOMEM);
+ goto bad_prison;
+ }
+
+ pool->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle);
+ if (IS_ERR(pool->copier)) {
+ r = PTR_ERR(pool->copier);
+ *error = "Error creating pool's kcopyd client";
+ err_p = ERR_PTR(r);
+ goto bad_kcopyd_client;
+ }
+
+ /*
+ * Create singlethreaded workqueue that will service all devices
+ * that use this metadata.
+ */
+ pool->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM);
+ if (!pool->wq) {
+ *error = "Error creating pool's workqueue";
+ err_p = ERR_PTR(-ENOMEM);
+ goto bad_wq;
+ }
+
+ throttle_init(&pool->throttle);
+ INIT_WORK(&pool->worker, do_worker);
+ INIT_DELAYED_WORK(&pool->waker, do_waker);
+ INIT_DELAYED_WORK(&pool->no_space_timeout, do_no_space_timeout);
+ spin_lock_init(&pool->lock);
+ bio_list_init(&pool->deferred_flush_bios);
+ INIT_LIST_HEAD(&pool->prepared_mappings);
+ INIT_LIST_HEAD(&pool->prepared_discards);
+ INIT_LIST_HEAD(&pool->active_thins);
+ pool->low_water_triggered = false;
+ pool->suspended = true;
+
+ pool->shared_read_ds = dm_deferred_set_create();
+ if (!pool->shared_read_ds) {
+ *error = "Error creating pool's shared read deferred set";
+ err_p = ERR_PTR(-ENOMEM);
+ goto bad_shared_read_ds;
+ }
+
+ pool->all_io_ds = dm_deferred_set_create();
+ if (!pool->all_io_ds) {
+ *error = "Error creating pool's all io deferred set";
+ err_p = ERR_PTR(-ENOMEM);
+ goto bad_all_io_ds;
+ }
+
+ pool->next_mapping = NULL;
+ pool->mapping_pool = mempool_create_slab_pool(MAPPING_POOL_SIZE,
+ _new_mapping_cache);
+ if (!pool->mapping_pool) {
+ *error = "Error creating pool's mapping mempool";
+ err_p = ERR_PTR(-ENOMEM);
+ goto bad_mapping_pool;
+ }
+
+ pool->cell_sort_array = vmalloc(sizeof(*pool->cell_sort_array) * CELL_SORT_ARRAY_SIZE);
+ if (!pool->cell_sort_array) {
+ *error = "Error allocating cell sort array";
+ err_p = ERR_PTR(-ENOMEM);
+ goto bad_sort_array;
+ }
+
+ pool->ref_count = 1;
+ pool->last_commit_jiffies = jiffies;
+ pool->pool_md = pool_md;
+ pool->md_dev = metadata_dev;
+ __pool_table_insert(pool);
+
+ return pool;
+
+bad_sort_array:
+ mempool_destroy(pool->mapping_pool);
+bad_mapping_pool:
+ dm_deferred_set_destroy(pool->all_io_ds);
+bad_all_io_ds:
+ dm_deferred_set_destroy(pool->shared_read_ds);
+bad_shared_read_ds:
+ destroy_workqueue(pool->wq);
+bad_wq:
+ dm_kcopyd_client_destroy(pool->copier);
+bad_kcopyd_client:
+ dm_bio_prison_destroy(pool->prison);
+bad_prison:
+ kfree(pool);
+bad_pool:
+ if (dm_pool_metadata_close(pmd))
+ DMWARN("%s: dm_pool_metadata_close() failed.", __func__);
+
+ return err_p;
+}
+
+static void __pool_inc(struct pool *pool)
+{
+ BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
+ pool->ref_count++;
+}
+
+static void __pool_dec(struct pool *pool)
+{
+ BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
+ BUG_ON(!pool->ref_count);
+ if (!--pool->ref_count)
+ __pool_destroy(pool);
+}
+
+static struct pool *__pool_find(struct mapped_device *pool_md,
+ struct block_device *metadata_dev,
+ unsigned long block_size, int read_only,
+ char **error, int *created)
+{
+ struct pool *pool = __pool_table_lookup_metadata_dev(metadata_dev);
+
+ if (pool) {
+ if (pool->pool_md != pool_md) {
+ *error = "metadata device already in use by a pool";
+ return ERR_PTR(-EBUSY);
+ }
+ __pool_inc(pool);
+
+ } else {
+ pool = __pool_table_lookup(pool_md);
+ if (pool) {
+ if (pool->md_dev != metadata_dev) {
+ *error = "different pool cannot replace a pool";
+ return ERR_PTR(-EINVAL);
+ }
+ __pool_inc(pool);
+
+ } else {
+ pool = pool_create(pool_md, metadata_dev, block_size, read_only, error);
+ *created = 1;
+ }
+ }
+
+ return pool;
+}
+
+/*----------------------------------------------------------------
+ * Pool target methods
+ *--------------------------------------------------------------*/
+static void pool_dtr(struct dm_target *ti)
+{
+ struct pool_c *pt = ti->private;
+
+ mutex_lock(&dm_thin_pool_table.mutex);
+
+ unbind_control_target(pt->pool, ti);
+ __pool_dec(pt->pool);
+ dm_put_device(ti, pt->metadata_dev);
+ dm_put_device(ti, pt->data_dev);
+ kfree(pt);
+
+ mutex_unlock(&dm_thin_pool_table.mutex);
+}
+
+static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf,
+ struct dm_target *ti)
+{
+ int r;
+ unsigned argc;
+ const char *arg_name;
+
+ static struct dm_arg _args[] = {
+ {0, 4, "Invalid number of pool feature arguments"},
+ };
+
+ /*
+ * No feature arguments supplied.
+ */
+ if (!as->argc)
+ return 0;
+
+ r = dm_read_arg_group(_args, as, &argc, &ti->error);
+ if (r)
+ return -EINVAL;
+
+ while (argc && !r) {
+ arg_name = dm_shift_arg(as);
+ argc--;
+
+ if (!strcasecmp(arg_name, "skip_block_zeroing"))
+ pf->zero_new_blocks = false;
+
+ else if (!strcasecmp(arg_name, "ignore_discard"))
+ pf->discard_enabled = false;
+
+ else if (!strcasecmp(arg_name, "no_discard_passdown"))
+ pf->discard_passdown = false;
+
+ else if (!strcasecmp(arg_name, "read_only"))
+ pf->mode = PM_READ_ONLY;
+
+ else if (!strcasecmp(arg_name, "error_if_no_space"))
+ pf->error_if_no_space = true;
+
+ else {
+ ti->error = "Unrecognised pool feature requested";
+ r = -EINVAL;
+ break;
+ }
+ }
+
+ return r;
+}
+
+static void metadata_low_callback(void *context)
+{
+ struct pool *pool = context;
+
+ DMWARN("%s: reached low water mark for metadata device: sending event.",
+ dm_device_name(pool->pool_md));
+
+ dm_table_event(pool->ti->table);
+}
+
+static sector_t get_dev_size(struct block_device *bdev)
+{
+ return i_size_read(bdev->bd_inode) >> SECTOR_SHIFT;
+}
+
+static void warn_if_metadata_device_too_big(struct block_device *bdev)
+{
+ sector_t metadata_dev_size = get_dev_size(bdev);
+ char buffer[BDEVNAME_SIZE];
+
+ if (metadata_dev_size > THIN_METADATA_MAX_SECTORS_WARNING)
+ DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.",
+ bdevname(bdev, buffer), THIN_METADATA_MAX_SECTORS);
+}
+
+static sector_t get_metadata_dev_size(struct block_device *bdev)
+{
+ sector_t metadata_dev_size = get_dev_size(bdev);
+
+ if (metadata_dev_size > THIN_METADATA_MAX_SECTORS)
+ metadata_dev_size = THIN_METADATA_MAX_SECTORS;
+
+ return metadata_dev_size;
+}
+
+static dm_block_t get_metadata_dev_size_in_blocks(struct block_device *bdev)
+{
+ sector_t metadata_dev_size = get_metadata_dev_size(bdev);
+
+ sector_div(metadata_dev_size, THIN_METADATA_BLOCK_SIZE);
+
+ return metadata_dev_size;
+}
+
+/*
+ * When a metadata threshold is crossed a dm event is triggered, and
+ * userland should respond by growing the metadata device. We could let
+ * userland set the threshold, like we do with the data threshold, but I'm
+ * not sure they know enough to do this well.
+ */
+static dm_block_t calc_metadata_threshold(struct pool_c *pt)
+{
+ /*
+ * 4M is ample for all ops with the possible exception of thin
+ * device deletion which is harmless if it fails (just retry the
+ * delete after you've grown the device).
+ */
+ dm_block_t quarter = get_metadata_dev_size_in_blocks(pt->metadata_dev->bdev) / 4;
+ return min((dm_block_t)1024ULL /* 4M */, quarter);
+}
+
+/*
+ * thin-pool <metadata dev> <data dev>
+ * <data block size (sectors)>
+ * <low water mark (blocks)>
+ * [<#feature args> [<arg>]*]
+ *
+ * Optional feature arguments are:
+ * skip_block_zeroing: skips the zeroing of newly-provisioned blocks.
+ * ignore_discard: disable discard
+ * no_discard_passdown: don't pass discards down to the data device
+ * read_only: Don't allow any changes to be made to the pool metadata.
+ * error_if_no_space: error IOs, instead of queueing, if no space.
+ */
+static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
+{
+ int r, pool_created = 0;
+ struct pool_c *pt;
+ struct pool *pool;
+ struct pool_features pf;
+ struct dm_arg_set as;
+ struct dm_dev *data_dev;
+ unsigned long block_size;
+ dm_block_t low_water_blocks;
+ struct dm_dev *metadata_dev;
+ fmode_t metadata_mode;
+
+ /*
+ * FIXME Remove validation from scope of lock.
+ */
+ mutex_lock(&dm_thin_pool_table.mutex);
+
+ if (argc < 4) {
+ ti->error = "Invalid argument count";
+ r = -EINVAL;
+ goto out_unlock;
+ }
+
+ as.argc = argc;
+ as.argv = argv;
+
+ /*
+ * Set default pool features.
+ */
+ pool_features_init(&pf);
+
+ dm_consume_args(&as, 4);
+ r = parse_pool_features(&as, &pf, ti);
+ if (r)
+ goto out_unlock;
+
+ metadata_mode = FMODE_READ | ((pf.mode == PM_READ_ONLY) ? 0 : FMODE_WRITE);
+ r = dm_get_device(ti, argv[0], metadata_mode, &metadata_dev);
+ if (r) {
+ ti->error = "Error opening metadata block device";
+ goto out_unlock;
+ }
+ warn_if_metadata_device_too_big(metadata_dev->bdev);
+
+ r = dm_get_device(ti, argv[1], FMODE_READ | FMODE_WRITE, &data_dev);
+ if (r) {
+ ti->error = "Error getting data device";
+ goto out_metadata;
+ }
+
+ if (kstrtoul(argv[2], 10, &block_size) || !block_size ||
+ block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS ||
+ block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS ||
+ block_size & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) {
+ ti->error = "Invalid block size";
+ r = -EINVAL;
+ goto out;
+ }
+
+ if (kstrtoull(argv[3], 10, (unsigned long long *)&low_water_blocks)) {
+ ti->error = "Invalid low water mark";
+ r = -EINVAL;
+ goto out;
+ }
+
+ pt = kzalloc(sizeof(*pt), GFP_KERNEL);
+ if (!pt) {
+ r = -ENOMEM;
+ goto out;
+ }
+
+ pool = __pool_find(dm_table_get_md(ti->table), metadata_dev->bdev,
+ block_size, pf.mode == PM_READ_ONLY, &ti->error, &pool_created);
+ if (IS_ERR(pool)) {
+ r = PTR_ERR(pool);
+ goto out_free_pt;
+ }
+
+ /*
+ * 'pool_created' reflects whether this is the first table load.
+ * Top level discard support is not allowed to be changed after
+ * initial load. This would require a pool reload to trigger thin
+ * device changes.
+ */
+ if (!pool_created && pf.discard_enabled != pool->pf.discard_enabled) {
+ ti->error = "Discard support cannot be disabled once enabled";
+ r = -EINVAL;
+ goto out_flags_changed;
+ }
+
+ pt->pool = pool;
+ pt->ti = ti;
+ pt->metadata_dev = metadata_dev;
+ pt->data_dev = data_dev;
+ pt->low_water_blocks = low_water_blocks;
+ pt->adjusted_pf = pt->requested_pf = pf;
+ ti->num_flush_bios = 1;
+
+ /*
+ * Only need to enable discards if the pool should pass
+ * them down to the data device. The thin device's discard
+ * processing will cause mappings to be removed from the btree.
+ */
+ ti->discard_zeroes_data_unsupported = true;
+ if (pf.discard_enabled && pf.discard_passdown) {
+ ti->num_discard_bios = 1;
+
+ /*
+ * Setting 'discards_supported' circumvents the normal
+ * stacking of discard limits (this keeps the pool and
+ * thin devices' discard limits consistent).
+ */
+ ti->discards_supported = true;
+ }
+ ti->private = pt;
+
+ r = dm_pool_register_metadata_threshold(pt->pool->pmd,
+ calc_metadata_threshold(pt),
+ metadata_low_callback,
+ pool);
+ if (r)
+ goto out_free_pt;
+
+ pt->callbacks.congested_fn = pool_is_congested;
+ dm_table_add_target_callbacks(ti->table, &pt->callbacks);
+
+ mutex_unlock(&dm_thin_pool_table.mutex);
+
+ return 0;
+
+out_flags_changed:
+ __pool_dec(pool);
+out_free_pt:
+ kfree(pt);
+out:
+ dm_put_device(ti, data_dev);
+out_metadata:
+ dm_put_device(ti, metadata_dev);
+out_unlock:
+ mutex_unlock(&dm_thin_pool_table.mutex);
+
+ return r;
+}
+
+static int pool_map(struct dm_target *ti, struct bio *bio)
+{
+ int r;
+ struct pool_c *pt = ti->private;
+ struct pool *pool = pt->pool;
+ unsigned long flags;
+
+ /*
+ * As this is a singleton target, ti->begin is always zero.
+ */
+ spin_lock_irqsave(&pool->lock, flags);
+ bio->bi_bdev = pt->data_dev->bdev;
+ r = DM_MAPIO_REMAPPED;
+ spin_unlock_irqrestore(&pool->lock, flags);
+
+ return r;
+}
+
+static int maybe_resize_data_dev(struct dm_target *ti, bool *need_commit)
+{
+ int r;
+ struct pool_c *pt = ti->private;
+ struct pool *pool = pt->pool;
+ sector_t data_size = ti->len;
+ dm_block_t sb_data_size;
+
+ *need_commit = false;
+
+ (void) sector_div(data_size, pool->sectors_per_block);
+
+ r = dm_pool_get_data_dev_size(pool->pmd, &sb_data_size);
+ if (r) {
+ DMERR("%s: failed to retrieve data device size",
+ dm_device_name(pool->pool_md));
+ return r;
+ }
+
+ if (data_size < sb_data_size) {
+ DMERR("%s: pool target (%llu blocks) too small: expected %llu",
+ dm_device_name(pool->pool_md),
+ (unsigned long long)data_size, sb_data_size);
+ return -EINVAL;
+
+ } else if (data_size > sb_data_size) {
+ if (dm_pool_metadata_needs_check(pool->pmd)) {
+ DMERR("%s: unable to grow the data device until repaired.",
+ dm_device_name(pool->pool_md));
+ return 0;
+ }
+
+ if (sb_data_size)
+ DMINFO("%s: growing the data device from %llu to %llu blocks",
+ dm_device_name(pool->pool_md),
+ sb_data_size, (unsigned long long)data_size);
+ r = dm_pool_resize_data_dev(pool->pmd, data_size);
+ if (r) {
+ metadata_operation_failed(pool, "dm_pool_resize_data_dev", r);
+ return r;
+ }
+
+ *need_commit = true;
+ }
+
+ return 0;
+}
+
+static int maybe_resize_metadata_dev(struct dm_target *ti, bool *need_commit)
+{
+ int r;
+ struct pool_c *pt = ti->private;
+ struct pool *pool = pt->pool;
+ dm_block_t metadata_dev_size, sb_metadata_dev_size;
+
+ *need_commit = false;
+
+ metadata_dev_size = get_metadata_dev_size_in_blocks(pool->md_dev);
+
+ r = dm_pool_get_metadata_dev_size(pool->pmd, &sb_metadata_dev_size);
+ if (r) {
+ DMERR("%s: failed to retrieve metadata device size",
+ dm_device_name(pool->pool_md));
+ return r;
+ }
+
+ if (metadata_dev_size < sb_metadata_dev_size) {
+ DMERR("%s: metadata device (%llu blocks) too small: expected %llu",
+ dm_device_name(pool->pool_md),
+ metadata_dev_size, sb_metadata_dev_size);
+ return -EINVAL;
+
+ } else if (metadata_dev_size > sb_metadata_dev_size) {
+ if (dm_pool_metadata_needs_check(pool->pmd)) {
+ DMERR("%s: unable to grow the metadata device until repaired.",
+ dm_device_name(pool->pool_md));
+ return 0;
+ }
+
+ warn_if_metadata_device_too_big(pool->md_dev);
+ DMINFO("%s: growing the metadata device from %llu to %llu blocks",
+ dm_device_name(pool->pool_md),
+ sb_metadata_dev_size, metadata_dev_size);
+ r = dm_pool_resize_metadata_dev(pool->pmd, metadata_dev_size);
+ if (r) {
+ metadata_operation_failed(pool, "dm_pool_resize_metadata_dev", r);
+ return r;
+ }
+
+ *need_commit = true;
+ }
+
+ return 0;
+}
+
+/*
+ * Retrieves the number of blocks of the data device from
+ * the superblock and compares it to the actual device size,
+ * thus resizing the data device in case it has grown.
+ *
+ * This both copes with opening preallocated data devices in the ctr
+ * being followed by a resume
+ * -and-
+ * calling the resume method individually after userspace has
+ * grown the data device in reaction to a table event.
+ */
+static int pool_preresume(struct dm_target *ti)
+{
+ int r;
+ bool need_commit1, need_commit2;
+ struct pool_c *pt = ti->private;
+ struct pool *pool = pt->pool;
+
+ /*
+ * Take control of the pool object.
+ */
+ r = bind_control_target(pool, ti);
+ if (r)
+ return r;
+
+ r = maybe_resize_data_dev(ti, &need_commit1);
+ if (r)
+ return r;
+
+ r = maybe_resize_metadata_dev(ti, &need_commit2);
+ if (r)
+ return r;
+
+ if (need_commit1 || need_commit2)
+ (void) commit(pool);
+
+ return 0;
+}
+
+static void pool_suspend_active_thins(struct pool *pool)
+{
+ struct thin_c *tc;
+
+ /* Suspend all active thin devices */
+ tc = get_first_thin(pool);
+ while (tc) {
+ dm_internal_suspend_noflush(tc->thin_md);
+ tc = get_next_thin(pool, tc);
+ }
+}
+
+static void pool_resume_active_thins(struct pool *pool)
+{
+ struct thin_c *tc;
+
+ /* Resume all active thin devices */
+ tc = get_first_thin(pool);
+ while (tc) {
+ dm_internal_resume(tc->thin_md);
+ tc = get_next_thin(pool, tc);
+ }
+}
+
+static void pool_resume(struct dm_target *ti)
+{
+ struct pool_c *pt = ti->private;
+ struct pool *pool = pt->pool;
+ unsigned long flags;
+
+ /*
+ * Must requeue active_thins' bios and then resume
+ * active_thins _before_ clearing 'suspend' flag.
+ */
+ requeue_bios(pool);
+ pool_resume_active_thins(pool);
+
+ spin_lock_irqsave(&pool->lock, flags);
+ pool->low_water_triggered = false;
+ pool->suspended = false;
+ spin_unlock_irqrestore(&pool->lock, flags);
+
+ do_waker(&pool->waker.work);
+}
+
+static void pool_presuspend(struct dm_target *ti)
+{
+ struct pool_c *pt = ti->private;
+ struct pool *pool = pt->pool;
+ unsigned long flags;
+
+ spin_lock_irqsave(&pool->lock, flags);
+ pool->suspended = true;
+ spin_unlock_irqrestore(&pool->lock, flags);
+
+ pool_suspend_active_thins(pool);
+}
+
+static void pool_presuspend_undo(struct dm_target *ti)
+{
+ struct pool_c *pt = ti->private;
+ struct pool *pool = pt->pool;
+ unsigned long flags;
+
+ pool_resume_active_thins(pool);
+
+ spin_lock_irqsave(&pool->lock, flags);
+ pool->suspended = false;
+ spin_unlock_irqrestore(&pool->lock, flags);
+}
+
+static void pool_postsuspend(struct dm_target *ti)
+{
+ struct pool_c *pt = ti->private;
+ struct pool *pool = pt->pool;
+
+ cancel_delayed_work(&pool->waker);
+ cancel_delayed_work(&pool->no_space_timeout);
+ flush_workqueue(pool->wq);
+ (void) commit(pool);
+}
+
+static int check_arg_count(unsigned argc, unsigned args_required)
+{
+ if (argc != args_required) {
+ DMWARN("Message received with %u arguments instead of %u.",
+ argc, args_required);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int read_dev_id(char *arg, dm_thin_id *dev_id, int warning)
+{
+ if (!kstrtoull(arg, 10, (unsigned long long *)dev_id) &&
+ *dev_id <= MAX_DEV_ID)
+ return 0;
+
+ if (warning)
+ DMWARN("Message received with invalid device id: %s", arg);
+
+ return -EINVAL;
+}
+
+static int process_create_thin_mesg(unsigned argc, char **argv, struct pool *pool)
+{
+ dm_thin_id dev_id;
+ int r;
+
+ r = check_arg_count(argc, 2);
+ if (r)
+ return r;
+
+ r = read_dev_id(argv[1], &dev_id, 1);
+ if (r)
+ return r;
+
+ r = dm_pool_create_thin(pool->pmd, dev_id);
+ if (r) {
+ DMWARN("Creation of new thinly-provisioned device with id %s failed.",
+ argv[1]);
+ return r;
+ }
+
+ return 0;
+}
+
+static int process_create_snap_mesg(unsigned argc, char **argv, struct pool *pool)
+{
+ dm_thin_id dev_id;
+ dm_thin_id origin_dev_id;
+ int r;
+
+ r = check_arg_count(argc, 3);
+ if (r)
+ return r;
+
+ r = read_dev_id(argv[1], &dev_id, 1);
+ if (r)
+ return r;
+
+ r = read_dev_id(argv[2], &origin_dev_id, 1);
+ if (r)
+ return r;
+
+ r = dm_pool_create_snap(pool->pmd, dev_id, origin_dev_id);
+ if (r) {
+ DMWARN("Creation of new snapshot %s of device %s failed.",
+ argv[1], argv[2]);
+ return r;
+ }
+
+ return 0;
+}
+
+static int process_delete_mesg(unsigned argc, char **argv, struct pool *pool)
+{
+ dm_thin_id dev_id;
+ int r;
+
+ r = check_arg_count(argc, 2);
+ if (r)
+ return r;
+
+ r = read_dev_id(argv[1], &dev_id, 1);
+ if (r)
+ return r;
+
+ r = dm_pool_delete_thin_device(pool->pmd, dev_id);
+ if (r)
+ DMWARN("Deletion of thin device %s failed.", argv[1]);
+
+ return r;
+}
+
+static int process_set_transaction_id_mesg(unsigned argc, char **argv, struct pool *pool)
+{
+ dm_thin_id old_id, new_id;
+ int r;
+
+ r = check_arg_count(argc, 3);
+ if (r)
+ return r;
+
+ if (kstrtoull(argv[1], 10, (unsigned long long *)&old_id)) {
+ DMWARN("set_transaction_id message: Unrecognised id %s.", argv[1]);
+ return -EINVAL;
+ }
+
+ if (kstrtoull(argv[2], 10, (unsigned long long *)&new_id)) {
+ DMWARN("set_transaction_id message: Unrecognised new id %s.", argv[2]);
+ return -EINVAL;
+ }
+
+ r = dm_pool_set_metadata_transaction_id(pool->pmd, old_id, new_id);
+ if (r) {
+ DMWARN("Failed to change transaction id from %s to %s.",
+ argv[1], argv[2]);
+ return r;
+ }
+
+ return 0;
+}
+
+static int process_reserve_metadata_snap_mesg(unsigned argc, char **argv, struct pool *pool)
+{
+ int r;
+
+ r = check_arg_count(argc, 1);
+ if (r)
+ return r;
+
+ (void) commit(pool);
+
+ r = dm_pool_reserve_metadata_snap(pool->pmd);
+ if (r)
+ DMWARN("reserve_metadata_snap message failed.");
+
+ return r;
+}
+
+static int process_release_metadata_snap_mesg(unsigned argc, char **argv, struct pool *pool)
+{
+ int r;
+
+ r = check_arg_count(argc, 1);
+ if (r)
+ return r;
+
+ r = dm_pool_release_metadata_snap(pool->pmd);
+ if (r)
+ DMWARN("release_metadata_snap message failed.");
+
+ return r;
+}
+
+/*
+ * Messages supported:
+ * create_thin <dev_id>
+ * create_snap <dev_id> <origin_id>
+ * delete <dev_id>
+ * set_transaction_id <current_trans_id> <new_trans_id>
+ * reserve_metadata_snap
+ * release_metadata_snap
+ */
+static int pool_message(struct dm_target *ti, unsigned argc, char **argv)
+{
+ int r = -EINVAL;
+ struct pool_c *pt = ti->private;
+ struct pool *pool = pt->pool;
+
+ if (get_pool_mode(pool) >= PM_READ_ONLY) {
+ DMERR("%s: unable to service pool target messages in READ_ONLY or FAIL mode",
+ dm_device_name(pool->pool_md));
+ return -EINVAL;
+ }
+
+ if (!strcasecmp(argv[0], "create_thin"))
+ r = process_create_thin_mesg(argc, argv, pool);
+
+ else if (!strcasecmp(argv[0], "create_snap"))
+ r = process_create_snap_mesg(argc, argv, pool);
+
+ else if (!strcasecmp(argv[0], "delete"))
+ r = process_delete_mesg(argc, argv, pool);
+
+ else if (!strcasecmp(argv[0], "set_transaction_id"))
+ r = process_set_transaction_id_mesg(argc, argv, pool);
+
+ else if (!strcasecmp(argv[0], "reserve_metadata_snap"))
+ r = process_reserve_metadata_snap_mesg(argc, argv, pool);
+
+ else if (!strcasecmp(argv[0], "release_metadata_snap"))
+ r = process_release_metadata_snap_mesg(argc, argv, pool);
+
+ else
+ DMWARN("Unrecognised thin pool target message received: %s", argv[0]);
+
+ if (!r)
+ (void) commit(pool);
+
+ return r;
+}
+
+static void emit_flags(struct pool_features *pf, char *result,
+ unsigned sz, unsigned maxlen)
+{
+ unsigned count = !pf->zero_new_blocks + !pf->discard_enabled +
+ !pf->discard_passdown + (pf->mode == PM_READ_ONLY) +
+ pf->error_if_no_space;
+ DMEMIT("%u ", count);
+
+ if (!pf->zero_new_blocks)
+ DMEMIT("skip_block_zeroing ");
+
+ if (!pf->discard_enabled)
+ DMEMIT("ignore_discard ");
+
+ if (!pf->discard_passdown)
+ DMEMIT("no_discard_passdown ");
+
+ if (pf->mode == PM_READ_ONLY)
+ DMEMIT("read_only ");
+
+ if (pf->error_if_no_space)
+ DMEMIT("error_if_no_space ");
+}
+
+/*
+ * Status line is:
+ * <transaction id> <used metadata sectors>/<total metadata sectors>
+ * <used data sectors>/<total data sectors> <held metadata root>
+ */
+static void pool_status(struct dm_target *ti, status_type_t type,
+ unsigned status_flags, char *result, unsigned maxlen)
+{
+ int r;
+ unsigned sz = 0;
+ uint64_t transaction_id;
+ dm_block_t nr_free_blocks_data;
+ dm_block_t nr_free_blocks_metadata;
+ dm_block_t nr_blocks_data;
+ dm_block_t nr_blocks_metadata;
+ dm_block_t held_root;
+ char buf[BDEVNAME_SIZE];
+ char buf2[BDEVNAME_SIZE];
+ struct pool_c *pt = ti->private;
+ struct pool *pool = pt->pool;
+
+ switch (type) {
+ case STATUSTYPE_INFO:
+ if (get_pool_mode(pool) == PM_FAIL) {
+ DMEMIT("Fail");
+ break;
+ }
+
+ /* Commit to ensure statistics aren't out-of-date */
+ if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti))
+ (void) commit(pool);
+
+ r = dm_pool_get_metadata_transaction_id(pool->pmd, &transaction_id);
+ if (r) {
+ DMERR("%s: dm_pool_get_metadata_transaction_id returned %d",
+ dm_device_name(pool->pool_md), r);
+ goto err;
+ }
+
+ r = dm_pool_get_free_metadata_block_count(pool->pmd, &nr_free_blocks_metadata);
+ if (r) {
+ DMERR("%s: dm_pool_get_free_metadata_block_count returned %d",
+ dm_device_name(pool->pool_md), r);
+ goto err;
+ }
+
+ r = dm_pool_get_metadata_dev_size(pool->pmd, &nr_blocks_metadata);
+ if (r) {
+ DMERR("%s: dm_pool_get_metadata_dev_size returned %d",
+ dm_device_name(pool->pool_md), r);
+ goto err;
+ }
+
+ r = dm_pool_get_free_block_count(pool->pmd, &nr_free_blocks_data);
+ if (r) {
+ DMERR("%s: dm_pool_get_free_block_count returned %d",
+ dm_device_name(pool->pool_md), r);
+ goto err;
+ }
+
+ r = dm_pool_get_data_dev_size(pool->pmd, &nr_blocks_data);
+ if (r) {
+ DMERR("%s: dm_pool_get_data_dev_size returned %d",
+ dm_device_name(pool->pool_md), r);
+ goto err;
+ }
+
+ r = dm_pool_get_metadata_snap(pool->pmd, &held_root);
+ if (r) {
+ DMERR("%s: dm_pool_get_metadata_snap returned %d",
+ dm_device_name(pool->pool_md), r);
+ goto err;
+ }
+
+ DMEMIT("%llu %llu/%llu %llu/%llu ",
+ (unsigned long long)transaction_id,
+ (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata),
+ (unsigned long long)nr_blocks_metadata,
+ (unsigned long long)(nr_blocks_data - nr_free_blocks_data),
+ (unsigned long long)nr_blocks_data);
+
+ if (held_root)
+ DMEMIT("%llu ", held_root);
+ else
+ DMEMIT("- ");
+
+ if (pool->pf.mode == PM_OUT_OF_DATA_SPACE)
+ DMEMIT("out_of_data_space ");
+ else if (pool->pf.mode == PM_READ_ONLY)
+ DMEMIT("ro ");
+ else
+ DMEMIT("rw ");
+
+ if (!pool->pf.discard_enabled)
+ DMEMIT("ignore_discard ");
+ else if (pool->pf.discard_passdown)
+ DMEMIT("discard_passdown ");
+ else
+ DMEMIT("no_discard_passdown ");
+
+ if (pool->pf.error_if_no_space)
+ DMEMIT("error_if_no_space ");
+ else
+ DMEMIT("queue_if_no_space ");
+
+ break;
+
+ case STATUSTYPE_TABLE:
+ DMEMIT("%s %s %lu %llu ",
+ format_dev_t(buf, pt->metadata_dev->bdev->bd_dev),
+ format_dev_t(buf2, pt->data_dev->bdev->bd_dev),
+ (unsigned long)pool->sectors_per_block,
+ (unsigned long long)pt->low_water_blocks);
+ emit_flags(&pt->requested_pf, result, sz, maxlen);
+ break;
+ }
+ return;
+
+err:
+ DMEMIT("Error");
+}
+
+static int pool_iterate_devices(struct dm_target *ti,
+ iterate_devices_callout_fn fn, void *data)
+{
+ struct pool_c *pt = ti->private;
+
+ return fn(ti, pt->data_dev, 0, ti->len, data);
+}
+
+static int pool_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
+ struct bio_vec *biovec, int max_size)
+{
+ struct pool_c *pt = ti->private;
+ struct request_queue *q = bdev_get_queue(pt->data_dev->bdev);
+
+ if (!q->merge_bvec_fn)
+ return max_size;
+
+ bvm->bi_bdev = pt->data_dev->bdev;
+
+ return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
+}
+
+static void set_discard_limits(struct pool_c *pt, struct queue_limits *limits)
+{
+ struct pool *pool = pt->pool;
+ struct queue_limits *data_limits;
+
+ limits->max_discard_sectors = pool->sectors_per_block;
+
+ /*
+ * discard_granularity is just a hint, and not enforced.
+ */
+ if (pt->adjusted_pf.discard_passdown) {
+ data_limits = &bdev_get_queue(pt->data_dev->bdev)->limits;
+ limits->discard_granularity = max(data_limits->discard_granularity,
+ pool->sectors_per_block << SECTOR_SHIFT);
+ } else
+ limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT;
+}
+
+static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
+{
+ struct pool_c *pt = ti->private;
+ struct pool *pool = pt->pool;
+ sector_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT;
+
+ /*
+ * If max_sectors is smaller than pool->sectors_per_block adjust it
+ * to the highest possible power-of-2 factor of pool->sectors_per_block.
+ * This is especially beneficial when the pool's data device is a RAID
+ * device that has a full stripe width that matches pool->sectors_per_block
+ * -- because even though partial RAID stripe-sized IOs will be issued to a
+ * single RAID stripe; when aggregated they will end on a full RAID stripe
+ * boundary.. which avoids additional partial RAID stripe writes cascading
+ */
+ if (limits->max_sectors < pool->sectors_per_block) {
+ while (!is_factor(pool->sectors_per_block, limits->max_sectors)) {
+ if ((limits->max_sectors & (limits->max_sectors - 1)) == 0)
+ limits->max_sectors--;
+ limits->max_sectors = rounddown_pow_of_two(limits->max_sectors);
+ }
+ }
+
+ /*
+ * If the system-determined stacked limits are compatible with the
+ * pool's blocksize (io_opt is a factor) do not override them.
+ */
+ if (io_opt_sectors < pool->sectors_per_block ||
+ !is_factor(io_opt_sectors, pool->sectors_per_block)) {
+ if (is_factor(pool->sectors_per_block, limits->max_sectors))
+ blk_limits_io_min(limits, limits->max_sectors << SECTOR_SHIFT);
+ else
+ blk_limits_io_min(limits, pool->sectors_per_block << SECTOR_SHIFT);
+ blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT);
+ }
+
+ /*
+ * pt->adjusted_pf is a staging area for the actual features to use.
+ * They get transferred to the live pool in bind_control_target()
+ * called from pool_preresume().
+ */
+ if (!pt->adjusted_pf.discard_enabled) {
+ /*
+ * Must explicitly disallow stacking discard limits otherwise the
+ * block layer will stack them if pool's data device has support.
+ * QUEUE_FLAG_DISCARD wouldn't be set but there is no way for the
+ * user to see that, so make sure to set all discard limits to 0.
+ */
+ limits->discard_granularity = 0;
+ return;
+ }
+
+ disable_passdown_if_not_supported(pt);
+
+ set_discard_limits(pt, limits);
+}
+
+static struct target_type pool_target = {
+ .name = "thin-pool",
+ .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
+ DM_TARGET_IMMUTABLE,
+ .version = {1, 14, 0},
+ .module = THIS_MODULE,
+ .ctr = pool_ctr,
+ .dtr = pool_dtr,
+ .map = pool_map,
+ .presuspend = pool_presuspend,
+ .presuspend_undo = pool_presuspend_undo,
+ .postsuspend = pool_postsuspend,
+ .preresume = pool_preresume,
+ .resume = pool_resume,
+ .message = pool_message,
+ .status = pool_status,
+ .merge = pool_merge,
+ .iterate_devices = pool_iterate_devices,
+ .io_hints = pool_io_hints,
+};
+
+/*----------------------------------------------------------------
+ * Thin target methods
+ *--------------------------------------------------------------*/
+static void thin_get(struct thin_c *tc)
+{
+ atomic_inc(&tc->refcount);
+}
+
+static void thin_put(struct thin_c *tc)
+{
+ if (atomic_dec_and_test(&tc->refcount))
+ complete(&tc->can_destroy);
+}
+
+static void thin_dtr(struct dm_target *ti)
+{
+ struct thin_c *tc = ti->private;
+ unsigned long flags;
+
+ spin_lock_irqsave(&tc->pool->lock, flags);
+ list_del_rcu(&tc->list);
+ spin_unlock_irqrestore(&tc->pool->lock, flags);
+ synchronize_rcu();
+
+ thin_put(tc);
+ wait_for_completion(&tc->can_destroy);
+
+ mutex_lock(&dm_thin_pool_table.mutex);
+
+ __pool_dec(tc->pool);
+ dm_pool_close_thin_device(tc->td);
+ dm_put_device(ti, tc->pool_dev);
+ if (tc->origin_dev)
+ dm_put_device(ti, tc->origin_dev);
+ kfree(tc);
+
+ mutex_unlock(&dm_thin_pool_table.mutex);
+}
+
+/*
+ * Thin target parameters:
+ *
+ * <pool_dev> <dev_id> [origin_dev]
+ *
+ * pool_dev: the path to the pool (eg, /dev/mapper/my_pool)
+ * dev_id: the internal device identifier
+ * origin_dev: a device external to the pool that should act as the origin
+ *
+ * If the pool device has discards disabled, they get disabled for the thin
+ * device as well.
+ */
+static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
+{
+ int r;
+ struct thin_c *tc;
+ struct dm_dev *pool_dev, *origin_dev;
+ struct mapped_device *pool_md;
+ unsigned long flags;
+
+ mutex_lock(&dm_thin_pool_table.mutex);
+
+ if (argc != 2 && argc != 3) {
+ ti->error = "Invalid argument count";
+ r = -EINVAL;
+ goto out_unlock;
+ }
+
+ tc = ti->private = kzalloc(sizeof(*tc), GFP_KERNEL);
+ if (!tc) {
+ ti->error = "Out of memory";
+ r = -ENOMEM;
+ goto out_unlock;
+ }
+ tc->thin_md = dm_table_get_md(ti->table);
+ spin_lock_init(&tc->lock);
+ INIT_LIST_HEAD(&tc->deferred_cells);
+ bio_list_init(&tc->deferred_bio_list);
+ bio_list_init(&tc->retry_on_resume_list);
+ tc->sort_bio_list = RB_ROOT;
+
+ if (argc == 3) {
+ r = dm_get_device(ti, argv[2], FMODE_READ, &origin_dev);
+ if (r) {
+ ti->error = "Error opening origin device";
+ goto bad_origin_dev;
+ }
+ tc->origin_dev = origin_dev;
+ }
+
+ r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &pool_dev);
+ if (r) {
+ ti->error = "Error opening pool device";
+ goto bad_pool_dev;
+ }
+ tc->pool_dev = pool_dev;
+
+ if (read_dev_id(argv[1], (unsigned long long *)&tc->dev_id, 0)) {
+ ti->error = "Invalid device id";
+ r = -EINVAL;
+ goto bad_common;
+ }
+
+ pool_md = dm_get_md(tc->pool_dev->bdev->bd_dev);
+ if (!pool_md) {
+ ti->error = "Couldn't get pool mapped device";
+ r = -EINVAL;
+ goto bad_common;
+ }
+
+ tc->pool = __pool_table_lookup(pool_md);
+ if (!tc->pool) {
+ ti->error = "Couldn't find pool object";
+ r = -EINVAL;
+ goto bad_pool_lookup;
+ }
+ __pool_inc(tc->pool);
+
+ if (get_pool_mode(tc->pool) == PM_FAIL) {
+ ti->error = "Couldn't open thin device, Pool is in fail mode";
+ r = -EINVAL;
+ goto bad_pool;
+ }
+
+ r = dm_pool_open_thin_device(tc->pool->pmd, tc->dev_id, &tc->td);
+ if (r) {
+ ti->error = "Couldn't open thin internal device";
+ goto bad_pool;
+ }
+
+ r = dm_set_target_max_io_len(ti, tc->pool->sectors_per_block);
+ if (r)
+ goto bad;
+
+ ti->num_flush_bios = 1;
+ ti->flush_supported = true;
+ ti->per_bio_data_size = sizeof(struct dm_thin_endio_hook);
+
+ /* In case the pool supports discards, pass them on. */
+ ti->discard_zeroes_data_unsupported = true;
+ if (tc->pool->pf.discard_enabled) {
+ ti->discards_supported = true;
+ ti->num_discard_bios = 1;
+ /* Discard bios must be split on a block boundary */
+ ti->split_discard_bios = true;
+ }
+
+ mutex_unlock(&dm_thin_pool_table.mutex);
+
+ spin_lock_irqsave(&tc->pool->lock, flags);
+ if (tc->pool->suspended) {
+ spin_unlock_irqrestore(&tc->pool->lock, flags);
+ mutex_lock(&dm_thin_pool_table.mutex); /* reacquire for __pool_dec */
+ ti->error = "Unable to activate thin device while pool is suspended";
+ r = -EINVAL;
+ goto bad;
+ }
+ atomic_set(&tc->refcount, 1);
+ init_completion(&tc->can_destroy);
+ list_add_tail_rcu(&tc->list, &tc->pool->active_thins);
+ spin_unlock_irqrestore(&tc->pool->lock, flags);
+ /*
+ * This synchronize_rcu() call is needed here otherwise we risk a
+ * wake_worker() call finding no bios to process (because the newly
+ * added tc isn't yet visible). So this reduces latency since we
+ * aren't then dependent on the periodic commit to wake_worker().
+ */
+ synchronize_rcu();
+
+ dm_put(pool_md);
+
+ return 0;
+
+bad:
+ dm_pool_close_thin_device(tc->td);
+bad_pool:
+ __pool_dec(tc->pool);
+bad_pool_lookup:
+ dm_put(pool_md);
+bad_common:
+ dm_put_device(ti, tc->pool_dev);
+bad_pool_dev:
+ if (tc->origin_dev)
+ dm_put_device(ti, tc->origin_dev);
+bad_origin_dev:
+ kfree(tc);
+out_unlock:
+ mutex_unlock(&dm_thin_pool_table.mutex);
+
+ return r;
+}
+
+static int thin_map(struct dm_target *ti, struct bio *bio)
+{
+ bio->bi_iter.bi_sector = dm_target_offset(ti, bio->bi_iter.bi_sector);
+
+ return thin_bio_map(ti, bio);
+}
+
+static int thin_endio(struct dm_target *ti, struct bio *bio, int err)
+{
+ unsigned long flags;
+ struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
+ struct list_head work;
+ struct dm_thin_new_mapping *m, *tmp;
+ struct pool *pool = h->tc->pool;
+
+ if (h->shared_read_entry) {
+ INIT_LIST_HEAD(&work);
+ dm_deferred_entry_dec(h->shared_read_entry, &work);
+
+ spin_lock_irqsave(&pool->lock, flags);
+ list_for_each_entry_safe(m, tmp, &work, list) {
+ list_del(&m->list);
+ __complete_mapping_preparation(m);
+ }
+ spin_unlock_irqrestore(&pool->lock, flags);
+ }
+
+ if (h->all_io_entry) {
+ INIT_LIST_HEAD(&work);
+ dm_deferred_entry_dec(h->all_io_entry, &work);
+ if (!list_empty(&work)) {
+ spin_lock_irqsave(&pool->lock, flags);
+ list_for_each_entry_safe(m, tmp, &work, list)
+ list_add_tail(&m->list, &pool->prepared_discards);
+ spin_unlock_irqrestore(&pool->lock, flags);
+ wake_worker(pool);
+ }
+ }
+
+ return 0;
+}
+
+static void thin_presuspend(struct dm_target *ti)
+{
+ struct thin_c *tc = ti->private;
+
+ if (dm_noflush_suspending(ti))
+ noflush_work(tc, do_noflush_start);
+}
+
+static void thin_postsuspend(struct dm_target *ti)
+{
+ struct thin_c *tc = ti->private;
+
+ /*
+ * The dm_noflush_suspending flag has been cleared by now, so
+ * unfortunately we must always run this.
+ */
+ noflush_work(tc, do_noflush_stop);
+}
+
+static int thin_preresume(struct dm_target *ti)
+{
+ struct thin_c *tc = ti->private;
+
+ if (tc->origin_dev)
+ tc->origin_size = get_dev_size(tc->origin_dev->bdev);
+
+ return 0;
+}
+
+/*
+ * <nr mapped sectors> <highest mapped sector>
+ */
+static void thin_status(struct dm_target *ti, status_type_t type,
+ unsigned status_flags, char *result, unsigned maxlen)
+{
+ int r;
+ ssize_t sz = 0;
+ dm_block_t mapped, highest;
+ char buf[BDEVNAME_SIZE];
+ struct thin_c *tc = ti->private;
+
+ if (get_pool_mode(tc->pool) == PM_FAIL) {
+ DMEMIT("Fail");
+ return;
+ }
+
+ if (!tc->td)
+ DMEMIT("-");
+ else {
+ switch (type) {
+ case STATUSTYPE_INFO:
+ r = dm_thin_get_mapped_count(tc->td, &mapped);
+ if (r) {
+ DMERR("dm_thin_get_mapped_count returned %d", r);
+ goto err;
+ }
+
+ r = dm_thin_get_highest_mapped_block(tc->td, &highest);
+ if (r < 0) {
+ DMERR("dm_thin_get_highest_mapped_block returned %d", r);
+ goto err;
+ }
+
+ DMEMIT("%llu ", mapped * tc->pool->sectors_per_block);
+ if (r)
+ DMEMIT("%llu", ((highest + 1) *
+ tc->pool->sectors_per_block) - 1);
+ else
+ DMEMIT("-");
+ break;
+
+ case STATUSTYPE_TABLE:
+ DMEMIT("%s %lu",
+ format_dev_t(buf, tc->pool_dev->bdev->bd_dev),
+ (unsigned long) tc->dev_id);
+ if (tc->origin_dev)
+ DMEMIT(" %s", format_dev_t(buf, tc->origin_dev->bdev->bd_dev));
+ break;
+ }
+ }
+
+ return;
+
+err:
+ DMEMIT("Error");
+}
+
+static int thin_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
+ struct bio_vec *biovec, int max_size)
+{
+ struct thin_c *tc = ti->private;
+ struct request_queue *q = bdev_get_queue(tc->pool_dev->bdev);
+
+ if (!q->merge_bvec_fn)
+ return max_size;
+
+ bvm->bi_bdev = tc->pool_dev->bdev;
+ bvm->bi_sector = dm_target_offset(ti, bvm->bi_sector);
+
+ return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
+}
+
+static int thin_iterate_devices(struct dm_target *ti,
+ iterate_devices_callout_fn fn, void *data)
+{
+ sector_t blocks;
+ struct thin_c *tc = ti->private;
+ struct pool *pool = tc->pool;
+
+ /*
+ * We can't call dm_pool_get_data_dev_size() since that blocks. So
+ * we follow a more convoluted path through to the pool's target.
+ */
+ if (!pool->ti)
+ return 0; /* nothing is bound */
+
+ blocks = pool->ti->len;
+ (void) sector_div(blocks, pool->sectors_per_block);
+ if (blocks)
+ return fn(ti, tc->pool_dev, 0, pool->sectors_per_block * blocks, data);
+
+ return 0;
+}
+
+static struct target_type thin_target = {
+ .name = "thin",
+ .version = {1, 14, 0},
+ .module = THIS_MODULE,
+ .ctr = thin_ctr,
+ .dtr = thin_dtr,
+ .map = thin_map,
+ .end_io = thin_endio,
+ .preresume = thin_preresume,
+ .presuspend = thin_presuspend,
+ .postsuspend = thin_postsuspend,
+ .status = thin_status,
+ .merge = thin_merge,
+ .iterate_devices = thin_iterate_devices,
+};
+
+/*----------------------------------------------------------------*/
+
+static int __init dm_thin_init(void)
+{
+ int r;
+
+ pool_table_init();
+
+ r = dm_register_target(&thin_target);
+ if (r)
+ return r;
+
+ r = dm_register_target(&pool_target);
+ if (r)
+ goto bad_pool_target;
+
+ r = -ENOMEM;
+
+ _new_mapping_cache = KMEM_CACHE(dm_thin_new_mapping, 0);
+ if (!_new_mapping_cache)
+ goto bad_new_mapping_cache;
+
+ return 0;
+
+bad_new_mapping_cache:
+ dm_unregister_target(&pool_target);
+bad_pool_target:
+ dm_unregister_target(&thin_target);
+
+ return r;
+}
+
+static void dm_thin_exit(void)
+{
+ dm_unregister_target(&thin_target);
+ dm_unregister_target(&pool_target);
+
+ kmem_cache_destroy(_new_mapping_cache);
+}
+
+module_init(dm_thin_init);
+module_exit(dm_thin_exit);
+
+module_param_named(no_space_timeout, no_space_timeout_secs, uint, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(no_space_timeout, "Out of data space queue IO timeout in seconds");
+
+MODULE_DESCRIPTION(DM_NAME " thin provisioning target");
+MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
+MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm-uevent.c b/drivers/md/dm-uevent.c
new file mode 100644
index 000000000..8efe033ba
--- /dev/null
+++ b/drivers/md/dm-uevent.c
@@ -0,0 +1,219 @@
+/*
+ * Device Mapper Uevent Support (dm-uevent)
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright IBM Corporation, 2007
+ * Author: Mike Anderson <andmike@linux.vnet.ibm.com>
+ */
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/kobject.h>
+#include <linux/dm-ioctl.h>
+#include <linux/export.h>
+
+#include "dm.h"
+#include "dm-uevent.h"
+
+#define DM_MSG_PREFIX "uevent"
+
+static const struct {
+ enum dm_uevent_type type;
+ enum kobject_action action;
+ char *name;
+} _dm_uevent_type_names[] = {
+ {DM_UEVENT_PATH_FAILED, KOBJ_CHANGE, "PATH_FAILED"},
+ {DM_UEVENT_PATH_REINSTATED, KOBJ_CHANGE, "PATH_REINSTATED"},
+};
+
+static struct kmem_cache *_dm_event_cache;
+
+struct dm_uevent {
+ struct mapped_device *md;
+ enum kobject_action action;
+ struct kobj_uevent_env ku_env;
+ struct list_head elist;
+ char name[DM_NAME_LEN];
+ char uuid[DM_UUID_LEN];
+};
+
+static void dm_uevent_free(struct dm_uevent *event)
+{
+ kmem_cache_free(_dm_event_cache, event);
+}
+
+static struct dm_uevent *dm_uevent_alloc(struct mapped_device *md)
+{
+ struct dm_uevent *event;
+
+ event = kmem_cache_zalloc(_dm_event_cache, GFP_ATOMIC);
+ if (!event)
+ return NULL;
+
+ INIT_LIST_HEAD(&event->elist);
+ event->md = md;
+
+ return event;
+}
+
+static struct dm_uevent *dm_build_path_uevent(struct mapped_device *md,
+ struct dm_target *ti,
+ enum kobject_action action,
+ const char *dm_action,
+ const char *path,
+ unsigned nr_valid_paths)
+{
+ struct dm_uevent *event;
+
+ event = dm_uevent_alloc(md);
+ if (!event) {
+ DMERR("%s: dm_uevent_alloc() failed", __func__);
+ goto err_nomem;
+ }
+
+ event->action = action;
+
+ if (add_uevent_var(&event->ku_env, "DM_TARGET=%s", ti->type->name)) {
+ DMERR("%s: add_uevent_var() for DM_TARGET failed",
+ __func__);
+ goto err_add;
+ }
+
+ if (add_uevent_var(&event->ku_env, "DM_ACTION=%s", dm_action)) {
+ DMERR("%s: add_uevent_var() for DM_ACTION failed",
+ __func__);
+ goto err_add;
+ }
+
+ if (add_uevent_var(&event->ku_env, "DM_SEQNUM=%u",
+ dm_next_uevent_seq(md))) {
+ DMERR("%s: add_uevent_var() for DM_SEQNUM failed",
+ __func__);
+ goto err_add;
+ }
+
+ if (add_uevent_var(&event->ku_env, "DM_PATH=%s", path)) {
+ DMERR("%s: add_uevent_var() for DM_PATH failed", __func__);
+ goto err_add;
+ }
+
+ if (add_uevent_var(&event->ku_env, "DM_NR_VALID_PATHS=%d",
+ nr_valid_paths)) {
+ DMERR("%s: add_uevent_var() for DM_NR_VALID_PATHS failed",
+ __func__);
+ goto err_add;
+ }
+
+ return event;
+
+err_add:
+ dm_uevent_free(event);
+err_nomem:
+ return ERR_PTR(-ENOMEM);
+}
+
+/**
+ * dm_send_uevents - send uevents for given list
+ *
+ * @events: list of events to send
+ * @kobj: kobject generating event
+ *
+ */
+void dm_send_uevents(struct list_head *events, struct kobject *kobj)
+{
+ int r;
+ struct dm_uevent *event, *next;
+
+ list_for_each_entry_safe(event, next, events, elist) {
+ list_del_init(&event->elist);
+
+ /*
+ * When a device is being removed this copy fails and we
+ * discard these unsent events.
+ */
+ if (dm_copy_name_and_uuid(event->md, event->name,
+ event->uuid)) {
+ DMINFO("%s: skipping sending uevent for lost device",
+ __func__);
+ goto uevent_free;
+ }
+
+ if (add_uevent_var(&event->ku_env, "DM_NAME=%s", event->name)) {
+ DMERR("%s: add_uevent_var() for DM_NAME failed",
+ __func__);
+ goto uevent_free;
+ }
+
+ if (add_uevent_var(&event->ku_env, "DM_UUID=%s", event->uuid)) {
+ DMERR("%s: add_uevent_var() for DM_UUID failed",
+ __func__);
+ goto uevent_free;
+ }
+
+ r = kobject_uevent_env(kobj, event->action, event->ku_env.envp);
+ if (r)
+ DMERR("%s: kobject_uevent_env failed", __func__);
+uevent_free:
+ dm_uevent_free(event);
+ }
+}
+EXPORT_SYMBOL_GPL(dm_send_uevents);
+
+/**
+ * dm_path_uevent - called to create a new path event and queue it
+ *
+ * @event_type: path event type enum
+ * @ti: pointer to a dm_target
+ * @path: string containing pathname
+ * @nr_valid_paths: number of valid paths remaining
+ *
+ */
+void dm_path_uevent(enum dm_uevent_type event_type, struct dm_target *ti,
+ const char *path, unsigned nr_valid_paths)
+{
+ struct mapped_device *md = dm_table_get_md(ti->table);
+ struct dm_uevent *event;
+
+ if (event_type >= ARRAY_SIZE(_dm_uevent_type_names)) {
+ DMERR("%s: Invalid event_type %d", __func__, event_type);
+ return;
+ }
+
+ event = dm_build_path_uevent(md, ti,
+ _dm_uevent_type_names[event_type].action,
+ _dm_uevent_type_names[event_type].name,
+ path, nr_valid_paths);
+ if (IS_ERR(event))
+ return;
+
+ dm_uevent_add(md, &event->elist);
+}
+EXPORT_SYMBOL_GPL(dm_path_uevent);
+
+int dm_uevent_init(void)
+{
+ _dm_event_cache = KMEM_CACHE(dm_uevent, 0);
+ if (!_dm_event_cache)
+ return -ENOMEM;
+
+ DMINFO("version 1.0.3");
+
+ return 0;
+}
+
+void dm_uevent_exit(void)
+{
+ kmem_cache_destroy(_dm_event_cache);
+}
diff --git a/drivers/md/dm-uevent.h b/drivers/md/dm-uevent.h
new file mode 100644
index 000000000..2eccc8bd6
--- /dev/null
+++ b/drivers/md/dm-uevent.h
@@ -0,0 +1,59 @@
+/*
+ * Device Mapper Uevent Support
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright IBM Corporation, 2007
+ * Author: Mike Anderson <andmike@linux.vnet.ibm.com>
+ */
+#ifndef DM_UEVENT_H
+#define DM_UEVENT_H
+
+enum dm_uevent_type {
+ DM_UEVENT_PATH_FAILED,
+ DM_UEVENT_PATH_REINSTATED,
+};
+
+#ifdef CONFIG_DM_UEVENT
+
+extern int dm_uevent_init(void);
+extern void dm_uevent_exit(void);
+extern void dm_send_uevents(struct list_head *events, struct kobject *kobj);
+extern void dm_path_uevent(enum dm_uevent_type event_type,
+ struct dm_target *ti, const char *path,
+ unsigned nr_valid_paths);
+
+#else
+
+static inline int dm_uevent_init(void)
+{
+ return 0;
+}
+static inline void dm_uevent_exit(void)
+{
+}
+static inline void dm_send_uevents(struct list_head *events,
+ struct kobject *kobj)
+{
+}
+static inline void dm_path_uevent(enum dm_uevent_type event_type,
+ struct dm_target *ti, const char *path,
+ unsigned nr_valid_paths)
+{
+}
+
+#endif /* CONFIG_DM_UEVENT */
+
+#endif /* DM_UEVENT_H */
diff --git a/drivers/md/dm-verity.c b/drivers/md/dm-verity.c
new file mode 100644
index 000000000..66616db33
--- /dev/null
+++ b/drivers/md/dm-verity.c
@@ -0,0 +1,1026 @@
+/*
+ * Copyright (C) 2012 Red Hat, Inc.
+ *
+ * Author: Mikulas Patocka <mpatocka@redhat.com>
+ *
+ * Based on Chromium dm-verity driver (C) 2011 The Chromium OS Authors
+ *
+ * This file is released under the GPLv2.
+ *
+ * In the file "/sys/module/dm_verity/parameters/prefetch_cluster" you can set
+ * default prefetch value. Data are read in "prefetch_cluster" chunks from the
+ * hash device. Setting this greatly improves performance when data and hash
+ * are on the same disk on different partitions on devices with poor random
+ * access behavior.
+ */
+
+#include "dm-bufio.h"
+
+#include <linux/module.h>
+#include <linux/device-mapper.h>
+#include <linux/reboot.h>
+#include <crypto/hash.h>
+
+#define DM_MSG_PREFIX "verity"
+
+#define DM_VERITY_ENV_LENGTH 42
+#define DM_VERITY_ENV_VAR_NAME "DM_VERITY_ERR_BLOCK_NR"
+
+#define DM_VERITY_IO_VEC_INLINE 16
+#define DM_VERITY_MEMPOOL_SIZE 4
+#define DM_VERITY_DEFAULT_PREFETCH_SIZE 262144
+
+#define DM_VERITY_MAX_LEVELS 63
+#define DM_VERITY_MAX_CORRUPTED_ERRS 100
+
+#define DM_VERITY_OPT_LOGGING "ignore_corruption"
+#define DM_VERITY_OPT_RESTART "restart_on_corruption"
+
+static unsigned dm_verity_prefetch_cluster = DM_VERITY_DEFAULT_PREFETCH_SIZE;
+
+module_param_named(prefetch_cluster, dm_verity_prefetch_cluster, uint, S_IRUGO | S_IWUSR);
+
+enum verity_mode {
+ DM_VERITY_MODE_EIO,
+ DM_VERITY_MODE_LOGGING,
+ DM_VERITY_MODE_RESTART
+};
+
+enum verity_block_type {
+ DM_VERITY_BLOCK_TYPE_DATA,
+ DM_VERITY_BLOCK_TYPE_METADATA
+};
+
+struct dm_verity {
+ struct dm_dev *data_dev;
+ struct dm_dev *hash_dev;
+ struct dm_target *ti;
+ struct dm_bufio_client *bufio;
+ char *alg_name;
+ struct crypto_shash *tfm;
+ u8 *root_digest; /* digest of the root block */
+ u8 *salt; /* salt: its size is salt_size */
+ unsigned salt_size;
+ sector_t data_start; /* data offset in 512-byte sectors */
+ sector_t hash_start; /* hash start in blocks */
+ sector_t data_blocks; /* the number of data blocks */
+ sector_t hash_blocks; /* the number of hash blocks */
+ unsigned char data_dev_block_bits; /* log2(data blocksize) */
+ unsigned char hash_dev_block_bits; /* log2(hash blocksize) */
+ unsigned char hash_per_block_bits; /* log2(hashes in hash block) */
+ unsigned char levels; /* the number of tree levels */
+ unsigned char version;
+ unsigned digest_size; /* digest size for the current hash algorithm */
+ unsigned shash_descsize;/* the size of temporary space for crypto */
+ int hash_failed; /* set to 1 if hash of any block failed */
+ enum verity_mode mode; /* mode for handling verification errors */
+ unsigned corrupted_errs;/* Number of errors for corrupted blocks */
+
+ mempool_t *vec_mempool; /* mempool of bio vector */
+
+ struct workqueue_struct *verify_wq;
+
+ /* starting blocks for each tree level. 0 is the lowest level. */
+ sector_t hash_level_block[DM_VERITY_MAX_LEVELS];
+};
+
+struct dm_verity_io {
+ struct dm_verity *v;
+
+ /* original values of bio->bi_end_io and bio->bi_private */
+ bio_end_io_t *orig_bi_end_io;
+ void *orig_bi_private;
+
+ sector_t block;
+ unsigned n_blocks;
+
+ struct bvec_iter iter;
+
+ struct work_struct work;
+
+ /*
+ * Three variably-size fields follow this struct:
+ *
+ * u8 hash_desc[v->shash_descsize];
+ * u8 real_digest[v->digest_size];
+ * u8 want_digest[v->digest_size];
+ *
+ * To access them use: io_hash_desc(), io_real_digest() and io_want_digest().
+ */
+};
+
+struct dm_verity_prefetch_work {
+ struct work_struct work;
+ struct dm_verity *v;
+ sector_t block;
+ unsigned n_blocks;
+};
+
+static struct shash_desc *io_hash_desc(struct dm_verity *v, struct dm_verity_io *io)
+{
+ return (struct shash_desc *)(io + 1);
+}
+
+static u8 *io_real_digest(struct dm_verity *v, struct dm_verity_io *io)
+{
+ return (u8 *)(io + 1) + v->shash_descsize;
+}
+
+static u8 *io_want_digest(struct dm_verity *v, struct dm_verity_io *io)
+{
+ return (u8 *)(io + 1) + v->shash_descsize + v->digest_size;
+}
+
+/*
+ * Auxiliary structure appended to each dm-bufio buffer. If the value
+ * hash_verified is nonzero, hash of the block has been verified.
+ *
+ * The variable hash_verified is set to 0 when allocating the buffer, then
+ * it can be changed to 1 and it is never reset to 0 again.
+ *
+ * There is no lock around this value, a race condition can at worst cause
+ * that multiple processes verify the hash of the same buffer simultaneously
+ * and write 1 to hash_verified simultaneously.
+ * This condition is harmless, so we don't need locking.
+ */
+struct buffer_aux {
+ int hash_verified;
+};
+
+/*
+ * Initialize struct buffer_aux for a freshly created buffer.
+ */
+static void dm_bufio_alloc_callback(struct dm_buffer *buf)
+{
+ struct buffer_aux *aux = dm_bufio_get_aux_data(buf);
+
+ aux->hash_verified = 0;
+}
+
+/*
+ * Translate input sector number to the sector number on the target device.
+ */
+static sector_t verity_map_sector(struct dm_verity *v, sector_t bi_sector)
+{
+ return v->data_start + dm_target_offset(v->ti, bi_sector);
+}
+
+/*
+ * Return hash position of a specified block at a specified tree level
+ * (0 is the lowest level).
+ * The lowest "hash_per_block_bits"-bits of the result denote hash position
+ * inside a hash block. The remaining bits denote location of the hash block.
+ */
+static sector_t verity_position_at_level(struct dm_verity *v, sector_t block,
+ int level)
+{
+ return block >> (level * v->hash_per_block_bits);
+}
+
+static void verity_hash_at_level(struct dm_verity *v, sector_t block, int level,
+ sector_t *hash_block, unsigned *offset)
+{
+ sector_t position = verity_position_at_level(v, block, level);
+ unsigned idx;
+
+ *hash_block = v->hash_level_block[level] + (position >> v->hash_per_block_bits);
+
+ if (!offset)
+ return;
+
+ idx = position & ((1 << v->hash_per_block_bits) - 1);
+ if (!v->version)
+ *offset = idx * v->digest_size;
+ else
+ *offset = idx << (v->hash_dev_block_bits - v->hash_per_block_bits);
+}
+
+/*
+ * Handle verification errors.
+ */
+static int verity_handle_err(struct dm_verity *v, enum verity_block_type type,
+ unsigned long long block)
+{
+ char verity_env[DM_VERITY_ENV_LENGTH];
+ char *envp[] = { verity_env, NULL };
+ const char *type_str = "";
+ struct mapped_device *md = dm_table_get_md(v->ti->table);
+
+ /* Corruption should be visible in device status in all modes */
+ v->hash_failed = 1;
+
+ if (v->corrupted_errs >= DM_VERITY_MAX_CORRUPTED_ERRS)
+ goto out;
+
+ v->corrupted_errs++;
+
+ switch (type) {
+ case DM_VERITY_BLOCK_TYPE_DATA:
+ type_str = "data";
+ break;
+ case DM_VERITY_BLOCK_TYPE_METADATA:
+ type_str = "metadata";
+ break;
+ default:
+ BUG();
+ }
+
+ DMERR("%s: %s block %llu is corrupted", v->data_dev->name, type_str,
+ block);
+
+ if (v->corrupted_errs == DM_VERITY_MAX_CORRUPTED_ERRS)
+ DMERR("%s: reached maximum errors", v->data_dev->name);
+
+ snprintf(verity_env, DM_VERITY_ENV_LENGTH, "%s=%d,%llu",
+ DM_VERITY_ENV_VAR_NAME, type, block);
+
+ kobject_uevent_env(&disk_to_dev(dm_disk(md))->kobj, KOBJ_CHANGE, envp);
+
+out:
+ if (v->mode == DM_VERITY_MODE_LOGGING)
+ return 0;
+
+ if (v->mode == DM_VERITY_MODE_RESTART)
+ kernel_restart("dm-verity device corrupted");
+
+ return 1;
+}
+
+/*
+ * Verify hash of a metadata block pertaining to the specified data block
+ * ("block" argument) at a specified level ("level" argument).
+ *
+ * On successful return, io_want_digest(v, io) contains the hash value for
+ * a lower tree level or for the data block (if we're at the lowest leve).
+ *
+ * If "skip_unverified" is true, unverified buffer is skipped and 1 is returned.
+ * If "skip_unverified" is false, unverified buffer is hashed and verified
+ * against current value of io_want_digest(v, io).
+ */
+static int verity_verify_level(struct dm_verity_io *io, sector_t block,
+ int level, bool skip_unverified)
+{
+ struct dm_verity *v = io->v;
+ struct dm_buffer *buf;
+ struct buffer_aux *aux;
+ u8 *data;
+ int r;
+ sector_t hash_block;
+ unsigned offset;
+
+ verity_hash_at_level(v, block, level, &hash_block, &offset);
+
+ data = dm_bufio_read(v->bufio, hash_block, &buf);
+ if (unlikely(IS_ERR(data)))
+ return PTR_ERR(data);
+
+ aux = dm_bufio_get_aux_data(buf);
+
+ if (!aux->hash_verified) {
+ struct shash_desc *desc;
+ u8 *result;
+
+ if (skip_unverified) {
+ r = 1;
+ goto release_ret_r;
+ }
+
+ desc = io_hash_desc(v, io);
+ desc->tfm = v->tfm;
+ desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP;
+ r = crypto_shash_init(desc);
+ if (r < 0) {
+ DMERR("crypto_shash_init failed: %d", r);
+ goto release_ret_r;
+ }
+
+ if (likely(v->version >= 1)) {
+ r = crypto_shash_update(desc, v->salt, v->salt_size);
+ if (r < 0) {
+ DMERR("crypto_shash_update failed: %d", r);
+ goto release_ret_r;
+ }
+ }
+
+ r = crypto_shash_update(desc, data, 1 << v->hash_dev_block_bits);
+ if (r < 0) {
+ DMERR("crypto_shash_update failed: %d", r);
+ goto release_ret_r;
+ }
+
+ if (!v->version) {
+ r = crypto_shash_update(desc, v->salt, v->salt_size);
+ if (r < 0) {
+ DMERR("crypto_shash_update failed: %d", r);
+ goto release_ret_r;
+ }
+ }
+
+ result = io_real_digest(v, io);
+ r = crypto_shash_final(desc, result);
+ if (r < 0) {
+ DMERR("crypto_shash_final failed: %d", r);
+ goto release_ret_r;
+ }
+ if (unlikely(memcmp(result, io_want_digest(v, io), v->digest_size))) {
+ if (verity_handle_err(v, DM_VERITY_BLOCK_TYPE_METADATA,
+ hash_block)) {
+ r = -EIO;
+ goto release_ret_r;
+ }
+ } else
+ aux->hash_verified = 1;
+ }
+
+ data += offset;
+
+ memcpy(io_want_digest(v, io), data, v->digest_size);
+
+ dm_bufio_release(buf);
+ return 0;
+
+release_ret_r:
+ dm_bufio_release(buf);
+
+ return r;
+}
+
+/*
+ * Verify one "dm_verity_io" structure.
+ */
+static int verity_verify_io(struct dm_verity_io *io)
+{
+ struct dm_verity *v = io->v;
+ struct bio *bio = dm_bio_from_per_bio_data(io,
+ v->ti->per_bio_data_size);
+ unsigned b;
+ int i;
+
+ for (b = 0; b < io->n_blocks; b++) {
+ struct shash_desc *desc;
+ u8 *result;
+ int r;
+ unsigned todo;
+
+ if (likely(v->levels)) {
+ /*
+ * First, we try to get the requested hash for
+ * the current block. If the hash block itself is
+ * verified, zero is returned. If it isn't, this
+ * function returns 0 and we fall back to whole
+ * chain verification.
+ */
+ int r = verity_verify_level(io, io->block + b, 0, true);
+ if (likely(!r))
+ goto test_block_hash;
+ if (r < 0)
+ return r;
+ }
+
+ memcpy(io_want_digest(v, io), v->root_digest, v->digest_size);
+
+ for (i = v->levels - 1; i >= 0; i--) {
+ int r = verity_verify_level(io, io->block + b, i, false);
+ if (unlikely(r))
+ return r;
+ }
+
+test_block_hash:
+ desc = io_hash_desc(v, io);
+ desc->tfm = v->tfm;
+ desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP;
+ r = crypto_shash_init(desc);
+ if (r < 0) {
+ DMERR("crypto_shash_init failed: %d", r);
+ return r;
+ }
+
+ if (likely(v->version >= 1)) {
+ r = crypto_shash_update(desc, v->salt, v->salt_size);
+ if (r < 0) {
+ DMERR("crypto_shash_update failed: %d", r);
+ return r;
+ }
+ }
+ todo = 1 << v->data_dev_block_bits;
+ do {
+ u8 *page;
+ unsigned len;
+ struct bio_vec bv = bio_iter_iovec(bio, io->iter);
+
+ page = kmap_atomic(bv.bv_page);
+ len = bv.bv_len;
+ if (likely(len >= todo))
+ len = todo;
+ r = crypto_shash_update(desc, page + bv.bv_offset, len);
+ kunmap_atomic(page);
+
+ if (r < 0) {
+ DMERR("crypto_shash_update failed: %d", r);
+ return r;
+ }
+
+ bio_advance_iter(bio, &io->iter, len);
+ todo -= len;
+ } while (todo);
+
+ if (!v->version) {
+ r = crypto_shash_update(desc, v->salt, v->salt_size);
+ if (r < 0) {
+ DMERR("crypto_shash_update failed: %d", r);
+ return r;
+ }
+ }
+
+ result = io_real_digest(v, io);
+ r = crypto_shash_final(desc, result);
+ if (r < 0) {
+ DMERR("crypto_shash_final failed: %d", r);
+ return r;
+ }
+ if (unlikely(memcmp(result, io_want_digest(v, io), v->digest_size))) {
+ if (verity_handle_err(v, DM_VERITY_BLOCK_TYPE_DATA,
+ io->block + b))
+ return -EIO;
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * End one "io" structure with a given error.
+ */
+static void verity_finish_io(struct dm_verity_io *io, int error)
+{
+ struct dm_verity *v = io->v;
+ struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_bio_data_size);
+
+ bio->bi_end_io = io->orig_bi_end_io;
+ bio->bi_private = io->orig_bi_private;
+
+ bio_endio_nodec(bio, error);
+}
+
+static void verity_work(struct work_struct *w)
+{
+ struct dm_verity_io *io = container_of(w, struct dm_verity_io, work);
+
+ verity_finish_io(io, verity_verify_io(io));
+}
+
+static void verity_end_io(struct bio *bio, int error)
+{
+ struct dm_verity_io *io = bio->bi_private;
+
+ if (error) {
+ verity_finish_io(io, error);
+ return;
+ }
+
+ INIT_WORK(&io->work, verity_work);
+ queue_work(io->v->verify_wq, &io->work);
+}
+
+/*
+ * Prefetch buffers for the specified io.
+ * The root buffer is not prefetched, it is assumed that it will be cached
+ * all the time.
+ */
+static void verity_prefetch_io(struct work_struct *work)
+{
+ struct dm_verity_prefetch_work *pw =
+ container_of(work, struct dm_verity_prefetch_work, work);
+ struct dm_verity *v = pw->v;
+ int i;
+
+ for (i = v->levels - 2; i >= 0; i--) {
+ sector_t hash_block_start;
+ sector_t hash_block_end;
+ verity_hash_at_level(v, pw->block, i, &hash_block_start, NULL);
+ verity_hash_at_level(v, pw->block + pw->n_blocks - 1, i, &hash_block_end, NULL);
+ if (!i) {
+ unsigned cluster = ACCESS_ONCE(dm_verity_prefetch_cluster);
+
+ cluster >>= v->data_dev_block_bits;
+ if (unlikely(!cluster))
+ goto no_prefetch_cluster;
+
+ if (unlikely(cluster & (cluster - 1)))
+ cluster = 1 << __fls(cluster);
+
+ hash_block_start &= ~(sector_t)(cluster - 1);
+ hash_block_end |= cluster - 1;
+ if (unlikely(hash_block_end >= v->hash_blocks))
+ hash_block_end = v->hash_blocks - 1;
+ }
+no_prefetch_cluster:
+ dm_bufio_prefetch(v->bufio, hash_block_start,
+ hash_block_end - hash_block_start + 1);
+ }
+
+ kfree(pw);
+}
+
+static void verity_submit_prefetch(struct dm_verity *v, struct dm_verity_io *io)
+{
+ struct dm_verity_prefetch_work *pw;
+
+ pw = kmalloc(sizeof(struct dm_verity_prefetch_work),
+ GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
+
+ if (!pw)
+ return;
+
+ INIT_WORK(&pw->work, verity_prefetch_io);
+ pw->v = v;
+ pw->block = io->block;
+ pw->n_blocks = io->n_blocks;
+ queue_work(v->verify_wq, &pw->work);
+}
+
+/*
+ * Bio map function. It allocates dm_verity_io structure and bio vector and
+ * fills them. Then it issues prefetches and the I/O.
+ */
+static int verity_map(struct dm_target *ti, struct bio *bio)
+{
+ struct dm_verity *v = ti->private;
+ struct dm_verity_io *io;
+
+ bio->bi_bdev = v->data_dev->bdev;
+ bio->bi_iter.bi_sector = verity_map_sector(v, bio->bi_iter.bi_sector);
+
+ if (((unsigned)bio->bi_iter.bi_sector | bio_sectors(bio)) &
+ ((1 << (v->data_dev_block_bits - SECTOR_SHIFT)) - 1)) {
+ DMERR_LIMIT("unaligned io");
+ return -EIO;
+ }
+
+ if (bio_end_sector(bio) >>
+ (v->data_dev_block_bits - SECTOR_SHIFT) > v->data_blocks) {
+ DMERR_LIMIT("io out of range");
+ return -EIO;
+ }
+
+ if (bio_data_dir(bio) == WRITE)
+ return -EIO;
+
+ io = dm_per_bio_data(bio, ti->per_bio_data_size);
+ io->v = v;
+ io->orig_bi_end_io = bio->bi_end_io;
+ io->orig_bi_private = bio->bi_private;
+ io->block = bio->bi_iter.bi_sector >> (v->data_dev_block_bits - SECTOR_SHIFT);
+ io->n_blocks = bio->bi_iter.bi_size >> v->data_dev_block_bits;
+
+ bio->bi_end_io = verity_end_io;
+ bio->bi_private = io;
+ io->iter = bio->bi_iter;
+
+ verity_submit_prefetch(v, io);
+
+ generic_make_request(bio);
+
+ return DM_MAPIO_SUBMITTED;
+}
+
+/*
+ * Status: V (valid) or C (corruption found)
+ */
+static void verity_status(struct dm_target *ti, status_type_t type,
+ unsigned status_flags, char *result, unsigned maxlen)
+{
+ struct dm_verity *v = ti->private;
+ unsigned sz = 0;
+ unsigned x;
+
+ switch (type) {
+ case STATUSTYPE_INFO:
+ DMEMIT("%c", v->hash_failed ? 'C' : 'V');
+ break;
+ case STATUSTYPE_TABLE:
+ DMEMIT("%u %s %s %u %u %llu %llu %s ",
+ v->version,
+ v->data_dev->name,
+ v->hash_dev->name,
+ 1 << v->data_dev_block_bits,
+ 1 << v->hash_dev_block_bits,
+ (unsigned long long)v->data_blocks,
+ (unsigned long long)v->hash_start,
+ v->alg_name
+ );
+ for (x = 0; x < v->digest_size; x++)
+ DMEMIT("%02x", v->root_digest[x]);
+ DMEMIT(" ");
+ if (!v->salt_size)
+ DMEMIT("-");
+ else
+ for (x = 0; x < v->salt_size; x++)
+ DMEMIT("%02x", v->salt[x]);
+ if (v->mode != DM_VERITY_MODE_EIO) {
+ DMEMIT(" 1 ");
+ switch (v->mode) {
+ case DM_VERITY_MODE_LOGGING:
+ DMEMIT(DM_VERITY_OPT_LOGGING);
+ break;
+ case DM_VERITY_MODE_RESTART:
+ DMEMIT(DM_VERITY_OPT_RESTART);
+ break;
+ default:
+ BUG();
+ }
+ }
+ break;
+ }
+}
+
+static int verity_ioctl(struct dm_target *ti, unsigned cmd,
+ unsigned long arg)
+{
+ struct dm_verity *v = ti->private;
+ int r = 0;
+
+ if (v->data_start ||
+ ti->len != i_size_read(v->data_dev->bdev->bd_inode) >> SECTOR_SHIFT)
+ r = scsi_verify_blk_ioctl(NULL, cmd);
+
+ return r ? : __blkdev_driver_ioctl(v->data_dev->bdev, v->data_dev->mode,
+ cmd, arg);
+}
+
+static int verity_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
+ struct bio_vec *biovec, int max_size)
+{
+ struct dm_verity *v = ti->private;
+ struct request_queue *q = bdev_get_queue(v->data_dev->bdev);
+
+ if (!q->merge_bvec_fn)
+ return max_size;
+
+ bvm->bi_bdev = v->data_dev->bdev;
+ bvm->bi_sector = verity_map_sector(v, bvm->bi_sector);
+
+ return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
+}
+
+static int verity_iterate_devices(struct dm_target *ti,
+ iterate_devices_callout_fn fn, void *data)
+{
+ struct dm_verity *v = ti->private;
+
+ return fn(ti, v->data_dev, v->data_start, ti->len, data);
+}
+
+static void verity_io_hints(struct dm_target *ti, struct queue_limits *limits)
+{
+ struct dm_verity *v = ti->private;
+
+ if (limits->logical_block_size < 1 << v->data_dev_block_bits)
+ limits->logical_block_size = 1 << v->data_dev_block_bits;
+
+ if (limits->physical_block_size < 1 << v->data_dev_block_bits)
+ limits->physical_block_size = 1 << v->data_dev_block_bits;
+
+ blk_limits_io_min(limits, limits->logical_block_size);
+}
+
+static void verity_dtr(struct dm_target *ti)
+{
+ struct dm_verity *v = ti->private;
+
+ if (v->verify_wq)
+ destroy_workqueue(v->verify_wq);
+
+ if (v->vec_mempool)
+ mempool_destroy(v->vec_mempool);
+
+ if (v->bufio)
+ dm_bufio_client_destroy(v->bufio);
+
+ kfree(v->salt);
+ kfree(v->root_digest);
+
+ if (v->tfm)
+ crypto_free_shash(v->tfm);
+
+ kfree(v->alg_name);
+
+ if (v->hash_dev)
+ dm_put_device(ti, v->hash_dev);
+
+ if (v->data_dev)
+ dm_put_device(ti, v->data_dev);
+
+ kfree(v);
+}
+
+/*
+ * Target parameters:
+ * <version> The current format is version 1.
+ * Vsn 0 is compatible with original Chromium OS releases.
+ * <data device>
+ * <hash device>
+ * <data block size>
+ * <hash block size>
+ * <the number of data blocks>
+ * <hash start block>
+ * <algorithm>
+ * <digest>
+ * <salt> Hex string or "-" if no salt.
+ */
+static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv)
+{
+ struct dm_verity *v;
+ struct dm_arg_set as;
+ const char *opt_string;
+ unsigned int num, opt_params;
+ unsigned long long num_ll;
+ int r;
+ int i;
+ sector_t hash_position;
+ char dummy;
+
+ static struct dm_arg _args[] = {
+ {0, 1, "Invalid number of feature args"},
+ };
+
+ v = kzalloc(sizeof(struct dm_verity), GFP_KERNEL);
+ if (!v) {
+ ti->error = "Cannot allocate verity structure";
+ return -ENOMEM;
+ }
+ ti->private = v;
+ v->ti = ti;
+
+ if ((dm_table_get_mode(ti->table) & ~FMODE_READ)) {
+ ti->error = "Device must be readonly";
+ r = -EINVAL;
+ goto bad;
+ }
+
+ if (argc < 10) {
+ ti->error = "Not enough arguments";
+ r = -EINVAL;
+ goto bad;
+ }
+
+ if (sscanf(argv[0], "%u%c", &num, &dummy) != 1 ||
+ num > 1) {
+ ti->error = "Invalid version";
+ r = -EINVAL;
+ goto bad;
+ }
+ v->version = num;
+
+ r = dm_get_device(ti, argv[1], FMODE_READ, &v->data_dev);
+ if (r) {
+ ti->error = "Data device lookup failed";
+ goto bad;
+ }
+
+ r = dm_get_device(ti, argv[2], FMODE_READ, &v->hash_dev);
+ if (r) {
+ ti->error = "Data device lookup failed";
+ goto bad;
+ }
+
+ if (sscanf(argv[3], "%u%c", &num, &dummy) != 1 ||
+ !num || (num & (num - 1)) ||
+ num < bdev_logical_block_size(v->data_dev->bdev) ||
+ num > PAGE_SIZE) {
+ ti->error = "Invalid data device block size";
+ r = -EINVAL;
+ goto bad;
+ }
+ v->data_dev_block_bits = __ffs(num);
+
+ if (sscanf(argv[4], "%u%c", &num, &dummy) != 1 ||
+ !num || (num & (num - 1)) ||
+ num < bdev_logical_block_size(v->hash_dev->bdev) ||
+ num > INT_MAX) {
+ ti->error = "Invalid hash device block size";
+ r = -EINVAL;
+ goto bad;
+ }
+ v->hash_dev_block_bits = __ffs(num);
+
+ if (sscanf(argv[5], "%llu%c", &num_ll, &dummy) != 1 ||
+ (sector_t)(num_ll << (v->data_dev_block_bits - SECTOR_SHIFT))
+ >> (v->data_dev_block_bits - SECTOR_SHIFT) != num_ll) {
+ ti->error = "Invalid data blocks";
+ r = -EINVAL;
+ goto bad;
+ }
+ v->data_blocks = num_ll;
+
+ if (ti->len > (v->data_blocks << (v->data_dev_block_bits - SECTOR_SHIFT))) {
+ ti->error = "Data device is too small";
+ r = -EINVAL;
+ goto bad;
+ }
+
+ if (sscanf(argv[6], "%llu%c", &num_ll, &dummy) != 1 ||
+ (sector_t)(num_ll << (v->hash_dev_block_bits - SECTOR_SHIFT))
+ >> (v->hash_dev_block_bits - SECTOR_SHIFT) != num_ll) {
+ ti->error = "Invalid hash start";
+ r = -EINVAL;
+ goto bad;
+ }
+ v->hash_start = num_ll;
+
+ v->alg_name = kstrdup(argv[7], GFP_KERNEL);
+ if (!v->alg_name) {
+ ti->error = "Cannot allocate algorithm name";
+ r = -ENOMEM;
+ goto bad;
+ }
+
+ v->tfm = crypto_alloc_shash(v->alg_name, 0, 0);
+ if (IS_ERR(v->tfm)) {
+ ti->error = "Cannot initialize hash function";
+ r = PTR_ERR(v->tfm);
+ v->tfm = NULL;
+ goto bad;
+ }
+ v->digest_size = crypto_shash_digestsize(v->tfm);
+ if ((1 << v->hash_dev_block_bits) < v->digest_size * 2) {
+ ti->error = "Digest size too big";
+ r = -EINVAL;
+ goto bad;
+ }
+ v->shash_descsize =
+ sizeof(struct shash_desc) + crypto_shash_descsize(v->tfm);
+
+ v->root_digest = kmalloc(v->digest_size, GFP_KERNEL);
+ if (!v->root_digest) {
+ ti->error = "Cannot allocate root digest";
+ r = -ENOMEM;
+ goto bad;
+ }
+ if (strlen(argv[8]) != v->digest_size * 2 ||
+ hex2bin(v->root_digest, argv[8], v->digest_size)) {
+ ti->error = "Invalid root digest";
+ r = -EINVAL;
+ goto bad;
+ }
+
+ if (strcmp(argv[9], "-")) {
+ v->salt_size = strlen(argv[9]) / 2;
+ v->salt = kmalloc(v->salt_size, GFP_KERNEL);
+ if (!v->salt) {
+ ti->error = "Cannot allocate salt";
+ r = -ENOMEM;
+ goto bad;
+ }
+ if (strlen(argv[9]) != v->salt_size * 2 ||
+ hex2bin(v->salt, argv[9], v->salt_size)) {
+ ti->error = "Invalid salt";
+ r = -EINVAL;
+ goto bad;
+ }
+ }
+
+ argv += 10;
+ argc -= 10;
+
+ /* Optional parameters */
+ if (argc) {
+ as.argc = argc;
+ as.argv = argv;
+
+ r = dm_read_arg_group(_args, &as, &opt_params, &ti->error);
+ if (r)
+ goto bad;
+
+ while (opt_params) {
+ opt_params--;
+ opt_string = dm_shift_arg(&as);
+ if (!opt_string) {
+ ti->error = "Not enough feature arguments";
+ r = -EINVAL;
+ goto bad;
+ }
+
+ if (!strcasecmp(opt_string, DM_VERITY_OPT_LOGGING))
+ v->mode = DM_VERITY_MODE_LOGGING;
+ else if (!strcasecmp(opt_string, DM_VERITY_OPT_RESTART))
+ v->mode = DM_VERITY_MODE_RESTART;
+ else {
+ ti->error = "Invalid feature arguments";
+ r = -EINVAL;
+ goto bad;
+ }
+ }
+ }
+
+ v->hash_per_block_bits =
+ __fls((1 << v->hash_dev_block_bits) / v->digest_size);
+
+ v->levels = 0;
+ if (v->data_blocks)
+ while (v->hash_per_block_bits * v->levels < 64 &&
+ (unsigned long long)(v->data_blocks - 1) >>
+ (v->hash_per_block_bits * v->levels))
+ v->levels++;
+
+ if (v->levels > DM_VERITY_MAX_LEVELS) {
+ ti->error = "Too many tree levels";
+ r = -E2BIG;
+ goto bad;
+ }
+
+ hash_position = v->hash_start;
+ for (i = v->levels - 1; i >= 0; i--) {
+ sector_t s;
+ v->hash_level_block[i] = hash_position;
+ s = (v->data_blocks + ((sector_t)1 << ((i + 1) * v->hash_per_block_bits)) - 1)
+ >> ((i + 1) * v->hash_per_block_bits);
+ if (hash_position + s < hash_position) {
+ ti->error = "Hash device offset overflow";
+ r = -E2BIG;
+ goto bad;
+ }
+ hash_position += s;
+ }
+ v->hash_blocks = hash_position;
+
+ v->bufio = dm_bufio_client_create(v->hash_dev->bdev,
+ 1 << v->hash_dev_block_bits, 1, sizeof(struct buffer_aux),
+ dm_bufio_alloc_callback, NULL);
+ if (IS_ERR(v->bufio)) {
+ ti->error = "Cannot initialize dm-bufio";
+ r = PTR_ERR(v->bufio);
+ v->bufio = NULL;
+ goto bad;
+ }
+
+ if (dm_bufio_get_device_size(v->bufio) < v->hash_blocks) {
+ ti->error = "Hash device is too small";
+ r = -E2BIG;
+ goto bad;
+ }
+
+ ti->per_bio_data_size = roundup(sizeof(struct dm_verity_io) + v->shash_descsize + v->digest_size * 2, __alignof__(struct dm_verity_io));
+
+ v->vec_mempool = mempool_create_kmalloc_pool(DM_VERITY_MEMPOOL_SIZE,
+ BIO_MAX_PAGES * sizeof(struct bio_vec));
+ if (!v->vec_mempool) {
+ ti->error = "Cannot allocate vector mempool";
+ r = -ENOMEM;
+ goto bad;
+ }
+
+ /* WQ_UNBOUND greatly improves performance when running on ramdisk */
+ v->verify_wq = alloc_workqueue("kverityd", WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM | WQ_UNBOUND, num_online_cpus());
+ if (!v->verify_wq) {
+ ti->error = "Cannot allocate workqueue";
+ r = -ENOMEM;
+ goto bad;
+ }
+
+ return 0;
+
+bad:
+ verity_dtr(ti);
+
+ return r;
+}
+
+static struct target_type verity_target = {
+ .name = "verity",
+ .version = {1, 2, 0},
+ .module = THIS_MODULE,
+ .ctr = verity_ctr,
+ .dtr = verity_dtr,
+ .map = verity_map,
+ .status = verity_status,
+ .ioctl = verity_ioctl,
+ .merge = verity_merge,
+ .iterate_devices = verity_iterate_devices,
+ .io_hints = verity_io_hints,
+};
+
+static int __init dm_verity_init(void)
+{
+ int r;
+
+ r = dm_register_target(&verity_target);
+ if (r < 0)
+ DMERR("register failed %d", r);
+
+ return r;
+}
+
+static void __exit dm_verity_exit(void)
+{
+ dm_unregister_target(&verity_target);
+}
+
+module_init(dm_verity_init);
+module_exit(dm_verity_exit);
+
+MODULE_AUTHOR("Mikulas Patocka <mpatocka@redhat.com>");
+MODULE_AUTHOR("Mandeep Baines <msb@chromium.org>");
+MODULE_AUTHOR("Will Drewry <wad@chromium.org>");
+MODULE_DESCRIPTION(DM_NAME " target for transparent disk integrity checking");
+MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm-zero.c b/drivers/md/dm-zero.c
new file mode 100644
index 000000000..b9a64bbce
--- /dev/null
+++ b/drivers/md/dm-zero.c
@@ -0,0 +1,84 @@
+/*
+ * Copyright (C) 2003 Jana Saout <jana@saout.de>
+ *
+ * This file is released under the GPL.
+ */
+
+#include <linux/device-mapper.h>
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/bio.h>
+
+#define DM_MSG_PREFIX "zero"
+
+/*
+ * Construct a dummy mapping that only returns zeros
+ */
+static int zero_ctr(struct dm_target *ti, unsigned int argc, char **argv)
+{
+ if (argc != 0) {
+ ti->error = "No arguments required";
+ return -EINVAL;
+ }
+
+ /*
+ * Silently drop discards, avoiding -EOPNOTSUPP.
+ */
+ ti->num_discard_bios = 1;
+
+ return 0;
+}
+
+/*
+ * Return zeros only on reads
+ */
+static int zero_map(struct dm_target *ti, struct bio *bio)
+{
+ switch(bio_rw(bio)) {
+ case READ:
+ zero_fill_bio(bio);
+ break;
+ case READA:
+ /* readahead of null bytes only wastes buffer cache */
+ return -EIO;
+ case WRITE:
+ /* writes get silently dropped */
+ break;
+ }
+
+ bio_endio(bio, 0);
+
+ /* accepted bio, don't make new request */
+ return DM_MAPIO_SUBMITTED;
+}
+
+static struct target_type zero_target = {
+ .name = "zero",
+ .version = {1, 1, 0},
+ .module = THIS_MODULE,
+ .ctr = zero_ctr,
+ .map = zero_map,
+};
+
+static int __init dm_zero_init(void)
+{
+ int r = dm_register_target(&zero_target);
+
+ if (r < 0)
+ DMERR("register failed %d", r);
+
+ return r;
+}
+
+static void __exit dm_zero_exit(void)
+{
+ dm_unregister_target(&zero_target);
+}
+
+module_init(dm_zero_init)
+module_exit(dm_zero_exit)
+
+MODULE_AUTHOR("Jana Saout <jana@saout.de>");
+MODULE_DESCRIPTION(DM_NAME " dummy target returning zeros");
+MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
new file mode 100644
index 000000000..2caf49289
--- /dev/null
+++ b/drivers/md/dm.c
@@ -0,0 +1,3650 @@
+/*
+ * Copyright (C) 2001, 2002 Sistina Software (UK) Limited.
+ * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm.h"
+#include "dm-uevent.h"
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/moduleparam.h>
+#include <linux/blkpg.h>
+#include <linux/bio.h>
+#include <linux/mempool.h>
+#include <linux/slab.h>
+#include <linux/idr.h>
+#include <linux/hdreg.h>
+#include <linux/delay.h>
+#include <linux/wait.h>
+#include <linux/kthread.h>
+#include <linux/ktime.h>
+#include <linux/elevator.h> /* for rq_end_sector() */
+#include <linux/blk-mq.h>
+
+#include <trace/events/block.h>
+
+#define DM_MSG_PREFIX "core"
+
+#ifdef CONFIG_PRINTK
+/*
+ * ratelimit state to be used in DMXXX_LIMIT().
+ */
+DEFINE_RATELIMIT_STATE(dm_ratelimit_state,
+ DEFAULT_RATELIMIT_INTERVAL,
+ DEFAULT_RATELIMIT_BURST);
+EXPORT_SYMBOL(dm_ratelimit_state);
+#endif
+
+/*
+ * Cookies are numeric values sent with CHANGE and REMOVE
+ * uevents while resuming, removing or renaming the device.
+ */
+#define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE"
+#define DM_COOKIE_LENGTH 24
+
+static const char *_name = DM_NAME;
+
+static unsigned int major = 0;
+static unsigned int _major = 0;
+
+static DEFINE_IDR(_minor_idr);
+
+static DEFINE_SPINLOCK(_minor_lock);
+
+static void do_deferred_remove(struct work_struct *w);
+
+static DECLARE_WORK(deferred_remove_work, do_deferred_remove);
+
+static struct workqueue_struct *deferred_remove_workqueue;
+
+/*
+ * For bio-based dm.
+ * One of these is allocated per bio.
+ */
+struct dm_io {
+ struct mapped_device *md;
+ int error;
+ atomic_t io_count;
+ struct bio *bio;
+ unsigned long start_time;
+ spinlock_t endio_lock;
+ struct dm_stats_aux stats_aux;
+};
+
+/*
+ * For request-based dm.
+ * One of these is allocated per request.
+ */
+struct dm_rq_target_io {
+ struct mapped_device *md;
+ struct dm_target *ti;
+ struct request *orig, *clone;
+ struct kthread_work work;
+ int error;
+ union map_info info;
+};
+
+/*
+ * For request-based dm - the bio clones we allocate are embedded in these
+ * structs.
+ *
+ * We allocate these with bio_alloc_bioset, using the front_pad parameter when
+ * the bioset is created - this means the bio has to come at the end of the
+ * struct.
+ */
+struct dm_rq_clone_bio_info {
+ struct bio *orig;
+ struct dm_rq_target_io *tio;
+ struct bio clone;
+};
+
+union map_info *dm_get_rq_mapinfo(struct request *rq)
+{
+ if (rq && rq->end_io_data)
+ return &((struct dm_rq_target_io *)rq->end_io_data)->info;
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo);
+
+#define MINOR_ALLOCED ((void *)-1)
+
+/*
+ * Bits for the md->flags field.
+ */
+#define DMF_BLOCK_IO_FOR_SUSPEND 0
+#define DMF_SUSPENDED 1
+#define DMF_FROZEN 2
+#define DMF_FREEING 3
+#define DMF_DELETING 4
+#define DMF_NOFLUSH_SUSPENDING 5
+#define DMF_MERGE_IS_OPTIONAL 6
+#define DMF_DEFERRED_REMOVE 7
+#define DMF_SUSPENDED_INTERNALLY 8
+
+/*
+ * A dummy definition to make RCU happy.
+ * struct dm_table should never be dereferenced in this file.
+ */
+struct dm_table {
+ int undefined__;
+};
+
+/*
+ * Work processed by per-device workqueue.
+ */
+struct mapped_device {
+ struct srcu_struct io_barrier;
+ struct mutex suspend_lock;
+ atomic_t holders;
+ atomic_t open_count;
+
+ /*
+ * The current mapping.
+ * Use dm_get_live_table{_fast} or take suspend_lock for
+ * dereference.
+ */
+ struct dm_table __rcu *map;
+
+ struct list_head table_devices;
+ struct mutex table_devices_lock;
+
+ unsigned long flags;
+
+ struct request_queue *queue;
+ unsigned type;
+ /* Protect queue and type against concurrent access. */
+ struct mutex type_lock;
+
+ struct target_type *immutable_target_type;
+
+ struct gendisk *disk;
+ char name[16];
+
+ void *interface_ptr;
+
+ /*
+ * A list of ios that arrived while we were suspended.
+ */
+ atomic_t pending[2];
+ wait_queue_head_t wait;
+ struct work_struct work;
+ struct bio_list deferred;
+ spinlock_t deferred_lock;
+
+ /*
+ * Processing queue (flush)
+ */
+ struct workqueue_struct *wq;
+
+ /*
+ * io objects are allocated from here.
+ */
+ mempool_t *io_pool;
+ mempool_t *rq_pool;
+
+ struct bio_set *bs;
+
+ /*
+ * Event handling.
+ */
+ atomic_t event_nr;
+ wait_queue_head_t eventq;
+ atomic_t uevent_seq;
+ struct list_head uevent_list;
+ spinlock_t uevent_lock; /* Protect access to uevent_list */
+
+ /*
+ * freeze/thaw support require holding onto a super block
+ */
+ struct super_block *frozen_sb;
+ struct block_device *bdev;
+
+ /* forced geometry settings */
+ struct hd_geometry geometry;
+
+ /* kobject and completion */
+ struct dm_kobject_holder kobj_holder;
+
+ /* zero-length flush that will be cloned and submitted to targets */
+ struct bio flush_bio;
+
+ /* the number of internal suspends */
+ unsigned internal_suspend_count;
+
+ struct dm_stats stats;
+
+ struct kthread_worker kworker;
+ struct task_struct *kworker_task;
+
+ /* for request-based merge heuristic in dm_request_fn() */
+ unsigned seq_rq_merge_deadline_usecs;
+ int last_rq_rw;
+ sector_t last_rq_pos;
+ ktime_t last_rq_start_time;
+
+ /* for blk-mq request-based DM support */
+ struct blk_mq_tag_set tag_set;
+ bool use_blk_mq;
+};
+
+#ifdef CONFIG_DM_MQ_DEFAULT
+static bool use_blk_mq = true;
+#else
+static bool use_blk_mq = false;
+#endif
+
+bool dm_use_blk_mq(struct mapped_device *md)
+{
+ return md->use_blk_mq;
+}
+
+/*
+ * For mempools pre-allocation at the table loading time.
+ */
+struct dm_md_mempools {
+ mempool_t *io_pool;
+ mempool_t *rq_pool;
+ struct bio_set *bs;
+};
+
+struct table_device {
+ struct list_head list;
+ atomic_t count;
+ struct dm_dev dm_dev;
+};
+
+#define RESERVED_BIO_BASED_IOS 16
+#define RESERVED_REQUEST_BASED_IOS 256
+#define RESERVED_MAX_IOS 1024
+static struct kmem_cache *_io_cache;
+static struct kmem_cache *_rq_tio_cache;
+static struct kmem_cache *_rq_cache;
+
+/*
+ * Bio-based DM's mempools' reserved IOs set by the user.
+ */
+static unsigned reserved_bio_based_ios = RESERVED_BIO_BASED_IOS;
+
+/*
+ * Request-based DM's mempools' reserved IOs set by the user.
+ */
+static unsigned reserved_rq_based_ios = RESERVED_REQUEST_BASED_IOS;
+
+static unsigned __dm_get_module_param(unsigned *module_param,
+ unsigned def, unsigned max)
+{
+ unsigned param = ACCESS_ONCE(*module_param);
+ unsigned modified_param = 0;
+
+ if (!param)
+ modified_param = def;
+ else if (param > max)
+ modified_param = max;
+
+ if (modified_param) {
+ (void)cmpxchg(module_param, param, modified_param);
+ param = modified_param;
+ }
+
+ return param;
+}
+
+unsigned dm_get_reserved_bio_based_ios(void)
+{
+ return __dm_get_module_param(&reserved_bio_based_ios,
+ RESERVED_BIO_BASED_IOS, RESERVED_MAX_IOS);
+}
+EXPORT_SYMBOL_GPL(dm_get_reserved_bio_based_ios);
+
+unsigned dm_get_reserved_rq_based_ios(void)
+{
+ return __dm_get_module_param(&reserved_rq_based_ios,
+ RESERVED_REQUEST_BASED_IOS, RESERVED_MAX_IOS);
+}
+EXPORT_SYMBOL_GPL(dm_get_reserved_rq_based_ios);
+
+static int __init local_init(void)
+{
+ int r = -ENOMEM;
+
+ /* allocate a slab for the dm_ios */
+ _io_cache = KMEM_CACHE(dm_io, 0);
+ if (!_io_cache)
+ return r;
+
+ _rq_tio_cache = KMEM_CACHE(dm_rq_target_io, 0);
+ if (!_rq_tio_cache)
+ goto out_free_io_cache;
+
+ _rq_cache = kmem_cache_create("dm_clone_request", sizeof(struct request),
+ __alignof__(struct request), 0, NULL);
+ if (!_rq_cache)
+ goto out_free_rq_tio_cache;
+
+ r = dm_uevent_init();
+ if (r)
+ goto out_free_rq_cache;
+
+ deferred_remove_workqueue = alloc_workqueue("kdmremove", WQ_UNBOUND, 1);
+ if (!deferred_remove_workqueue) {
+ r = -ENOMEM;
+ goto out_uevent_exit;
+ }
+
+ _major = major;
+ r = register_blkdev(_major, _name);
+ if (r < 0)
+ goto out_free_workqueue;
+
+ if (!_major)
+ _major = r;
+
+ return 0;
+
+out_free_workqueue:
+ destroy_workqueue(deferred_remove_workqueue);
+out_uevent_exit:
+ dm_uevent_exit();
+out_free_rq_cache:
+ kmem_cache_destroy(_rq_cache);
+out_free_rq_tio_cache:
+ kmem_cache_destroy(_rq_tio_cache);
+out_free_io_cache:
+ kmem_cache_destroy(_io_cache);
+
+ return r;
+}
+
+static void local_exit(void)
+{
+ flush_scheduled_work();
+ destroy_workqueue(deferred_remove_workqueue);
+
+ kmem_cache_destroy(_rq_cache);
+ kmem_cache_destroy(_rq_tio_cache);
+ kmem_cache_destroy(_io_cache);
+ unregister_blkdev(_major, _name);
+ dm_uevent_exit();
+
+ _major = 0;
+
+ DMINFO("cleaned up");
+}
+
+static int (*_inits[])(void) __initdata = {
+ local_init,
+ dm_target_init,
+ dm_linear_init,
+ dm_stripe_init,
+ dm_io_init,
+ dm_kcopyd_init,
+ dm_interface_init,
+ dm_statistics_init,
+};
+
+static void (*_exits[])(void) = {
+ local_exit,
+ dm_target_exit,
+ dm_linear_exit,
+ dm_stripe_exit,
+ dm_io_exit,
+ dm_kcopyd_exit,
+ dm_interface_exit,
+ dm_statistics_exit,
+};
+
+static int __init dm_init(void)
+{
+ const int count = ARRAY_SIZE(_inits);
+
+ int r, i;
+
+ for (i = 0; i < count; i++) {
+ r = _inits[i]();
+ if (r)
+ goto bad;
+ }
+
+ return 0;
+
+ bad:
+ while (i--)
+ _exits[i]();
+
+ return r;
+}
+
+static void __exit dm_exit(void)
+{
+ int i = ARRAY_SIZE(_exits);
+
+ while (i--)
+ _exits[i]();
+
+ /*
+ * Should be empty by this point.
+ */
+ idr_destroy(&_minor_idr);
+}
+
+/*
+ * Block device functions
+ */
+int dm_deleting_md(struct mapped_device *md)
+{
+ return test_bit(DMF_DELETING, &md->flags);
+}
+
+static int dm_blk_open(struct block_device *bdev, fmode_t mode)
+{
+ struct mapped_device *md;
+
+ spin_lock(&_minor_lock);
+
+ md = bdev->bd_disk->private_data;
+ if (!md)
+ goto out;
+
+ if (test_bit(DMF_FREEING, &md->flags) ||
+ dm_deleting_md(md)) {
+ md = NULL;
+ goto out;
+ }
+
+ dm_get(md);
+ atomic_inc(&md->open_count);
+out:
+ spin_unlock(&_minor_lock);
+
+ return md ? 0 : -ENXIO;
+}
+
+static void dm_blk_close(struct gendisk *disk, fmode_t mode)
+{
+ struct mapped_device *md;
+
+ spin_lock(&_minor_lock);
+
+ md = disk->private_data;
+ if (WARN_ON(!md))
+ goto out;
+
+ if (atomic_dec_and_test(&md->open_count) &&
+ (test_bit(DMF_DEFERRED_REMOVE, &md->flags)))
+ queue_work(deferred_remove_workqueue, &deferred_remove_work);
+
+ dm_put(md);
+out:
+ spin_unlock(&_minor_lock);
+}
+
+int dm_open_count(struct mapped_device *md)
+{
+ return atomic_read(&md->open_count);
+}
+
+/*
+ * Guarantees nothing is using the device before it's deleted.
+ */
+int dm_lock_for_deletion(struct mapped_device *md, bool mark_deferred, bool only_deferred)
+{
+ int r = 0;
+
+ spin_lock(&_minor_lock);
+
+ if (dm_open_count(md)) {
+ r = -EBUSY;
+ if (mark_deferred)
+ set_bit(DMF_DEFERRED_REMOVE, &md->flags);
+ } else if (only_deferred && !test_bit(DMF_DEFERRED_REMOVE, &md->flags))
+ r = -EEXIST;
+ else
+ set_bit(DMF_DELETING, &md->flags);
+
+ spin_unlock(&_minor_lock);
+
+ return r;
+}
+
+int dm_cancel_deferred_remove(struct mapped_device *md)
+{
+ int r = 0;
+
+ spin_lock(&_minor_lock);
+
+ if (test_bit(DMF_DELETING, &md->flags))
+ r = -EBUSY;
+ else
+ clear_bit(DMF_DEFERRED_REMOVE, &md->flags);
+
+ spin_unlock(&_minor_lock);
+
+ return r;
+}
+
+static void do_deferred_remove(struct work_struct *w)
+{
+ dm_deferred_remove();
+}
+
+sector_t dm_get_size(struct mapped_device *md)
+{
+ return get_capacity(md->disk);
+}
+
+struct request_queue *dm_get_md_queue(struct mapped_device *md)
+{
+ return md->queue;
+}
+
+struct dm_stats *dm_get_stats(struct mapped_device *md)
+{
+ return &md->stats;
+}
+
+static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+{
+ struct mapped_device *md = bdev->bd_disk->private_data;
+
+ return dm_get_geometry(md, geo);
+}
+
+static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode,
+ unsigned int cmd, unsigned long arg)
+{
+ struct mapped_device *md = bdev->bd_disk->private_data;
+ int srcu_idx;
+ struct dm_table *map;
+ struct dm_target *tgt;
+ int r = -ENOTTY;
+
+retry:
+ map = dm_get_live_table(md, &srcu_idx);
+
+ if (!map || !dm_table_get_size(map))
+ goto out;
+
+ /* We only support devices that have a single target */
+ if (dm_table_get_num_targets(map) != 1)
+ goto out;
+
+ tgt = dm_table_get_target(map, 0);
+ if (!tgt->type->ioctl)
+ goto out;
+
+ if (dm_suspended_md(md)) {
+ r = -EAGAIN;
+ goto out;
+ }
+
+ r = tgt->type->ioctl(tgt, cmd, arg);
+
+out:
+ dm_put_live_table(md, srcu_idx);
+
+ if (r == -ENOTCONN) {
+ msleep(10);
+ goto retry;
+ }
+
+ return r;
+}
+
+static struct dm_io *alloc_io(struct mapped_device *md)
+{
+ return mempool_alloc(md->io_pool, GFP_NOIO);
+}
+
+static void free_io(struct mapped_device *md, struct dm_io *io)
+{
+ mempool_free(io, md->io_pool);
+}
+
+static void free_tio(struct mapped_device *md, struct dm_target_io *tio)
+{
+ bio_put(&tio->clone);
+}
+
+static struct dm_rq_target_io *alloc_rq_tio(struct mapped_device *md,
+ gfp_t gfp_mask)
+{
+ return mempool_alloc(md->io_pool, gfp_mask);
+}
+
+static void free_rq_tio(struct dm_rq_target_io *tio)
+{
+ mempool_free(tio, tio->md->io_pool);
+}
+
+static struct request *alloc_clone_request(struct mapped_device *md,
+ gfp_t gfp_mask)
+{
+ return mempool_alloc(md->rq_pool, gfp_mask);
+}
+
+static void free_clone_request(struct mapped_device *md, struct request *rq)
+{
+ mempool_free(rq, md->rq_pool);
+}
+
+static int md_in_flight(struct mapped_device *md)
+{
+ return atomic_read(&md->pending[READ]) +
+ atomic_read(&md->pending[WRITE]);
+}
+
+static void start_io_acct(struct dm_io *io)
+{
+ struct mapped_device *md = io->md;
+ struct bio *bio = io->bio;
+ int cpu;
+ int rw = bio_data_dir(bio);
+
+ io->start_time = jiffies;
+
+ cpu = part_stat_lock();
+ part_round_stats(cpu, &dm_disk(md)->part0);
+ part_stat_unlock();
+ atomic_set(&dm_disk(md)->part0.in_flight[rw],
+ atomic_inc_return(&md->pending[rw]));
+
+ if (unlikely(dm_stats_used(&md->stats)))
+ dm_stats_account_io(&md->stats, bio->bi_rw, bio->bi_iter.bi_sector,
+ bio_sectors(bio), false, 0, &io->stats_aux);
+}
+
+static void end_io_acct(struct dm_io *io)
+{
+ struct mapped_device *md = io->md;
+ struct bio *bio = io->bio;
+ unsigned long duration = jiffies - io->start_time;
+ int pending;
+ int rw = bio_data_dir(bio);
+
+ generic_end_io_acct(rw, &dm_disk(md)->part0, io->start_time);
+
+ if (unlikely(dm_stats_used(&md->stats)))
+ dm_stats_account_io(&md->stats, bio->bi_rw, bio->bi_iter.bi_sector,
+ bio_sectors(bio), true, duration, &io->stats_aux);
+
+ /*
+ * After this is decremented the bio must not be touched if it is
+ * a flush.
+ */
+ pending = atomic_dec_return(&md->pending[rw]);
+ atomic_set(&dm_disk(md)->part0.in_flight[rw], pending);
+ pending += atomic_read(&md->pending[rw^0x1]);
+
+ /* nudge anyone waiting on suspend queue */
+ if (!pending)
+ wake_up(&md->wait);
+}
+
+/*
+ * Add the bio to the list of deferred io.
+ */
+static void queue_io(struct mapped_device *md, struct bio *bio)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&md->deferred_lock, flags);
+ bio_list_add(&md->deferred, bio);
+ spin_unlock_irqrestore(&md->deferred_lock, flags);
+ queue_work(md->wq, &md->work);
+}
+
+/*
+ * Everyone (including functions in this file), should use this
+ * function to access the md->map field, and make sure they call
+ * dm_put_live_table() when finished.
+ */
+struct dm_table *dm_get_live_table(struct mapped_device *md, int *srcu_idx) __acquires(md->io_barrier)
+{
+ *srcu_idx = srcu_read_lock(&md->io_barrier);
+
+ return srcu_dereference(md->map, &md->io_barrier);
+}
+
+void dm_put_live_table(struct mapped_device *md, int srcu_idx) __releases(md->io_barrier)
+{
+ srcu_read_unlock(&md->io_barrier, srcu_idx);
+}
+
+void dm_sync_table(struct mapped_device *md)
+{
+ synchronize_srcu(&md->io_barrier);
+ synchronize_rcu_expedited();
+}
+
+/*
+ * A fast alternative to dm_get_live_table/dm_put_live_table.
+ * The caller must not block between these two functions.
+ */
+static struct dm_table *dm_get_live_table_fast(struct mapped_device *md) __acquires(RCU)
+{
+ rcu_read_lock();
+ return rcu_dereference(md->map);
+}
+
+static void dm_put_live_table_fast(struct mapped_device *md) __releases(RCU)
+{
+ rcu_read_unlock();
+}
+
+/*
+ * Open a table device so we can use it as a map destination.
+ */
+static int open_table_device(struct table_device *td, dev_t dev,
+ struct mapped_device *md)
+{
+ static char *_claim_ptr = "I belong to device-mapper";
+ struct block_device *bdev;
+
+ int r;
+
+ BUG_ON(td->dm_dev.bdev);
+
+ bdev = blkdev_get_by_dev(dev, td->dm_dev.mode | FMODE_EXCL, _claim_ptr);
+ if (IS_ERR(bdev))
+ return PTR_ERR(bdev);
+
+ r = bd_link_disk_holder(bdev, dm_disk(md));
+ if (r) {
+ blkdev_put(bdev, td->dm_dev.mode | FMODE_EXCL);
+ return r;
+ }
+
+ td->dm_dev.bdev = bdev;
+ return 0;
+}
+
+/*
+ * Close a table device that we've been using.
+ */
+static void close_table_device(struct table_device *td, struct mapped_device *md)
+{
+ if (!td->dm_dev.bdev)
+ return;
+
+ bd_unlink_disk_holder(td->dm_dev.bdev, dm_disk(md));
+ blkdev_put(td->dm_dev.bdev, td->dm_dev.mode | FMODE_EXCL);
+ td->dm_dev.bdev = NULL;
+}
+
+static struct table_device *find_table_device(struct list_head *l, dev_t dev,
+ fmode_t mode) {
+ struct table_device *td;
+
+ list_for_each_entry(td, l, list)
+ if (td->dm_dev.bdev->bd_dev == dev && td->dm_dev.mode == mode)
+ return td;
+
+ return NULL;
+}
+
+int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode,
+ struct dm_dev **result) {
+ int r;
+ struct table_device *td;
+
+ mutex_lock(&md->table_devices_lock);
+ td = find_table_device(&md->table_devices, dev, mode);
+ if (!td) {
+ td = kmalloc(sizeof(*td), GFP_KERNEL);
+ if (!td) {
+ mutex_unlock(&md->table_devices_lock);
+ return -ENOMEM;
+ }
+
+ td->dm_dev.mode = mode;
+ td->dm_dev.bdev = NULL;
+
+ if ((r = open_table_device(td, dev, md))) {
+ mutex_unlock(&md->table_devices_lock);
+ kfree(td);
+ return r;
+ }
+
+ format_dev_t(td->dm_dev.name, dev);
+
+ atomic_set(&td->count, 0);
+ list_add(&td->list, &md->table_devices);
+ }
+ atomic_inc(&td->count);
+ mutex_unlock(&md->table_devices_lock);
+
+ *result = &td->dm_dev;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(dm_get_table_device);
+
+void dm_put_table_device(struct mapped_device *md, struct dm_dev *d)
+{
+ struct table_device *td = container_of(d, struct table_device, dm_dev);
+
+ mutex_lock(&md->table_devices_lock);
+ if (atomic_dec_and_test(&td->count)) {
+ close_table_device(td, md);
+ list_del(&td->list);
+ kfree(td);
+ }
+ mutex_unlock(&md->table_devices_lock);
+}
+EXPORT_SYMBOL(dm_put_table_device);
+
+static void free_table_devices(struct list_head *devices)
+{
+ struct list_head *tmp, *next;
+
+ list_for_each_safe(tmp, next, devices) {
+ struct table_device *td = list_entry(tmp, struct table_device, list);
+
+ DMWARN("dm_destroy: %s still exists with %d references",
+ td->dm_dev.name, atomic_read(&td->count));
+ kfree(td);
+ }
+}
+
+/*
+ * Get the geometry associated with a dm device
+ */
+int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo)
+{
+ *geo = md->geometry;
+
+ return 0;
+}
+
+/*
+ * Set the geometry of a device.
+ */
+int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo)
+{
+ sector_t sz = (sector_t)geo->cylinders * geo->heads * geo->sectors;
+
+ if (geo->start > sz) {
+ DMWARN("Start sector is beyond the geometry limits.");
+ return -EINVAL;
+ }
+
+ md->geometry = *geo;
+
+ return 0;
+}
+
+/*-----------------------------------------------------------------
+ * CRUD START:
+ * A more elegant soln is in the works that uses the queue
+ * merge fn, unfortunately there are a couple of changes to
+ * the block layer that I want to make for this. So in the
+ * interests of getting something for people to use I give
+ * you this clearly demarcated crap.
+ *---------------------------------------------------------------*/
+
+static int __noflush_suspending(struct mapped_device *md)
+{
+ return test_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
+}
+
+/*
+ * Decrements the number of outstanding ios that a bio has been
+ * cloned into, completing the original io if necc.
+ */
+static void dec_pending(struct dm_io *io, int error)
+{
+ unsigned long flags;
+ int io_error;
+ struct bio *bio;
+ struct mapped_device *md = io->md;
+
+ /* Push-back supersedes any I/O errors */
+ if (unlikely(error)) {
+ spin_lock_irqsave(&io->endio_lock, flags);
+ if (!(io->error > 0 && __noflush_suspending(md)))
+ io->error = error;
+ spin_unlock_irqrestore(&io->endio_lock, flags);
+ }
+
+ if (atomic_dec_and_test(&io->io_count)) {
+ if (io->error == DM_ENDIO_REQUEUE) {
+ /*
+ * Target requested pushing back the I/O.
+ */
+ spin_lock_irqsave(&md->deferred_lock, flags);
+ if (__noflush_suspending(md))
+ bio_list_add_head(&md->deferred, io->bio);
+ else
+ /* noflush suspend was interrupted. */
+ io->error = -EIO;
+ spin_unlock_irqrestore(&md->deferred_lock, flags);
+ }
+
+ io_error = io->error;
+ bio = io->bio;
+ end_io_acct(io);
+ free_io(md, io);
+
+ if (io_error == DM_ENDIO_REQUEUE)
+ return;
+
+ if ((bio->bi_rw & REQ_FLUSH) && bio->bi_iter.bi_size) {
+ /*
+ * Preflush done for flush with data, reissue
+ * without REQ_FLUSH.
+ */
+ bio->bi_rw &= ~REQ_FLUSH;
+ queue_io(md, bio);
+ } else {
+ /* done with normal IO or empty flush */
+ trace_block_bio_complete(md->queue, bio, io_error);
+ bio_endio(bio, io_error);
+ }
+ }
+}
+
+static void disable_write_same(struct mapped_device *md)
+{
+ struct queue_limits *limits = dm_get_queue_limits(md);
+
+ /* device doesn't really support WRITE SAME, disable it */
+ limits->max_write_same_sectors = 0;
+}
+
+static void clone_endio(struct bio *bio, int error)
+{
+ int r = error;
+ struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
+ struct dm_io *io = tio->io;
+ struct mapped_device *md = tio->io->md;
+ dm_endio_fn endio = tio->ti->type->end_io;
+
+ if (!bio_flagged(bio, BIO_UPTODATE) && !error)
+ error = -EIO;
+
+ if (endio) {
+ r = endio(tio->ti, bio, error);
+ if (r < 0 || r == DM_ENDIO_REQUEUE)
+ /*
+ * error and requeue request are handled
+ * in dec_pending().
+ */
+ error = r;
+ else if (r == DM_ENDIO_INCOMPLETE)
+ /* The target will handle the io */
+ return;
+ else if (r) {
+ DMWARN("unimplemented target endio return value: %d", r);
+ BUG();
+ }
+ }
+
+ if (unlikely(r == -EREMOTEIO && (bio->bi_rw & REQ_WRITE_SAME) &&
+ !bdev_get_queue(bio->bi_bdev)->limits.max_write_same_sectors))
+ disable_write_same(md);
+
+ free_tio(md, tio);
+ dec_pending(io, error);
+}
+
+/*
+ * Partial completion handling for request-based dm
+ */
+static void end_clone_bio(struct bio *clone, int error)
+{
+ struct dm_rq_clone_bio_info *info =
+ container_of(clone, struct dm_rq_clone_bio_info, clone);
+ struct dm_rq_target_io *tio = info->tio;
+ struct bio *bio = info->orig;
+ unsigned int nr_bytes = info->orig->bi_iter.bi_size;
+
+ bio_put(clone);
+
+ if (tio->error)
+ /*
+ * An error has already been detected on the request.
+ * Once error occurred, just let clone->end_io() handle
+ * the remainder.
+ */
+ return;
+ else if (error) {
+ /*
+ * Don't notice the error to the upper layer yet.
+ * The error handling decision is made by the target driver,
+ * when the request is completed.
+ */
+ tio->error = error;
+ return;
+ }
+
+ /*
+ * I/O for the bio successfully completed.
+ * Notice the data completion to the upper layer.
+ */
+
+ /*
+ * bios are processed from the head of the list.
+ * So the completing bio should always be rq->bio.
+ * If it's not, something wrong is happening.
+ */
+ if (tio->orig->bio != bio)
+ DMERR("bio completion is going in the middle of the request");
+
+ /*
+ * Update the original request.
+ * Do not use blk_end_request() here, because it may complete
+ * the original request before the clone, and break the ordering.
+ */
+ blk_update_request(tio->orig, 0, nr_bytes);
+}
+
+static struct dm_rq_target_io *tio_from_request(struct request *rq)
+{
+ return (rq->q->mq_ops ? blk_mq_rq_to_pdu(rq) : rq->special);
+}
+
+/*
+ * Don't touch any member of the md after calling this function because
+ * the md may be freed in dm_put() at the end of this function.
+ * Or do dm_get() before calling this function and dm_put() later.
+ */
+static void rq_completed(struct mapped_device *md, int rw, bool run_queue)
+{
+ int nr_requests_pending;
+
+ atomic_dec(&md->pending[rw]);
+
+ /* nudge anyone waiting on suspend queue */
+ nr_requests_pending = md_in_flight(md);
+ if (!nr_requests_pending)
+ wake_up(&md->wait);
+
+ /*
+ * Run this off this callpath, as drivers could invoke end_io while
+ * inside their request_fn (and holding the queue lock). Calling
+ * back into ->request_fn() could deadlock attempting to grab the
+ * queue lock again.
+ */
+ if (run_queue) {
+ if (md->queue->mq_ops)
+ blk_mq_run_hw_queues(md->queue, true);
+ else if (!nr_requests_pending ||
+ (nr_requests_pending >= md->queue->nr_congestion_on))
+ blk_run_queue_async(md->queue);
+ }
+
+ /*
+ * dm_put() must be at the end of this function. See the comment above
+ */
+ dm_put(md);
+}
+
+static void free_rq_clone(struct request *clone)
+{
+ struct dm_rq_target_io *tio = clone->end_io_data;
+ struct mapped_device *md = tio->md;
+
+ blk_rq_unprep_clone(clone);
+
+ if (md->type == DM_TYPE_MQ_REQUEST_BASED)
+ /* stacked on blk-mq queue(s) */
+ tio->ti->type->release_clone_rq(clone);
+ else if (!md->queue->mq_ops)
+ /* request_fn queue stacked on request_fn queue(s) */
+ free_clone_request(md, clone);
+ /*
+ * NOTE: for the blk-mq queue stacked on request_fn queue(s) case:
+ * no need to call free_clone_request() because we leverage blk-mq by
+ * allocating the clone at the end of the blk-mq pdu (see: clone_rq)
+ */
+
+ if (!md->queue->mq_ops)
+ free_rq_tio(tio);
+}
+
+/*
+ * Complete the clone and the original request.
+ * Must be called without clone's queue lock held,
+ * see end_clone_request() for more details.
+ */
+static void dm_end_request(struct request *clone, int error)
+{
+ int rw = rq_data_dir(clone);
+ struct dm_rq_target_io *tio = clone->end_io_data;
+ struct mapped_device *md = tio->md;
+ struct request *rq = tio->orig;
+
+ if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
+ rq->errors = clone->errors;
+ rq->resid_len = clone->resid_len;
+
+ if (rq->sense)
+ /*
+ * We are using the sense buffer of the original
+ * request.
+ * So setting the length of the sense data is enough.
+ */
+ rq->sense_len = clone->sense_len;
+ }
+
+ free_rq_clone(clone);
+ if (!rq->q->mq_ops)
+ blk_end_request_all(rq, error);
+ else
+ blk_mq_end_request(rq, error);
+ rq_completed(md, rw, true);
+}
+
+static void dm_unprep_request(struct request *rq)
+{
+ struct dm_rq_target_io *tio = tio_from_request(rq);
+ struct request *clone = tio->clone;
+
+ if (!rq->q->mq_ops) {
+ rq->special = NULL;
+ rq->cmd_flags &= ~REQ_DONTPREP;
+ }
+
+ if (clone)
+ free_rq_clone(clone);
+}
+
+/*
+ * Requeue the original request of a clone.
+ */
+static void old_requeue_request(struct request *rq)
+{
+ struct request_queue *q = rq->q;
+ unsigned long flags;
+
+ spin_lock_irqsave(q->queue_lock, flags);
+ blk_requeue_request(q, rq);
+ blk_run_queue_async(q);
+ spin_unlock_irqrestore(q->queue_lock, flags);
+}
+
+static void dm_requeue_unmapped_original_request(struct mapped_device *md,
+ struct request *rq)
+{
+ int rw = rq_data_dir(rq);
+
+ dm_unprep_request(rq);
+
+ if (!rq->q->mq_ops)
+ old_requeue_request(rq);
+ else {
+ blk_mq_requeue_request(rq);
+ blk_mq_kick_requeue_list(rq->q);
+ }
+
+ rq_completed(md, rw, false);
+}
+
+static void dm_requeue_unmapped_request(struct request *clone)
+{
+ struct dm_rq_target_io *tio = clone->end_io_data;
+
+ dm_requeue_unmapped_original_request(tio->md, tio->orig);
+}
+
+static void old_stop_queue(struct request_queue *q)
+{
+ unsigned long flags;
+
+ if (blk_queue_stopped(q))
+ return;
+
+ spin_lock_irqsave(q->queue_lock, flags);
+ blk_stop_queue(q);
+ spin_unlock_irqrestore(q->queue_lock, flags);
+}
+
+static void stop_queue(struct request_queue *q)
+{
+ if (!q->mq_ops)
+ old_stop_queue(q);
+ else
+ blk_mq_stop_hw_queues(q);
+}
+
+static void old_start_queue(struct request_queue *q)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(q->queue_lock, flags);
+ if (blk_queue_stopped(q))
+ blk_start_queue(q);
+ spin_unlock_irqrestore(q->queue_lock, flags);
+}
+
+static void start_queue(struct request_queue *q)
+{
+ if (!q->mq_ops)
+ old_start_queue(q);
+ else
+ blk_mq_start_stopped_hw_queues(q, true);
+}
+
+static void dm_done(struct request *clone, int error, bool mapped)
+{
+ int r = error;
+ struct dm_rq_target_io *tio = clone->end_io_data;
+ dm_request_endio_fn rq_end_io = NULL;
+
+ if (tio->ti) {
+ rq_end_io = tio->ti->type->rq_end_io;
+
+ if (mapped && rq_end_io)
+ r = rq_end_io(tio->ti, clone, error, &tio->info);
+ }
+
+ if (unlikely(r == -EREMOTEIO && (clone->cmd_flags & REQ_WRITE_SAME) &&
+ !clone->q->limits.max_write_same_sectors))
+ disable_write_same(tio->md);
+
+ if (r <= 0)
+ /* The target wants to complete the I/O */
+ dm_end_request(clone, r);
+ else if (r == DM_ENDIO_INCOMPLETE)
+ /* The target will handle the I/O */
+ return;
+ else if (r == DM_ENDIO_REQUEUE)
+ /* The target wants to requeue the I/O */
+ dm_requeue_unmapped_request(clone);
+ else {
+ DMWARN("unimplemented target endio return value: %d", r);
+ BUG();
+ }
+}
+
+/*
+ * Request completion handler for request-based dm
+ */
+static void dm_softirq_done(struct request *rq)
+{
+ bool mapped = true;
+ struct dm_rq_target_io *tio = tio_from_request(rq);
+ struct request *clone = tio->clone;
+ int rw;
+
+ if (!clone) {
+ rw = rq_data_dir(rq);
+ if (!rq->q->mq_ops) {
+ blk_end_request_all(rq, tio->error);
+ rq_completed(tio->md, rw, false);
+ free_rq_tio(tio);
+ } else {
+ blk_mq_end_request(rq, tio->error);
+ rq_completed(tio->md, rw, false);
+ }
+ return;
+ }
+
+ if (rq->cmd_flags & REQ_FAILED)
+ mapped = false;
+
+ dm_done(clone, tio->error, mapped);
+}
+
+/*
+ * Complete the clone and the original request with the error status
+ * through softirq context.
+ */
+static void dm_complete_request(struct request *rq, int error)
+{
+ struct dm_rq_target_io *tio = tio_from_request(rq);
+
+ tio->error = error;
+ blk_complete_request(rq);
+}
+
+/*
+ * Complete the not-mapped clone and the original request with the error status
+ * through softirq context.
+ * Target's rq_end_io() function isn't called.
+ * This may be used when the target's map_rq() or clone_and_map_rq() functions fail.
+ */
+static void dm_kill_unmapped_request(struct request *rq, int error)
+{
+ rq->cmd_flags |= REQ_FAILED;
+ dm_complete_request(rq, error);
+}
+
+/*
+ * Called with the clone's queue lock held (for non-blk-mq)
+ */
+static void end_clone_request(struct request *clone, int error)
+{
+ struct dm_rq_target_io *tio = clone->end_io_data;
+
+ if (!clone->q->mq_ops) {
+ /*
+ * For just cleaning up the information of the queue in which
+ * the clone was dispatched.
+ * The clone is *NOT* freed actually here because it is alloced
+ * from dm own mempool (REQ_ALLOCED isn't set).
+ */
+ __blk_put_request(clone->q, clone);
+ }
+
+ /*
+ * Actual request completion is done in a softirq context which doesn't
+ * hold the clone's queue lock. Otherwise, deadlock could occur because:
+ * - another request may be submitted by the upper level driver
+ * of the stacking during the completion
+ * - the submission which requires queue lock may be done
+ * against this clone's queue
+ */
+ dm_complete_request(tio->orig, error);
+}
+
+/*
+ * Return maximum size of I/O possible at the supplied sector up to the current
+ * target boundary.
+ */
+static sector_t max_io_len_target_boundary(sector_t sector, struct dm_target *ti)
+{
+ sector_t target_offset = dm_target_offset(ti, sector);
+
+ return ti->len - target_offset;
+}
+
+static sector_t max_io_len(sector_t sector, struct dm_target *ti)
+{
+ sector_t len = max_io_len_target_boundary(sector, ti);
+ sector_t offset, max_len;
+
+ /*
+ * Does the target need to split even further?
+ */
+ if (ti->max_io_len) {
+ offset = dm_target_offset(ti, sector);
+ if (unlikely(ti->max_io_len & (ti->max_io_len - 1)))
+ max_len = sector_div(offset, ti->max_io_len);
+ else
+ max_len = offset & (ti->max_io_len - 1);
+ max_len = ti->max_io_len - max_len;
+
+ if (len > max_len)
+ len = max_len;
+ }
+
+ return len;
+}
+
+int dm_set_target_max_io_len(struct dm_target *ti, sector_t len)
+{
+ if (len > UINT_MAX) {
+ DMERR("Specified maximum size of target IO (%llu) exceeds limit (%u)",
+ (unsigned long long)len, UINT_MAX);
+ ti->error = "Maximum size of target IO is too large";
+ return -EINVAL;
+ }
+
+ ti->max_io_len = (uint32_t) len;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(dm_set_target_max_io_len);
+
+/*
+ * A target may call dm_accept_partial_bio only from the map routine. It is
+ * allowed for all bio types except REQ_FLUSH.
+ *
+ * dm_accept_partial_bio informs the dm that the target only wants to process
+ * additional n_sectors sectors of the bio and the rest of the data should be
+ * sent in a next bio.
+ *
+ * A diagram that explains the arithmetics:
+ * +--------------------+---------------+-------+
+ * | 1 | 2 | 3 |
+ * +--------------------+---------------+-------+
+ *
+ * <-------------- *tio->len_ptr --------------->
+ * <------- bi_size ------->
+ * <-- n_sectors -->
+ *
+ * Region 1 was already iterated over with bio_advance or similar function.
+ * (it may be empty if the target doesn't use bio_advance)
+ * Region 2 is the remaining bio size that the target wants to process.
+ * (it may be empty if region 1 is non-empty, although there is no reason
+ * to make it empty)
+ * The target requires that region 3 is to be sent in the next bio.
+ *
+ * If the target wants to receive multiple copies of the bio (via num_*bios, etc),
+ * the partially processed part (the sum of regions 1+2) must be the same for all
+ * copies of the bio.
+ */
+void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors)
+{
+ struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
+ unsigned bi_size = bio->bi_iter.bi_size >> SECTOR_SHIFT;
+ BUG_ON(bio->bi_rw & REQ_FLUSH);
+ BUG_ON(bi_size > *tio->len_ptr);
+ BUG_ON(n_sectors > bi_size);
+ *tio->len_ptr -= bi_size - n_sectors;
+ bio->bi_iter.bi_size = n_sectors << SECTOR_SHIFT;
+}
+EXPORT_SYMBOL_GPL(dm_accept_partial_bio);
+
+static void __map_bio(struct dm_target_io *tio)
+{
+ int r;
+ sector_t sector;
+ struct mapped_device *md;
+ struct bio *clone = &tio->clone;
+ struct dm_target *ti = tio->ti;
+
+ clone->bi_end_io = clone_endio;
+
+ /*
+ * Map the clone. If r == 0 we don't need to do
+ * anything, the target has assumed ownership of
+ * this io.
+ */
+ atomic_inc(&tio->io->io_count);
+ sector = clone->bi_iter.bi_sector;
+ r = ti->type->map(ti, clone);
+ if (r == DM_MAPIO_REMAPPED) {
+ /* the bio has been remapped so dispatch it */
+
+ trace_block_bio_remap(bdev_get_queue(clone->bi_bdev), clone,
+ tio->io->bio->bi_bdev->bd_dev, sector);
+
+ generic_make_request(clone);
+ } else if (r < 0 || r == DM_MAPIO_REQUEUE) {
+ /* error the io and bail out, or requeue it if needed */
+ md = tio->io->md;
+ dec_pending(tio->io, r);
+ free_tio(md, tio);
+ } else if (r) {
+ DMWARN("unimplemented target map return value: %d", r);
+ BUG();
+ }
+}
+
+struct clone_info {
+ struct mapped_device *md;
+ struct dm_table *map;
+ struct bio *bio;
+ struct dm_io *io;
+ sector_t sector;
+ unsigned sector_count;
+};
+
+static void bio_setup_sector(struct bio *bio, sector_t sector, unsigned len)
+{
+ bio->bi_iter.bi_sector = sector;
+ bio->bi_iter.bi_size = to_bytes(len);
+}
+
+/*
+ * Creates a bio that consists of range of complete bvecs.
+ */
+static void clone_bio(struct dm_target_io *tio, struct bio *bio,
+ sector_t sector, unsigned len)
+{
+ struct bio *clone = &tio->clone;
+
+ __bio_clone_fast(clone, bio);
+
+ if (bio_integrity(bio))
+ bio_integrity_clone(clone, bio, GFP_NOIO);
+
+ bio_advance(clone, to_bytes(sector - clone->bi_iter.bi_sector));
+ clone->bi_iter.bi_size = to_bytes(len);
+
+ if (bio_integrity(bio))
+ bio_integrity_trim(clone, 0, len);
+}
+
+static struct dm_target_io *alloc_tio(struct clone_info *ci,
+ struct dm_target *ti,
+ unsigned target_bio_nr)
+{
+ struct dm_target_io *tio;
+ struct bio *clone;
+
+ clone = bio_alloc_bioset(GFP_NOIO, 0, ci->md->bs);
+ tio = container_of(clone, struct dm_target_io, clone);
+
+ tio->io = ci->io;
+ tio->ti = ti;
+ tio->target_bio_nr = target_bio_nr;
+
+ return tio;
+}
+
+static void __clone_and_map_simple_bio(struct clone_info *ci,
+ struct dm_target *ti,
+ unsigned target_bio_nr, unsigned *len)
+{
+ struct dm_target_io *tio = alloc_tio(ci, ti, target_bio_nr);
+ struct bio *clone = &tio->clone;
+
+ tio->len_ptr = len;
+
+ __bio_clone_fast(clone, ci->bio);
+ if (len)
+ bio_setup_sector(clone, ci->sector, *len);
+
+ __map_bio(tio);
+}
+
+static void __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti,
+ unsigned num_bios, unsigned *len)
+{
+ unsigned target_bio_nr;
+
+ for (target_bio_nr = 0; target_bio_nr < num_bios; target_bio_nr++)
+ __clone_and_map_simple_bio(ci, ti, target_bio_nr, len);
+}
+
+static int __send_empty_flush(struct clone_info *ci)
+{
+ unsigned target_nr = 0;
+ struct dm_target *ti;
+
+ BUG_ON(bio_has_data(ci->bio));
+ while ((ti = dm_table_get_target(ci->map, target_nr++)))
+ __send_duplicate_bios(ci, ti, ti->num_flush_bios, NULL);
+
+ return 0;
+}
+
+static void __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti,
+ sector_t sector, unsigned *len)
+{
+ struct bio *bio = ci->bio;
+ struct dm_target_io *tio;
+ unsigned target_bio_nr;
+ unsigned num_target_bios = 1;
+
+ /*
+ * Does the target want to receive duplicate copies of the bio?
+ */
+ if (bio_data_dir(bio) == WRITE && ti->num_write_bios)
+ num_target_bios = ti->num_write_bios(ti, bio);
+
+ for (target_bio_nr = 0; target_bio_nr < num_target_bios; target_bio_nr++) {
+ tio = alloc_tio(ci, ti, target_bio_nr);
+ tio->len_ptr = len;
+ clone_bio(tio, bio, sector, *len);
+ __map_bio(tio);
+ }
+}
+
+typedef unsigned (*get_num_bios_fn)(struct dm_target *ti);
+
+static unsigned get_num_discard_bios(struct dm_target *ti)
+{
+ return ti->num_discard_bios;
+}
+
+static unsigned get_num_write_same_bios(struct dm_target *ti)
+{
+ return ti->num_write_same_bios;
+}
+
+typedef bool (*is_split_required_fn)(struct dm_target *ti);
+
+static bool is_split_required_for_discard(struct dm_target *ti)
+{
+ return ti->split_discard_bios;
+}
+
+static int __send_changing_extent_only(struct clone_info *ci,
+ get_num_bios_fn get_num_bios,
+ is_split_required_fn is_split_required)
+{
+ struct dm_target *ti;
+ unsigned len;
+ unsigned num_bios;
+
+ do {
+ ti = dm_table_find_target(ci->map, ci->sector);
+ if (!dm_target_is_valid(ti))
+ return -EIO;
+
+ /*
+ * Even though the device advertised support for this type of
+ * request, that does not mean every target supports it, and
+ * reconfiguration might also have changed that since the
+ * check was performed.
+ */
+ num_bios = get_num_bios ? get_num_bios(ti) : 0;
+ if (!num_bios)
+ return -EOPNOTSUPP;
+
+ if (is_split_required && !is_split_required(ti))
+ len = min((sector_t)ci->sector_count, max_io_len_target_boundary(ci->sector, ti));
+ else
+ len = min((sector_t)ci->sector_count, max_io_len(ci->sector, ti));
+
+ __send_duplicate_bios(ci, ti, num_bios, &len);
+
+ ci->sector += len;
+ } while (ci->sector_count -= len);
+
+ return 0;
+}
+
+static int __send_discard(struct clone_info *ci)
+{
+ return __send_changing_extent_only(ci, get_num_discard_bios,
+ is_split_required_for_discard);
+}
+
+static int __send_write_same(struct clone_info *ci)
+{
+ return __send_changing_extent_only(ci, get_num_write_same_bios, NULL);
+}
+
+/*
+ * Select the correct strategy for processing a non-flush bio.
+ */
+static int __split_and_process_non_flush(struct clone_info *ci)
+{
+ struct bio *bio = ci->bio;
+ struct dm_target *ti;
+ unsigned len;
+
+ if (unlikely(bio->bi_rw & REQ_DISCARD))
+ return __send_discard(ci);
+ else if (unlikely(bio->bi_rw & REQ_WRITE_SAME))
+ return __send_write_same(ci);
+
+ ti = dm_table_find_target(ci->map, ci->sector);
+ if (!dm_target_is_valid(ti))
+ return -EIO;
+
+ len = min_t(sector_t, max_io_len(ci->sector, ti), ci->sector_count);
+
+ __clone_and_map_data_bio(ci, ti, ci->sector, &len);
+
+ ci->sector += len;
+ ci->sector_count -= len;
+
+ return 0;
+}
+
+/*
+ * Entry point to split a bio into clones and submit them to the targets.
+ */
+static void __split_and_process_bio(struct mapped_device *md,
+ struct dm_table *map, struct bio *bio)
+{
+ struct clone_info ci;
+ int error = 0;
+
+ if (unlikely(!map)) {
+ bio_io_error(bio);
+ return;
+ }
+
+ ci.map = map;
+ ci.md = md;
+ ci.io = alloc_io(md);
+ ci.io->error = 0;
+ atomic_set(&ci.io->io_count, 1);
+ ci.io->bio = bio;
+ ci.io->md = md;
+ spin_lock_init(&ci.io->endio_lock);
+ ci.sector = bio->bi_iter.bi_sector;
+
+ start_io_acct(ci.io);
+
+ if (bio->bi_rw & REQ_FLUSH) {
+ ci.bio = &ci.md->flush_bio;
+ ci.sector_count = 0;
+ error = __send_empty_flush(&ci);
+ /* dec_pending submits any data associated with flush */
+ } else {
+ ci.bio = bio;
+ ci.sector_count = bio_sectors(bio);
+ while (ci.sector_count && !error)
+ error = __split_and_process_non_flush(&ci);
+ }
+
+ /* drop the extra reference count */
+ dec_pending(ci.io, error);
+}
+/*-----------------------------------------------------------------
+ * CRUD END
+ *---------------------------------------------------------------*/
+
+static int dm_merge_bvec(struct request_queue *q,
+ struct bvec_merge_data *bvm,
+ struct bio_vec *biovec)
+{
+ struct mapped_device *md = q->queuedata;
+ struct dm_table *map = dm_get_live_table_fast(md);
+ struct dm_target *ti;
+ sector_t max_sectors, max_size = 0;
+
+ if (unlikely(!map))
+ goto out;
+
+ ti = dm_table_find_target(map, bvm->bi_sector);
+ if (!dm_target_is_valid(ti))
+ goto out;
+
+ /*
+ * Find maximum amount of I/O that won't need splitting
+ */
+ max_sectors = min(max_io_len(bvm->bi_sector, ti),
+ (sector_t) queue_max_sectors(q));
+ max_size = (max_sectors << SECTOR_SHIFT) - bvm->bi_size;
+
+ /*
+ * FIXME: this stop-gap fix _must_ be cleaned up (by passing a sector_t
+ * to the targets' merge function since it holds sectors not bytes).
+ * Just doing this as an interim fix for stable@ because the more
+ * comprehensive cleanup of switching to sector_t will impact every
+ * DM target that implements a ->merge hook.
+ */
+ if (max_size > INT_MAX)
+ max_size = INT_MAX;
+
+ /*
+ * merge_bvec_fn() returns number of bytes
+ * it can accept at this offset
+ * max is precomputed maximal io size
+ */
+ if (max_size && ti->type->merge)
+ max_size = ti->type->merge(ti, bvm, biovec, (int) max_size);
+ /*
+ * If the target doesn't support merge method and some of the devices
+ * provided their merge_bvec method (we know this by looking for the
+ * max_hw_sectors that dm_set_device_limits may set), then we can't
+ * allow bios with multiple vector entries. So always set max_size
+ * to 0, and the code below allows just one page.
+ */
+ else if (queue_max_hw_sectors(q) <= PAGE_SIZE >> 9)
+ max_size = 0;
+
+out:
+ dm_put_live_table_fast(md);
+ /*
+ * Always allow an entire first page
+ */
+ if (max_size <= biovec->bv_len && !(bvm->bi_size >> SECTOR_SHIFT))
+ max_size = biovec->bv_len;
+
+ return max_size;
+}
+
+/*
+ * The request function that just remaps the bio built up by
+ * dm_merge_bvec.
+ */
+static void dm_make_request(struct request_queue *q, struct bio *bio)
+{
+ int rw = bio_data_dir(bio);
+ struct mapped_device *md = q->queuedata;
+ int srcu_idx;
+ struct dm_table *map;
+
+ map = dm_get_live_table(md, &srcu_idx);
+
+ generic_start_io_acct(rw, bio_sectors(bio), &dm_disk(md)->part0);
+
+ /* if we're suspended, we have to queue this io for later */
+ if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) {
+ dm_put_live_table(md, srcu_idx);
+
+ if (bio_rw(bio) != READA)
+ queue_io(md, bio);
+ else
+ bio_io_error(bio);
+ return;
+ }
+
+ __split_and_process_bio(md, map, bio);
+ dm_put_live_table(md, srcu_idx);
+ return;
+}
+
+int dm_request_based(struct mapped_device *md)
+{
+ return blk_queue_stackable(md->queue);
+}
+
+static void dm_dispatch_clone_request(struct request *clone, struct request *rq)
+{
+ int r;
+
+ if (blk_queue_io_stat(clone->q))
+ clone->cmd_flags |= REQ_IO_STAT;
+
+ clone->start_time = jiffies;
+ r = blk_insert_cloned_request(clone->q, clone);
+ if (r)
+ /* must complete clone in terms of original request */
+ dm_complete_request(rq, r);
+}
+
+static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig,
+ void *data)
+{
+ struct dm_rq_target_io *tio = data;
+ struct dm_rq_clone_bio_info *info =
+ container_of(bio, struct dm_rq_clone_bio_info, clone);
+
+ info->orig = bio_orig;
+ info->tio = tio;
+ bio->bi_end_io = end_clone_bio;
+
+ return 0;
+}
+
+static int setup_clone(struct request *clone, struct request *rq,
+ struct dm_rq_target_io *tio, gfp_t gfp_mask)
+{
+ int r;
+
+ r = blk_rq_prep_clone(clone, rq, tio->md->bs, gfp_mask,
+ dm_rq_bio_constructor, tio);
+ if (r)
+ return r;
+
+ clone->cmd = rq->cmd;
+ clone->cmd_len = rq->cmd_len;
+ clone->sense = rq->sense;
+ clone->end_io = end_clone_request;
+ clone->end_io_data = tio;
+
+ tio->clone = clone;
+
+ return 0;
+}
+
+static struct request *clone_rq(struct request *rq, struct mapped_device *md,
+ struct dm_rq_target_io *tio, gfp_t gfp_mask)
+{
+ /*
+ * Do not allocate a clone if tio->clone was already set
+ * (see: dm_mq_queue_rq).
+ */
+ bool alloc_clone = !tio->clone;
+ struct request *clone;
+
+ if (alloc_clone) {
+ clone = alloc_clone_request(md, gfp_mask);
+ if (!clone)
+ return NULL;
+ } else
+ clone = tio->clone;
+
+ blk_rq_init(NULL, clone);
+ if (setup_clone(clone, rq, tio, gfp_mask)) {
+ /* -ENOMEM */
+ if (alloc_clone)
+ free_clone_request(md, clone);
+ return NULL;
+ }
+
+ return clone;
+}
+
+static void map_tio_request(struct kthread_work *work);
+
+static void init_tio(struct dm_rq_target_io *tio, struct request *rq,
+ struct mapped_device *md)
+{
+ tio->md = md;
+ tio->ti = NULL;
+ tio->clone = NULL;
+ tio->orig = rq;
+ tio->error = 0;
+ memset(&tio->info, 0, sizeof(tio->info));
+ if (md->kworker_task)
+ init_kthread_work(&tio->work, map_tio_request);
+}
+
+static struct dm_rq_target_io *prep_tio(struct request *rq,
+ struct mapped_device *md, gfp_t gfp_mask)
+{
+ struct dm_rq_target_io *tio;
+ int srcu_idx;
+ struct dm_table *table;
+
+ tio = alloc_rq_tio(md, gfp_mask);
+ if (!tio)
+ return NULL;
+
+ init_tio(tio, rq, md);
+
+ table = dm_get_live_table(md, &srcu_idx);
+ if (!dm_table_mq_request_based(table)) {
+ if (!clone_rq(rq, md, tio, gfp_mask)) {
+ dm_put_live_table(md, srcu_idx);
+ free_rq_tio(tio);
+ return NULL;
+ }
+ }
+ dm_put_live_table(md, srcu_idx);
+
+ return tio;
+}
+
+/*
+ * Called with the queue lock held.
+ */
+static int dm_prep_fn(struct request_queue *q, struct request *rq)
+{
+ struct mapped_device *md = q->queuedata;
+ struct dm_rq_target_io *tio;
+
+ if (unlikely(rq->special)) {
+ DMWARN("Already has something in rq->special.");
+ return BLKPREP_KILL;
+ }
+
+ tio = prep_tio(rq, md, GFP_ATOMIC);
+ if (!tio)
+ return BLKPREP_DEFER;
+
+ rq->special = tio;
+ rq->cmd_flags |= REQ_DONTPREP;
+
+ return BLKPREP_OK;
+}
+
+/*
+ * Returns:
+ * 0 : the request has been processed
+ * DM_MAPIO_REQUEUE : the original request needs to be requeued
+ * < 0 : the request was completed due to failure
+ */
+static int map_request(struct dm_rq_target_io *tio, struct request *rq,
+ struct mapped_device *md)
+{
+ int r;
+ struct dm_target *ti = tio->ti;
+ struct request *clone = NULL;
+
+ if (tio->clone) {
+ clone = tio->clone;
+ r = ti->type->map_rq(ti, clone, &tio->info);
+ } else {
+ r = ti->type->clone_and_map_rq(ti, rq, &tio->info, &clone);
+ if (r < 0) {
+ /* The target wants to complete the I/O */
+ dm_kill_unmapped_request(rq, r);
+ return r;
+ }
+ if (r != DM_MAPIO_REMAPPED)
+ return r;
+ if (setup_clone(clone, rq, tio, GFP_ATOMIC)) {
+ /* -ENOMEM */
+ ti->type->release_clone_rq(clone);
+ return DM_MAPIO_REQUEUE;
+ }
+ }
+
+ switch (r) {
+ case DM_MAPIO_SUBMITTED:
+ /* The target has taken the I/O to submit by itself later */
+ break;
+ case DM_MAPIO_REMAPPED:
+ /* The target has remapped the I/O so dispatch it */
+ trace_block_rq_remap(clone->q, clone, disk_devt(dm_disk(md)),
+ blk_rq_pos(rq));
+ dm_dispatch_clone_request(clone, rq);
+ break;
+ case DM_MAPIO_REQUEUE:
+ /* The target wants to requeue the I/O */
+ dm_requeue_unmapped_request(clone);
+ break;
+ default:
+ if (r > 0) {
+ DMWARN("unimplemented target map return value: %d", r);
+ BUG();
+ }
+
+ /* The target wants to complete the I/O */
+ dm_kill_unmapped_request(rq, r);
+ return r;
+ }
+
+ return 0;
+}
+
+static void map_tio_request(struct kthread_work *work)
+{
+ struct dm_rq_target_io *tio = container_of(work, struct dm_rq_target_io, work);
+ struct request *rq = tio->orig;
+ struct mapped_device *md = tio->md;
+
+ if (map_request(tio, rq, md) == DM_MAPIO_REQUEUE)
+ dm_requeue_unmapped_original_request(md, rq);
+}
+
+static void dm_start_request(struct mapped_device *md, struct request *orig)
+{
+ if (!orig->q->mq_ops)
+ blk_start_request(orig);
+ else
+ blk_mq_start_request(orig);
+ atomic_inc(&md->pending[rq_data_dir(orig)]);
+
+ if (md->seq_rq_merge_deadline_usecs) {
+ md->last_rq_pos = rq_end_sector(orig);
+ md->last_rq_rw = rq_data_dir(orig);
+ md->last_rq_start_time = ktime_get();
+ }
+
+ /*
+ * Hold the md reference here for the in-flight I/O.
+ * We can't rely on the reference count by device opener,
+ * because the device may be closed during the request completion
+ * when all bios are completed.
+ * See the comment in rq_completed() too.
+ */
+ dm_get(md);
+}
+
+#define MAX_SEQ_RQ_MERGE_DEADLINE_USECS 100000
+
+ssize_t dm_attr_rq_based_seq_io_merge_deadline_show(struct mapped_device *md, char *buf)
+{
+ return sprintf(buf, "%u\n", md->seq_rq_merge_deadline_usecs);
+}
+
+ssize_t dm_attr_rq_based_seq_io_merge_deadline_store(struct mapped_device *md,
+ const char *buf, size_t count)
+{
+ unsigned deadline;
+
+ if (!dm_request_based(md) || md->use_blk_mq)
+ return count;
+
+ if (kstrtouint(buf, 10, &deadline))
+ return -EINVAL;
+
+ if (deadline > MAX_SEQ_RQ_MERGE_DEADLINE_USECS)
+ deadline = MAX_SEQ_RQ_MERGE_DEADLINE_USECS;
+
+ md->seq_rq_merge_deadline_usecs = deadline;
+
+ return count;
+}
+
+static bool dm_request_peeked_before_merge_deadline(struct mapped_device *md)
+{
+ ktime_t kt_deadline;
+
+ if (!md->seq_rq_merge_deadline_usecs)
+ return false;
+
+ kt_deadline = ns_to_ktime((u64)md->seq_rq_merge_deadline_usecs * NSEC_PER_USEC);
+ kt_deadline = ktime_add_safe(md->last_rq_start_time, kt_deadline);
+
+ return !ktime_after(ktime_get(), kt_deadline);
+}
+
+/*
+ * q->request_fn for request-based dm.
+ * Called with the queue lock held.
+ */
+static void dm_request_fn(struct request_queue *q)
+{
+ struct mapped_device *md = q->queuedata;
+ int srcu_idx;
+ struct dm_table *map = dm_get_live_table(md, &srcu_idx);
+ struct dm_target *ti;
+ struct request *rq;
+ struct dm_rq_target_io *tio;
+ sector_t pos;
+
+ /*
+ * For suspend, check blk_queue_stopped() and increment
+ * ->pending within a single queue_lock not to increment the
+ * number of in-flight I/Os after the queue is stopped in
+ * dm_suspend().
+ */
+ while (!blk_queue_stopped(q)) {
+ rq = blk_peek_request(q);
+ if (!rq)
+ goto out;
+
+ /* always use block 0 to find the target for flushes for now */
+ pos = 0;
+ if (!(rq->cmd_flags & REQ_FLUSH))
+ pos = blk_rq_pos(rq);
+
+ ti = dm_table_find_target(map, pos);
+ if (!dm_target_is_valid(ti)) {
+ /*
+ * Must perform setup, that rq_completed() requires,
+ * before calling dm_kill_unmapped_request
+ */
+ DMERR_LIMIT("request attempted access beyond the end of device");
+ dm_start_request(md, rq);
+ dm_kill_unmapped_request(rq, -EIO);
+ continue;
+ }
+
+ if (dm_request_peeked_before_merge_deadline(md) &&
+ md_in_flight(md) && rq->bio && rq->bio->bi_vcnt == 1 &&
+ md->last_rq_pos == pos && md->last_rq_rw == rq_data_dir(rq))
+ goto delay_and_out;
+
+ if (ti->type->busy && ti->type->busy(ti))
+ goto delay_and_out;
+
+ dm_start_request(md, rq);
+
+ tio = tio_from_request(rq);
+ /* Establish tio->ti before queuing work (map_tio_request) */
+ tio->ti = ti;
+ queue_kthread_work(&md->kworker, &tio->work);
+ BUG_ON(!irqs_disabled());
+ }
+
+ goto out;
+
+delay_and_out:
+ blk_delay_queue(q, HZ / 100);
+out:
+ dm_put_live_table(md, srcu_idx);
+}
+
+static int dm_any_congested(void *congested_data, int bdi_bits)
+{
+ int r = bdi_bits;
+ struct mapped_device *md = congested_data;
+ struct dm_table *map;
+
+ if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
+ map = dm_get_live_table_fast(md);
+ if (map) {
+ /*
+ * Request-based dm cares about only own queue for
+ * the query about congestion status of request_queue
+ */
+ if (dm_request_based(md))
+ r = md->queue->backing_dev_info.state &
+ bdi_bits;
+ else
+ r = dm_table_any_congested(map, bdi_bits);
+ }
+ dm_put_live_table_fast(md);
+ }
+
+ return r;
+}
+
+/*-----------------------------------------------------------------
+ * An IDR is used to keep track of allocated minor numbers.
+ *---------------------------------------------------------------*/
+static void free_minor(int minor)
+{
+ spin_lock(&_minor_lock);
+ idr_remove(&_minor_idr, minor);
+ spin_unlock(&_minor_lock);
+}
+
+/*
+ * See if the device with a specific minor # is free.
+ */
+static int specific_minor(int minor)
+{
+ int r;
+
+ if (minor >= (1 << MINORBITS))
+ return -EINVAL;
+
+ idr_preload(GFP_KERNEL);
+ spin_lock(&_minor_lock);
+
+ r = idr_alloc(&_minor_idr, MINOR_ALLOCED, minor, minor + 1, GFP_NOWAIT);
+
+ spin_unlock(&_minor_lock);
+ idr_preload_end();
+ if (r < 0)
+ return r == -ENOSPC ? -EBUSY : r;
+ return 0;
+}
+
+static int next_free_minor(int *minor)
+{
+ int r;
+
+ idr_preload(GFP_KERNEL);
+ spin_lock(&_minor_lock);
+
+ r = idr_alloc(&_minor_idr, MINOR_ALLOCED, 0, 1 << MINORBITS, GFP_NOWAIT);
+
+ spin_unlock(&_minor_lock);
+ idr_preload_end();
+ if (r < 0)
+ return r;
+ *minor = r;
+ return 0;
+}
+
+static const struct block_device_operations dm_blk_dops;
+
+static void dm_wq_work(struct work_struct *work);
+
+static void dm_init_md_queue(struct mapped_device *md)
+{
+ /*
+ * Request-based dm devices cannot be stacked on top of bio-based dm
+ * devices. The type of this dm device may not have been decided yet.
+ * The type is decided at the first table loading time.
+ * To prevent problematic device stacking, clear the queue flag
+ * for request stacking support until then.
+ *
+ * This queue is new, so no concurrency on the queue_flags.
+ */
+ queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, md->queue);
+}
+
+static void dm_init_old_md_queue(struct mapped_device *md)
+{
+ md->use_blk_mq = false;
+ dm_init_md_queue(md);
+
+ /*
+ * Initialize aspects of queue that aren't relevant for blk-mq
+ */
+ md->queue->queuedata = md;
+ md->queue->backing_dev_info.congested_fn = dm_any_congested;
+ md->queue->backing_dev_info.congested_data = md;
+
+ blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY);
+}
+
+/*
+ * Allocate and initialise a blank device with a given minor.
+ */
+static struct mapped_device *alloc_dev(int minor)
+{
+ int r;
+ struct mapped_device *md = kzalloc(sizeof(*md), GFP_KERNEL);
+ void *old_md;
+
+ if (!md) {
+ DMWARN("unable to allocate device, out of memory.");
+ return NULL;
+ }
+
+ if (!try_module_get(THIS_MODULE))
+ goto bad_module_get;
+
+ /* get a minor number for the dev */
+ if (minor == DM_ANY_MINOR)
+ r = next_free_minor(&minor);
+ else
+ r = specific_minor(minor);
+ if (r < 0)
+ goto bad_minor;
+
+ r = init_srcu_struct(&md->io_barrier);
+ if (r < 0)
+ goto bad_io_barrier;
+
+ md->use_blk_mq = use_blk_mq;
+ md->type = DM_TYPE_NONE;
+ mutex_init(&md->suspend_lock);
+ mutex_init(&md->type_lock);
+ mutex_init(&md->table_devices_lock);
+ spin_lock_init(&md->deferred_lock);
+ atomic_set(&md->holders, 1);
+ atomic_set(&md->open_count, 0);
+ atomic_set(&md->event_nr, 0);
+ atomic_set(&md->uevent_seq, 0);
+ INIT_LIST_HEAD(&md->uevent_list);
+ INIT_LIST_HEAD(&md->table_devices);
+ spin_lock_init(&md->uevent_lock);
+
+ md->queue = blk_alloc_queue(GFP_KERNEL);
+ if (!md->queue)
+ goto bad_queue;
+
+ dm_init_md_queue(md);
+
+ md->disk = alloc_disk(1);
+ if (!md->disk)
+ goto bad_disk;
+
+ atomic_set(&md->pending[0], 0);
+ atomic_set(&md->pending[1], 0);
+ init_waitqueue_head(&md->wait);
+ INIT_WORK(&md->work, dm_wq_work);
+ init_waitqueue_head(&md->eventq);
+ init_completion(&md->kobj_holder.completion);
+ md->kworker_task = NULL;
+
+ md->disk->major = _major;
+ md->disk->first_minor = minor;
+ md->disk->fops = &dm_blk_dops;
+ md->disk->queue = md->queue;
+ md->disk->private_data = md;
+ sprintf(md->disk->disk_name, "dm-%d", minor);
+ add_disk(md->disk);
+ format_dev_t(md->name, MKDEV(_major, minor));
+
+ md->wq = alloc_workqueue("kdmflush", WQ_MEM_RECLAIM, 0);
+ if (!md->wq)
+ goto bad_thread;
+
+ md->bdev = bdget_disk(md->disk, 0);
+ if (!md->bdev)
+ goto bad_bdev;
+
+ bio_init(&md->flush_bio);
+ md->flush_bio.bi_bdev = md->bdev;
+ md->flush_bio.bi_rw = WRITE_FLUSH;
+
+ dm_stats_init(&md->stats);
+
+ /* Populate the mapping, nobody knows we exist yet */
+ spin_lock(&_minor_lock);
+ old_md = idr_replace(&_minor_idr, md, minor);
+ spin_unlock(&_minor_lock);
+
+ BUG_ON(old_md != MINOR_ALLOCED);
+
+ return md;
+
+bad_bdev:
+ destroy_workqueue(md->wq);
+bad_thread:
+ del_gendisk(md->disk);
+ put_disk(md->disk);
+bad_disk:
+ blk_cleanup_queue(md->queue);
+bad_queue:
+ cleanup_srcu_struct(&md->io_barrier);
+bad_io_barrier:
+ free_minor(minor);
+bad_minor:
+ module_put(THIS_MODULE);
+bad_module_get:
+ kfree(md);
+ return NULL;
+}
+
+static void unlock_fs(struct mapped_device *md);
+
+static void free_dev(struct mapped_device *md)
+{
+ int minor = MINOR(disk_devt(md->disk));
+
+ unlock_fs(md);
+ destroy_workqueue(md->wq);
+
+ if (md->kworker_task)
+ kthread_stop(md->kworker_task);
+ if (md->io_pool)
+ mempool_destroy(md->io_pool);
+ if (md->rq_pool)
+ mempool_destroy(md->rq_pool);
+ if (md->bs)
+ bioset_free(md->bs);
+
+ cleanup_srcu_struct(&md->io_barrier);
+ free_table_devices(&md->table_devices);
+ dm_stats_cleanup(&md->stats);
+
+ spin_lock(&_minor_lock);
+ md->disk->private_data = NULL;
+ spin_unlock(&_minor_lock);
+ if (blk_get_integrity(md->disk))
+ blk_integrity_unregister(md->disk);
+ del_gendisk(md->disk);
+ put_disk(md->disk);
+ blk_cleanup_queue(md->queue);
+ if (md->use_blk_mq)
+ blk_mq_free_tag_set(&md->tag_set);
+ bdput(md->bdev);
+ free_minor(minor);
+
+ module_put(THIS_MODULE);
+ kfree(md);
+}
+
+static void __bind_mempools(struct mapped_device *md, struct dm_table *t)
+{
+ struct dm_md_mempools *p = dm_table_get_md_mempools(t);
+
+ if (md->bs) {
+ /* The md already has necessary mempools. */
+ if (dm_table_get_type(t) == DM_TYPE_BIO_BASED) {
+ /*
+ * Reload bioset because front_pad may have changed
+ * because a different table was loaded.
+ */
+ bioset_free(md->bs);
+ md->bs = p->bs;
+ p->bs = NULL;
+ }
+ /*
+ * There's no need to reload with request-based dm
+ * because the size of front_pad doesn't change.
+ * Note for future: If you are to reload bioset,
+ * prep-ed requests in the queue may refer
+ * to bio from the old bioset, so you must walk
+ * through the queue to unprep.
+ */
+ goto out;
+ }
+
+ BUG_ON(!p || md->io_pool || md->rq_pool || md->bs);
+
+ md->io_pool = p->io_pool;
+ p->io_pool = NULL;
+ md->rq_pool = p->rq_pool;
+ p->rq_pool = NULL;
+ md->bs = p->bs;
+ p->bs = NULL;
+
+out:
+ /* mempool bind completed, no longer need any mempools in the table */
+ dm_table_free_md_mempools(t);
+}
+
+/*
+ * Bind a table to the device.
+ */
+static void event_callback(void *context)
+{
+ unsigned long flags;
+ LIST_HEAD(uevents);
+ struct mapped_device *md = (struct mapped_device *) context;
+
+ spin_lock_irqsave(&md->uevent_lock, flags);
+ list_splice_init(&md->uevent_list, &uevents);
+ spin_unlock_irqrestore(&md->uevent_lock, flags);
+
+ dm_send_uevents(&uevents, &disk_to_dev(md->disk)->kobj);
+
+ atomic_inc(&md->event_nr);
+ wake_up(&md->eventq);
+}
+
+/*
+ * Protected by md->suspend_lock obtained by dm_swap_table().
+ */
+static void __set_size(struct mapped_device *md, sector_t size)
+{
+ set_capacity(md->disk, size);
+
+ i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT);
+}
+
+/*
+ * Return 1 if the queue has a compulsory merge_bvec_fn function.
+ *
+ * If this function returns 0, then the device is either a non-dm
+ * device without a merge_bvec_fn, or it is a dm device that is
+ * able to split any bios it receives that are too big.
+ */
+int dm_queue_merge_is_compulsory(struct request_queue *q)
+{
+ struct mapped_device *dev_md;
+
+ if (!q->merge_bvec_fn)
+ return 0;
+
+ if (q->make_request_fn == dm_make_request) {
+ dev_md = q->queuedata;
+ if (test_bit(DMF_MERGE_IS_OPTIONAL, &dev_md->flags))
+ return 0;
+ }
+
+ return 1;
+}
+
+static int dm_device_merge_is_compulsory(struct dm_target *ti,
+ struct dm_dev *dev, sector_t start,
+ sector_t len, void *data)
+{
+ struct block_device *bdev = dev->bdev;
+ struct request_queue *q = bdev_get_queue(bdev);
+
+ return dm_queue_merge_is_compulsory(q);
+}
+
+/*
+ * Return 1 if it is acceptable to ignore merge_bvec_fn based
+ * on the properties of the underlying devices.
+ */
+static int dm_table_merge_is_optional(struct dm_table *table)
+{
+ unsigned i = 0;
+ struct dm_target *ti;
+
+ while (i < dm_table_get_num_targets(table)) {
+ ti = dm_table_get_target(table, i++);
+
+ if (ti->type->iterate_devices &&
+ ti->type->iterate_devices(ti, dm_device_merge_is_compulsory, NULL))
+ return 0;
+ }
+
+ return 1;
+}
+
+/*
+ * Returns old map, which caller must destroy.
+ */
+static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
+ struct queue_limits *limits)
+{
+ struct dm_table *old_map;
+ struct request_queue *q = md->queue;
+ sector_t size;
+ int merge_is_optional;
+
+ size = dm_table_get_size(t);
+
+ /*
+ * Wipe any geometry if the size of the table changed.
+ */
+ if (size != dm_get_size(md))
+ memset(&md->geometry, 0, sizeof(md->geometry));
+
+ __set_size(md, size);
+
+ dm_table_event_callback(t, event_callback, md);
+
+ /*
+ * The queue hasn't been stopped yet, if the old table type wasn't
+ * for request-based during suspension. So stop it to prevent
+ * I/O mapping before resume.
+ * This must be done before setting the queue restrictions,
+ * because request-based dm may be run just after the setting.
+ */
+ if (dm_table_request_based(t))
+ stop_queue(q);
+
+ __bind_mempools(md, t);
+
+ merge_is_optional = dm_table_merge_is_optional(t);
+
+ old_map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
+ rcu_assign_pointer(md->map, t);
+ md->immutable_target_type = dm_table_get_immutable_target_type(t);
+
+ dm_table_set_restrictions(t, q, limits);
+ if (merge_is_optional)
+ set_bit(DMF_MERGE_IS_OPTIONAL, &md->flags);
+ else
+ clear_bit(DMF_MERGE_IS_OPTIONAL, &md->flags);
+ if (old_map)
+ dm_sync_table(md);
+
+ return old_map;
+}
+
+/*
+ * Returns unbound table for the caller to free.
+ */
+static struct dm_table *__unbind(struct mapped_device *md)
+{
+ struct dm_table *map = rcu_dereference_protected(md->map, 1);
+
+ if (!map)
+ return NULL;
+
+ dm_table_event_callback(map, NULL, NULL);
+ RCU_INIT_POINTER(md->map, NULL);
+ dm_sync_table(md);
+
+ return map;
+}
+
+/*
+ * Constructor for a new device.
+ */
+int dm_create(int minor, struct mapped_device **result)
+{
+ struct mapped_device *md;
+
+ md = alloc_dev(minor);
+ if (!md)
+ return -ENXIO;
+
+ dm_sysfs_init(md);
+
+ *result = md;
+ return 0;
+}
+
+/*
+ * Functions to manage md->type.
+ * All are required to hold md->type_lock.
+ */
+void dm_lock_md_type(struct mapped_device *md)
+{
+ mutex_lock(&md->type_lock);
+}
+
+void dm_unlock_md_type(struct mapped_device *md)
+{
+ mutex_unlock(&md->type_lock);
+}
+
+void dm_set_md_type(struct mapped_device *md, unsigned type)
+{
+ BUG_ON(!mutex_is_locked(&md->type_lock));
+ md->type = type;
+}
+
+unsigned dm_get_md_type(struct mapped_device *md)
+{
+ BUG_ON(!mutex_is_locked(&md->type_lock));
+ return md->type;
+}
+
+struct target_type *dm_get_immutable_target_type(struct mapped_device *md)
+{
+ return md->immutable_target_type;
+}
+
+/*
+ * The queue_limits are only valid as long as you have a reference
+ * count on 'md'.
+ */
+struct queue_limits *dm_get_queue_limits(struct mapped_device *md)
+{
+ BUG_ON(!atomic_read(&md->holders));
+ return &md->queue->limits;
+}
+EXPORT_SYMBOL_GPL(dm_get_queue_limits);
+
+static void init_rq_based_worker_thread(struct mapped_device *md)
+{
+ /* Initialize the request-based DM worker thread */
+ init_kthread_worker(&md->kworker);
+ md->kworker_task = kthread_run(kthread_worker_fn, &md->kworker,
+ "kdmwork-%s", dm_device_name(md));
+}
+
+/*
+ * Fully initialize a request-based queue (->elevator, ->request_fn, etc).
+ */
+static int dm_init_request_based_queue(struct mapped_device *md)
+{
+ struct request_queue *q = NULL;
+
+ /* Fully initialize the queue */
+ q = blk_init_allocated_queue(md->queue, dm_request_fn, NULL);
+ if (!q)
+ return -EINVAL;
+
+ /* disable dm_request_fn's merge heuristic by default */
+ md->seq_rq_merge_deadline_usecs = 0;
+
+ md->queue = q;
+ dm_init_old_md_queue(md);
+ blk_queue_softirq_done(md->queue, dm_softirq_done);
+ blk_queue_prep_rq(md->queue, dm_prep_fn);
+
+ init_rq_based_worker_thread(md);
+
+ elv_register_queue(md->queue);
+
+ return 0;
+}
+
+static int dm_mq_init_request(void *data, struct request *rq,
+ unsigned int hctx_idx, unsigned int request_idx,
+ unsigned int numa_node)
+{
+ struct mapped_device *md = data;
+ struct dm_rq_target_io *tio = blk_mq_rq_to_pdu(rq);
+
+ /*
+ * Must initialize md member of tio, otherwise it won't
+ * be available in dm_mq_queue_rq.
+ */
+ tio->md = md;
+
+ return 0;
+}
+
+static int dm_mq_queue_rq(struct blk_mq_hw_ctx *hctx,
+ const struct blk_mq_queue_data *bd)
+{
+ struct request *rq = bd->rq;
+ struct dm_rq_target_io *tio = blk_mq_rq_to_pdu(rq);
+ struct mapped_device *md = tio->md;
+ int srcu_idx;
+ struct dm_table *map = dm_get_live_table(md, &srcu_idx);
+ struct dm_target *ti;
+ sector_t pos;
+
+ /* always use block 0 to find the target for flushes for now */
+ pos = 0;
+ if (!(rq->cmd_flags & REQ_FLUSH))
+ pos = blk_rq_pos(rq);
+
+ ti = dm_table_find_target(map, pos);
+ if (!dm_target_is_valid(ti)) {
+ dm_put_live_table(md, srcu_idx);
+ DMERR_LIMIT("request attempted access beyond the end of device");
+ /*
+ * Must perform setup, that rq_completed() requires,
+ * before returning BLK_MQ_RQ_QUEUE_ERROR
+ */
+ dm_start_request(md, rq);
+ return BLK_MQ_RQ_QUEUE_ERROR;
+ }
+ dm_put_live_table(md, srcu_idx);
+
+ if (ti->type->busy && ti->type->busy(ti))
+ return BLK_MQ_RQ_QUEUE_BUSY;
+
+ dm_start_request(md, rq);
+
+ /* Init tio using md established in .init_request */
+ init_tio(tio, rq, md);
+
+ /*
+ * Establish tio->ti before queuing work (map_tio_request)
+ * or making direct call to map_request().
+ */
+ tio->ti = ti;
+
+ /* Clone the request if underlying devices aren't blk-mq */
+ if (dm_table_get_type(map) == DM_TYPE_REQUEST_BASED) {
+ /* clone request is allocated at the end of the pdu */
+ tio->clone = (void *)blk_mq_rq_to_pdu(rq) + sizeof(struct dm_rq_target_io);
+ (void) clone_rq(rq, md, tio, GFP_ATOMIC);
+ queue_kthread_work(&md->kworker, &tio->work);
+ } else {
+ /* Direct call is fine since .queue_rq allows allocations */
+ if (map_request(tio, rq, md) == DM_MAPIO_REQUEUE) {
+ /* Undo dm_start_request() before requeuing */
+ rq_completed(md, rq_data_dir(rq), false);
+ return BLK_MQ_RQ_QUEUE_BUSY;
+ }
+ }
+
+ return BLK_MQ_RQ_QUEUE_OK;
+}
+
+static struct blk_mq_ops dm_mq_ops = {
+ .queue_rq = dm_mq_queue_rq,
+ .map_queue = blk_mq_map_queue,
+ .complete = dm_softirq_done,
+ .init_request = dm_mq_init_request,
+};
+
+static int dm_init_request_based_blk_mq_queue(struct mapped_device *md)
+{
+ unsigned md_type = dm_get_md_type(md);
+ struct request_queue *q;
+ int err;
+
+ memset(&md->tag_set, 0, sizeof(md->tag_set));
+ md->tag_set.ops = &dm_mq_ops;
+ md->tag_set.queue_depth = BLKDEV_MAX_RQ;
+ md->tag_set.numa_node = NUMA_NO_NODE;
+ md->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
+ md->tag_set.nr_hw_queues = 1;
+ if (md_type == DM_TYPE_REQUEST_BASED) {
+ /* make the memory for non-blk-mq clone part of the pdu */
+ md->tag_set.cmd_size = sizeof(struct dm_rq_target_io) + sizeof(struct request);
+ } else
+ md->tag_set.cmd_size = sizeof(struct dm_rq_target_io);
+ md->tag_set.driver_data = md;
+
+ err = blk_mq_alloc_tag_set(&md->tag_set);
+ if (err)
+ return err;
+
+ q = blk_mq_init_allocated_queue(&md->tag_set, md->queue);
+ if (IS_ERR(q)) {
+ err = PTR_ERR(q);
+ goto out_tag_set;
+ }
+ md->queue = q;
+ dm_init_md_queue(md);
+
+ /* backfill 'mq' sysfs registration normally done in blk_register_queue */
+ blk_mq_register_disk(md->disk);
+
+ if (md_type == DM_TYPE_REQUEST_BASED)
+ init_rq_based_worker_thread(md);
+
+ return 0;
+
+out_tag_set:
+ blk_mq_free_tag_set(&md->tag_set);
+ return err;
+}
+
+static unsigned filter_md_type(unsigned type, struct mapped_device *md)
+{
+ if (type == DM_TYPE_BIO_BASED)
+ return type;
+
+ return !md->use_blk_mq ? DM_TYPE_REQUEST_BASED : DM_TYPE_MQ_REQUEST_BASED;
+}
+
+/*
+ * Setup the DM device's queue based on md's type
+ */
+int dm_setup_md_queue(struct mapped_device *md)
+{
+ int r;
+ unsigned md_type = filter_md_type(dm_get_md_type(md), md);
+
+ switch (md_type) {
+ case DM_TYPE_REQUEST_BASED:
+ r = dm_init_request_based_queue(md);
+ if (r) {
+ DMWARN("Cannot initialize queue for request-based mapped device");
+ return r;
+ }
+ break;
+ case DM_TYPE_MQ_REQUEST_BASED:
+ r = dm_init_request_based_blk_mq_queue(md);
+ if (r) {
+ DMWARN("Cannot initialize queue for request-based blk-mq mapped device");
+ return r;
+ }
+ break;
+ case DM_TYPE_BIO_BASED:
+ dm_init_old_md_queue(md);
+ blk_queue_make_request(md->queue, dm_make_request);
+ blk_queue_merge_bvec(md->queue, dm_merge_bvec);
+ break;
+ }
+
+ return 0;
+}
+
+struct mapped_device *dm_get_md(dev_t dev)
+{
+ struct mapped_device *md;
+ unsigned minor = MINOR(dev);
+
+ if (MAJOR(dev) != _major || minor >= (1 << MINORBITS))
+ return NULL;
+
+ spin_lock(&_minor_lock);
+
+ md = idr_find(&_minor_idr, minor);
+ if (md) {
+ if ((md == MINOR_ALLOCED ||
+ (MINOR(disk_devt(dm_disk(md))) != minor) ||
+ dm_deleting_md(md) ||
+ test_bit(DMF_FREEING, &md->flags))) {
+ md = NULL;
+ goto out;
+ }
+ dm_get(md);
+ }
+
+out:
+ spin_unlock(&_minor_lock);
+
+ return md;
+}
+EXPORT_SYMBOL_GPL(dm_get_md);
+
+void *dm_get_mdptr(struct mapped_device *md)
+{
+ return md->interface_ptr;
+}
+
+void dm_set_mdptr(struct mapped_device *md, void *ptr)
+{
+ md->interface_ptr = ptr;
+}
+
+void dm_get(struct mapped_device *md)
+{
+ atomic_inc(&md->holders);
+ BUG_ON(test_bit(DMF_FREEING, &md->flags));
+}
+
+int dm_hold(struct mapped_device *md)
+{
+ spin_lock(&_minor_lock);
+ if (test_bit(DMF_FREEING, &md->flags)) {
+ spin_unlock(&_minor_lock);
+ return -EBUSY;
+ }
+ dm_get(md);
+ spin_unlock(&_minor_lock);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(dm_hold);
+
+const char *dm_device_name(struct mapped_device *md)
+{
+ return md->name;
+}
+EXPORT_SYMBOL_GPL(dm_device_name);
+
+static void __dm_destroy(struct mapped_device *md, bool wait)
+{
+ struct dm_table *map;
+ int srcu_idx;
+
+ might_sleep();
+
+ map = dm_get_live_table(md, &srcu_idx);
+
+ spin_lock(&_minor_lock);
+ idr_replace(&_minor_idr, MINOR_ALLOCED, MINOR(disk_devt(dm_disk(md))));
+ set_bit(DMF_FREEING, &md->flags);
+ spin_unlock(&_minor_lock);
+
+ if (dm_request_based(md) && md->kworker_task)
+ flush_kthread_worker(&md->kworker);
+
+ /*
+ * Take suspend_lock so that presuspend and postsuspend methods
+ * do not race with internal suspend.
+ */
+ mutex_lock(&md->suspend_lock);
+ if (!dm_suspended_md(md)) {
+ dm_table_presuspend_targets(map);
+ dm_table_postsuspend_targets(map);
+ }
+ mutex_unlock(&md->suspend_lock);
+
+ /* dm_put_live_table must be before msleep, otherwise deadlock is possible */
+ dm_put_live_table(md, srcu_idx);
+
+ /*
+ * Rare, but there may be I/O requests still going to complete,
+ * for example. Wait for all references to disappear.
+ * No one should increment the reference count of the mapped_device,
+ * after the mapped_device state becomes DMF_FREEING.
+ */
+ if (wait)
+ while (atomic_read(&md->holders))
+ msleep(1);
+ else if (atomic_read(&md->holders))
+ DMWARN("%s: Forcibly removing mapped_device still in use! (%d users)",
+ dm_device_name(md), atomic_read(&md->holders));
+
+ dm_sysfs_exit(md);
+ dm_table_destroy(__unbind(md));
+ free_dev(md);
+}
+
+void dm_destroy(struct mapped_device *md)
+{
+ __dm_destroy(md, true);
+}
+
+void dm_destroy_immediate(struct mapped_device *md)
+{
+ __dm_destroy(md, false);
+}
+
+void dm_put(struct mapped_device *md)
+{
+ atomic_dec(&md->holders);
+}
+EXPORT_SYMBOL_GPL(dm_put);
+
+static int dm_wait_for_completion(struct mapped_device *md, int interruptible)
+{
+ int r = 0;
+ DECLARE_WAITQUEUE(wait, current);
+
+ add_wait_queue(&md->wait, &wait);
+
+ while (1) {
+ set_current_state(interruptible);
+
+ if (!md_in_flight(md))
+ break;
+
+ if (interruptible == TASK_INTERRUPTIBLE &&
+ signal_pending(current)) {
+ r = -EINTR;
+ break;
+ }
+
+ io_schedule();
+ }
+ set_current_state(TASK_RUNNING);
+
+ remove_wait_queue(&md->wait, &wait);
+
+ return r;
+}
+
+/*
+ * Process the deferred bios
+ */
+static void dm_wq_work(struct work_struct *work)
+{
+ struct mapped_device *md = container_of(work, struct mapped_device,
+ work);
+ struct bio *c;
+ int srcu_idx;
+ struct dm_table *map;
+
+ map = dm_get_live_table(md, &srcu_idx);
+
+ while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
+ spin_lock_irq(&md->deferred_lock);
+ c = bio_list_pop(&md->deferred);
+ spin_unlock_irq(&md->deferred_lock);
+
+ if (!c)
+ break;
+
+ if (dm_request_based(md))
+ generic_make_request(c);
+ else
+ __split_and_process_bio(md, map, c);
+ }
+
+ dm_put_live_table(md, srcu_idx);
+}
+
+static void dm_queue_flush(struct mapped_device *md)
+{
+ clear_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
+ smp_mb__after_atomic();
+ queue_work(md->wq, &md->work);
+}
+
+/*
+ * Swap in a new table, returning the old one for the caller to destroy.
+ */
+struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table)
+{
+ struct dm_table *live_map = NULL, *map = ERR_PTR(-EINVAL);
+ struct queue_limits limits;
+ int r;
+
+ mutex_lock(&md->suspend_lock);
+
+ /* device must be suspended */
+ if (!dm_suspended_md(md))
+ goto out;
+
+ /*
+ * If the new table has no data devices, retain the existing limits.
+ * This helps multipath with queue_if_no_path if all paths disappear,
+ * then new I/O is queued based on these limits, and then some paths
+ * reappear.
+ */
+ if (dm_table_has_no_data_devices(table)) {
+ live_map = dm_get_live_table_fast(md);
+ if (live_map)
+ limits = md->queue->limits;
+ dm_put_live_table_fast(md);
+ }
+
+ if (!live_map) {
+ r = dm_calculate_queue_limits(table, &limits);
+ if (r) {
+ map = ERR_PTR(r);
+ goto out;
+ }
+ }
+
+ map = __bind(md, table, &limits);
+
+out:
+ mutex_unlock(&md->suspend_lock);
+ return map;
+}
+
+/*
+ * Functions to lock and unlock any filesystem running on the
+ * device.
+ */
+static int lock_fs(struct mapped_device *md)
+{
+ int r;
+
+ WARN_ON(md->frozen_sb);
+
+ md->frozen_sb = freeze_bdev(md->bdev);
+ if (IS_ERR(md->frozen_sb)) {
+ r = PTR_ERR(md->frozen_sb);
+ md->frozen_sb = NULL;
+ return r;
+ }
+
+ set_bit(DMF_FROZEN, &md->flags);
+
+ return 0;
+}
+
+static void unlock_fs(struct mapped_device *md)
+{
+ if (!test_bit(DMF_FROZEN, &md->flags))
+ return;
+
+ thaw_bdev(md->bdev, md->frozen_sb);
+ md->frozen_sb = NULL;
+ clear_bit(DMF_FROZEN, &md->flags);
+}
+
+/*
+ * If __dm_suspend returns 0, the device is completely quiescent
+ * now. There is no request-processing activity. All new requests
+ * are being added to md->deferred list.
+ *
+ * Caller must hold md->suspend_lock
+ */
+static int __dm_suspend(struct mapped_device *md, struct dm_table *map,
+ unsigned suspend_flags, int interruptible)
+{
+ bool do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG;
+ bool noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG;
+ int r;
+
+ /*
+ * DMF_NOFLUSH_SUSPENDING must be set before presuspend.
+ * This flag is cleared before dm_suspend returns.
+ */
+ if (noflush)
+ set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
+
+ /*
+ * This gets reverted if there's an error later and the targets
+ * provide the .presuspend_undo hook.
+ */
+ dm_table_presuspend_targets(map);
+
+ /*
+ * Flush I/O to the device.
+ * Any I/O submitted after lock_fs() may not be flushed.
+ * noflush takes precedence over do_lockfs.
+ * (lock_fs() flushes I/Os and waits for them to complete.)
+ */
+ if (!noflush && do_lockfs) {
+ r = lock_fs(md);
+ if (r) {
+ dm_table_presuspend_undo_targets(map);
+ return r;
+ }
+ }
+
+ /*
+ * Here we must make sure that no processes are submitting requests
+ * to target drivers i.e. no one may be executing
+ * __split_and_process_bio. This is called from dm_request and
+ * dm_wq_work.
+ *
+ * To get all processes out of __split_and_process_bio in dm_request,
+ * we take the write lock. To prevent any process from reentering
+ * __split_and_process_bio from dm_request and quiesce the thread
+ * (dm_wq_work), we set BMF_BLOCK_IO_FOR_SUSPEND and call
+ * flush_workqueue(md->wq).
+ */
+ set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
+ if (map)
+ synchronize_srcu(&md->io_barrier);
+
+ /*
+ * Stop md->queue before flushing md->wq in case request-based
+ * dm defers requests to md->wq from md->queue.
+ */
+ if (dm_request_based(md)) {
+ stop_queue(md->queue);
+ if (md->kworker_task)
+ flush_kthread_worker(&md->kworker);
+ }
+
+ flush_workqueue(md->wq);
+
+ /*
+ * At this point no more requests are entering target request routines.
+ * We call dm_wait_for_completion to wait for all existing requests
+ * to finish.
+ */
+ r = dm_wait_for_completion(md, interruptible);
+
+ if (noflush)
+ clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
+ if (map)
+ synchronize_srcu(&md->io_barrier);
+
+ /* were we interrupted ? */
+ if (r < 0) {
+ dm_queue_flush(md);
+
+ if (dm_request_based(md))
+ start_queue(md->queue);
+
+ unlock_fs(md);
+ dm_table_presuspend_undo_targets(map);
+ /* pushback list is already flushed, so skip flush */
+ }
+
+ return r;
+}
+
+/*
+ * We need to be able to change a mapping table under a mounted
+ * filesystem. For example we might want to move some data in
+ * the background. Before the table can be swapped with
+ * dm_bind_table, dm_suspend must be called to flush any in
+ * flight bios and ensure that any further io gets deferred.
+ */
+/*
+ * Suspend mechanism in request-based dm.
+ *
+ * 1. Flush all I/Os by lock_fs() if needed.
+ * 2. Stop dispatching any I/O by stopping the request_queue.
+ * 3. Wait for all in-flight I/Os to be completed or requeued.
+ *
+ * To abort suspend, start the request_queue.
+ */
+int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
+{
+ struct dm_table *map = NULL;
+ int r = 0;
+
+retry:
+ mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING);
+
+ if (dm_suspended_md(md)) {
+ r = -EINVAL;
+ goto out_unlock;
+ }
+
+ if (dm_suspended_internally_md(md)) {
+ /* already internally suspended, wait for internal resume */
+ mutex_unlock(&md->suspend_lock);
+ r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE);
+ if (r)
+ return r;
+ goto retry;
+ }
+
+ map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
+
+ r = __dm_suspend(md, map, suspend_flags, TASK_INTERRUPTIBLE);
+ if (r)
+ goto out_unlock;
+
+ set_bit(DMF_SUSPENDED, &md->flags);
+
+ dm_table_postsuspend_targets(map);
+
+out_unlock:
+ mutex_unlock(&md->suspend_lock);
+ return r;
+}
+
+static int __dm_resume(struct mapped_device *md, struct dm_table *map)
+{
+ if (map) {
+ int r = dm_table_resume_targets(map);
+ if (r)
+ return r;
+ }
+
+ dm_queue_flush(md);
+
+ /*
+ * Flushing deferred I/Os must be done after targets are resumed
+ * so that mapping of targets can work correctly.
+ * Request-based dm is queueing the deferred I/Os in its request_queue.
+ */
+ if (dm_request_based(md))
+ start_queue(md->queue);
+
+ unlock_fs(md);
+
+ return 0;
+}
+
+int dm_resume(struct mapped_device *md)
+{
+ int r = -EINVAL;
+ struct dm_table *map = NULL;
+
+retry:
+ mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING);
+
+ if (!dm_suspended_md(md))
+ goto out;
+
+ if (dm_suspended_internally_md(md)) {
+ /* already internally suspended, wait for internal resume */
+ mutex_unlock(&md->suspend_lock);
+ r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE);
+ if (r)
+ return r;
+ goto retry;
+ }
+
+ map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
+ if (!map || !dm_table_get_size(map))
+ goto out;
+
+ r = __dm_resume(md, map);
+ if (r)
+ goto out;
+
+ clear_bit(DMF_SUSPENDED, &md->flags);
+
+ r = 0;
+out:
+ mutex_unlock(&md->suspend_lock);
+
+ return r;
+}
+
+/*
+ * Internal suspend/resume works like userspace-driven suspend. It waits
+ * until all bios finish and prevents issuing new bios to the target drivers.
+ * It may be used only from the kernel.
+ */
+
+static void __dm_internal_suspend(struct mapped_device *md, unsigned suspend_flags)
+{
+ struct dm_table *map = NULL;
+
+ if (md->internal_suspend_count++)
+ return; /* nested internal suspend */
+
+ if (dm_suspended_md(md)) {
+ set_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
+ return; /* nest suspend */
+ }
+
+ map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
+
+ /*
+ * Using TASK_UNINTERRUPTIBLE because only NOFLUSH internal suspend is
+ * supported. Properly supporting a TASK_INTERRUPTIBLE internal suspend
+ * would require changing .presuspend to return an error -- avoid this
+ * until there is a need for more elaborate variants of internal suspend.
+ */
+ (void) __dm_suspend(md, map, suspend_flags, TASK_UNINTERRUPTIBLE);
+
+ set_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
+
+ dm_table_postsuspend_targets(map);
+}
+
+static void __dm_internal_resume(struct mapped_device *md)
+{
+ BUG_ON(!md->internal_suspend_count);
+
+ if (--md->internal_suspend_count)
+ return; /* resume from nested internal suspend */
+
+ if (dm_suspended_md(md))
+ goto done; /* resume from nested suspend */
+
+ /*
+ * NOTE: existing callers don't need to call dm_table_resume_targets
+ * (which may fail -- so best to avoid it for now by passing NULL map)
+ */
+ (void) __dm_resume(md, NULL);
+
+done:
+ clear_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
+ smp_mb__after_atomic();
+ wake_up_bit(&md->flags, DMF_SUSPENDED_INTERNALLY);
+}
+
+void dm_internal_suspend_noflush(struct mapped_device *md)
+{
+ mutex_lock(&md->suspend_lock);
+ __dm_internal_suspend(md, DM_SUSPEND_NOFLUSH_FLAG);
+ mutex_unlock(&md->suspend_lock);
+}
+EXPORT_SYMBOL_GPL(dm_internal_suspend_noflush);
+
+void dm_internal_resume(struct mapped_device *md)
+{
+ mutex_lock(&md->suspend_lock);
+ __dm_internal_resume(md);
+ mutex_unlock(&md->suspend_lock);
+}
+EXPORT_SYMBOL_GPL(dm_internal_resume);
+
+/*
+ * Fast variants of internal suspend/resume hold md->suspend_lock,
+ * which prevents interaction with userspace-driven suspend.
+ */
+
+void dm_internal_suspend_fast(struct mapped_device *md)
+{
+ mutex_lock(&md->suspend_lock);
+ if (dm_suspended_md(md) || dm_suspended_internally_md(md))
+ return;
+
+ set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
+ synchronize_srcu(&md->io_barrier);
+ flush_workqueue(md->wq);
+ dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
+}
+EXPORT_SYMBOL_GPL(dm_internal_suspend_fast);
+
+void dm_internal_resume_fast(struct mapped_device *md)
+{
+ if (dm_suspended_md(md) || dm_suspended_internally_md(md))
+ goto done;
+
+ dm_queue_flush(md);
+
+done:
+ mutex_unlock(&md->suspend_lock);
+}
+EXPORT_SYMBOL_GPL(dm_internal_resume_fast);
+
+/*-----------------------------------------------------------------
+ * Event notification.
+ *---------------------------------------------------------------*/
+int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
+ unsigned cookie)
+{
+ char udev_cookie[DM_COOKIE_LENGTH];
+ char *envp[] = { udev_cookie, NULL };
+
+ if (!cookie)
+ return kobject_uevent(&disk_to_dev(md->disk)->kobj, action);
+ else {
+ snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u",
+ DM_COOKIE_ENV_VAR_NAME, cookie);
+ return kobject_uevent_env(&disk_to_dev(md->disk)->kobj,
+ action, envp);
+ }
+}
+
+uint32_t dm_next_uevent_seq(struct mapped_device *md)
+{
+ return atomic_add_return(1, &md->uevent_seq);
+}
+
+uint32_t dm_get_event_nr(struct mapped_device *md)
+{
+ return atomic_read(&md->event_nr);
+}
+
+int dm_wait_event(struct mapped_device *md, int event_nr)
+{
+ return wait_event_interruptible(md->eventq,
+ (event_nr != atomic_read(&md->event_nr)));
+}
+
+void dm_uevent_add(struct mapped_device *md, struct list_head *elist)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&md->uevent_lock, flags);
+ list_add(elist, &md->uevent_list);
+ spin_unlock_irqrestore(&md->uevent_lock, flags);
+}
+
+/*
+ * The gendisk is only valid as long as you have a reference
+ * count on 'md'.
+ */
+struct gendisk *dm_disk(struct mapped_device *md)
+{
+ return md->disk;
+}
+EXPORT_SYMBOL_GPL(dm_disk);
+
+struct kobject *dm_kobject(struct mapped_device *md)
+{
+ return &md->kobj_holder.kobj;
+}
+
+struct mapped_device *dm_get_from_kobject(struct kobject *kobj)
+{
+ struct mapped_device *md;
+
+ md = container_of(kobj, struct mapped_device, kobj_holder.kobj);
+
+ if (test_bit(DMF_FREEING, &md->flags) ||
+ dm_deleting_md(md))
+ return NULL;
+
+ dm_get(md);
+ return md;
+}
+
+int dm_suspended_md(struct mapped_device *md)
+{
+ return test_bit(DMF_SUSPENDED, &md->flags);
+}
+
+int dm_suspended_internally_md(struct mapped_device *md)
+{
+ return test_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
+}
+
+int dm_test_deferred_remove_flag(struct mapped_device *md)
+{
+ return test_bit(DMF_DEFERRED_REMOVE, &md->flags);
+}
+
+int dm_suspended(struct dm_target *ti)
+{
+ return dm_suspended_md(dm_table_get_md(ti->table));
+}
+EXPORT_SYMBOL_GPL(dm_suspended);
+
+int dm_noflush_suspending(struct dm_target *ti)
+{
+ return __noflush_suspending(dm_table_get_md(ti->table));
+}
+EXPORT_SYMBOL_GPL(dm_noflush_suspending);
+
+struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, unsigned type,
+ unsigned integrity, unsigned per_bio_data_size)
+{
+ struct dm_md_mempools *pools = kzalloc(sizeof(*pools), GFP_KERNEL);
+ struct kmem_cache *cachep = NULL;
+ unsigned int pool_size = 0;
+ unsigned int front_pad;
+
+ if (!pools)
+ return NULL;
+
+ type = filter_md_type(type, md);
+
+ switch (type) {
+ case DM_TYPE_BIO_BASED:
+ cachep = _io_cache;
+ pool_size = dm_get_reserved_bio_based_ios();
+ front_pad = roundup(per_bio_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone);
+ break;
+ case DM_TYPE_REQUEST_BASED:
+ cachep = _rq_tio_cache;
+ pool_size = dm_get_reserved_rq_based_ios();
+ pools->rq_pool = mempool_create_slab_pool(pool_size, _rq_cache);
+ if (!pools->rq_pool)
+ goto out;
+ /* fall through to setup remaining rq-based pools */
+ case DM_TYPE_MQ_REQUEST_BASED:
+ if (!pool_size)
+ pool_size = dm_get_reserved_rq_based_ios();
+ front_pad = offsetof(struct dm_rq_clone_bio_info, clone);
+ /* per_bio_data_size is not used. See __bind_mempools(). */
+ WARN_ON(per_bio_data_size != 0);
+ break;
+ default:
+ BUG();
+ }
+
+ if (cachep) {
+ pools->io_pool = mempool_create_slab_pool(pool_size, cachep);
+ if (!pools->io_pool)
+ goto out;
+ }
+
+ pools->bs = bioset_create_nobvec(pool_size, front_pad);
+ if (!pools->bs)
+ goto out;
+
+ if (integrity && bioset_integrity_create(pools->bs, pool_size))
+ goto out;
+
+ return pools;
+
+out:
+ dm_free_md_mempools(pools);
+
+ return NULL;
+}
+
+void dm_free_md_mempools(struct dm_md_mempools *pools)
+{
+ if (!pools)
+ return;
+
+ if (pools->io_pool)
+ mempool_destroy(pools->io_pool);
+
+ if (pools->rq_pool)
+ mempool_destroy(pools->rq_pool);
+
+ if (pools->bs)
+ bioset_free(pools->bs);
+
+ kfree(pools);
+}
+
+static const struct block_device_operations dm_blk_dops = {
+ .open = dm_blk_open,
+ .release = dm_blk_close,
+ .ioctl = dm_blk_ioctl,
+ .getgeo = dm_blk_getgeo,
+ .owner = THIS_MODULE
+};
+
+/*
+ * module hooks
+ */
+module_init(dm_init);
+module_exit(dm_exit);
+
+module_param(major, uint, 0);
+MODULE_PARM_DESC(major, "The major number of the device mapper");
+
+module_param(reserved_bio_based_ios, uint, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(reserved_bio_based_ios, "Reserved IOs in bio-based mempools");
+
+module_param(reserved_rq_based_ios, uint, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(reserved_rq_based_ios, "Reserved IOs in request-based mempools");
+
+module_param(use_blk_mq, bool, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(use_blk_mq, "Use block multiqueue for request-based DM devices");
+
+MODULE_DESCRIPTION(DM_NAME " driver");
+MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
+MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
new file mode 100644
index 000000000..6123c2bf9
--- /dev/null
+++ b/drivers/md/dm.h
@@ -0,0 +1,244 @@
+/*
+ * Internal header file for device mapper
+ *
+ * Copyright (C) 2001, 2002 Sistina Software
+ * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
+ *
+ * This file is released under the LGPL.
+ */
+
+#ifndef DM_INTERNAL_H
+#define DM_INTERNAL_H
+
+#include <linux/fs.h>
+#include <linux/device-mapper.h>
+#include <linux/list.h>
+#include <linux/blkdev.h>
+#include <linux/hdreg.h>
+#include <linux/completion.h>
+#include <linux/kobject.h>
+
+#include "dm-stats.h"
+
+/*
+ * Suspend feature flags
+ */
+#define DM_SUSPEND_LOCKFS_FLAG (1 << 0)
+#define DM_SUSPEND_NOFLUSH_FLAG (1 << 1)
+
+/*
+ * Status feature flags
+ */
+#define DM_STATUS_NOFLUSH_FLAG (1 << 0)
+
+/*
+ * Type of table and mapped_device's mempool
+ */
+#define DM_TYPE_NONE 0
+#define DM_TYPE_BIO_BASED 1
+#define DM_TYPE_REQUEST_BASED 2
+#define DM_TYPE_MQ_REQUEST_BASED 3
+
+/*
+ * List of devices that a metadevice uses and should open/close.
+ */
+struct dm_dev_internal {
+ struct list_head list;
+ atomic_t count;
+ struct dm_dev *dm_dev;
+};
+
+struct dm_table;
+struct dm_md_mempools;
+
+/*-----------------------------------------------------------------
+ * Internal table functions.
+ *---------------------------------------------------------------*/
+void dm_table_destroy(struct dm_table *t);
+void dm_table_event_callback(struct dm_table *t,
+ void (*fn)(void *), void *context);
+struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index);
+struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector);
+bool dm_table_has_no_data_devices(struct dm_table *table);
+int dm_calculate_queue_limits(struct dm_table *table,
+ struct queue_limits *limits);
+void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
+ struct queue_limits *limits);
+struct list_head *dm_table_get_devices(struct dm_table *t);
+void dm_table_presuspend_targets(struct dm_table *t);
+void dm_table_presuspend_undo_targets(struct dm_table *t);
+void dm_table_postsuspend_targets(struct dm_table *t);
+int dm_table_resume_targets(struct dm_table *t);
+int dm_table_any_congested(struct dm_table *t, int bdi_bits);
+unsigned dm_table_get_type(struct dm_table *t);
+struct target_type *dm_table_get_immutable_target_type(struct dm_table *t);
+bool dm_table_request_based(struct dm_table *t);
+bool dm_table_mq_request_based(struct dm_table *t);
+void dm_table_free_md_mempools(struct dm_table *t);
+struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t);
+
+int dm_queue_merge_is_compulsory(struct request_queue *q);
+
+void dm_lock_md_type(struct mapped_device *md);
+void dm_unlock_md_type(struct mapped_device *md);
+void dm_set_md_type(struct mapped_device *md, unsigned type);
+unsigned dm_get_md_type(struct mapped_device *md);
+struct target_type *dm_get_immutable_target_type(struct mapped_device *md);
+
+int dm_setup_md_queue(struct mapped_device *md);
+
+/*
+ * To check the return value from dm_table_find_target().
+ */
+#define dm_target_is_valid(t) ((t)->table)
+
+/*
+ * To check whether the target type is bio-based or not (request-based).
+ */
+#define dm_target_bio_based(t) ((t)->type->map != NULL)
+
+/*
+ * To check whether the target type is request-based or not (bio-based).
+ */
+#define dm_target_request_based(t) (((t)->type->map_rq != NULL) || \
+ ((t)->type->clone_and_map_rq != NULL))
+
+/*
+ * To check whether the target type is a hybrid (capable of being
+ * either request-based or bio-based).
+ */
+#define dm_target_hybrid(t) (dm_target_bio_based(t) && dm_target_request_based(t))
+
+/*-----------------------------------------------------------------
+ * A registry of target types.
+ *---------------------------------------------------------------*/
+int dm_target_init(void);
+void dm_target_exit(void);
+struct target_type *dm_get_target_type(const char *name);
+void dm_put_target_type(struct target_type *tt);
+int dm_target_iterate(void (*iter_func)(struct target_type *tt,
+ void *param), void *param);
+
+int dm_split_args(int *argc, char ***argvp, char *input);
+
+/*
+ * Is this mapped_device being deleted?
+ */
+int dm_deleting_md(struct mapped_device *md);
+
+/*
+ * Is this mapped_device suspended?
+ */
+int dm_suspended_md(struct mapped_device *md);
+
+/*
+ * Internal suspend and resume methods.
+ */
+int dm_suspended_internally_md(struct mapped_device *md);
+void dm_internal_suspend_fast(struct mapped_device *md);
+void dm_internal_resume_fast(struct mapped_device *md);
+void dm_internal_suspend_noflush(struct mapped_device *md);
+void dm_internal_resume(struct mapped_device *md);
+
+/*
+ * Test if the device is scheduled for deferred remove.
+ */
+int dm_test_deferred_remove_flag(struct mapped_device *md);
+
+/*
+ * Try to remove devices marked for deferred removal.
+ */
+void dm_deferred_remove(void);
+
+/*
+ * The device-mapper can be driven through one of two interfaces;
+ * ioctl or filesystem, depending which patch you have applied.
+ */
+int dm_interface_init(void);
+void dm_interface_exit(void);
+
+/*
+ * sysfs interface
+ */
+struct dm_kobject_holder {
+ struct kobject kobj;
+ struct completion completion;
+};
+
+static inline struct completion *dm_get_completion_from_kobject(struct kobject *kobj)
+{
+ return &container_of(kobj, struct dm_kobject_holder, kobj)->completion;
+}
+
+int dm_sysfs_init(struct mapped_device *md);
+void dm_sysfs_exit(struct mapped_device *md);
+struct kobject *dm_kobject(struct mapped_device *md);
+struct mapped_device *dm_get_from_kobject(struct kobject *kobj);
+
+/*
+ * The kobject helper
+ */
+void dm_kobject_release(struct kobject *kobj);
+
+/*
+ * Targets for linear and striped mappings
+ */
+int dm_linear_init(void);
+void dm_linear_exit(void);
+
+int dm_stripe_init(void);
+void dm_stripe_exit(void);
+
+/*
+ * mapped_device operations
+ */
+void dm_destroy(struct mapped_device *md);
+void dm_destroy_immediate(struct mapped_device *md);
+int dm_open_count(struct mapped_device *md);
+int dm_lock_for_deletion(struct mapped_device *md, bool mark_deferred, bool only_deferred);
+int dm_cancel_deferred_remove(struct mapped_device *md);
+int dm_request_based(struct mapped_device *md);
+sector_t dm_get_size(struct mapped_device *md);
+struct request_queue *dm_get_md_queue(struct mapped_device *md);
+int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode,
+ struct dm_dev **result);
+void dm_put_table_device(struct mapped_device *md, struct dm_dev *d);
+struct dm_stats *dm_get_stats(struct mapped_device *md);
+
+int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
+ unsigned cookie);
+
+void dm_internal_suspend(struct mapped_device *md);
+void dm_internal_resume(struct mapped_device *md);
+
+bool dm_use_blk_mq(struct mapped_device *md);
+
+int dm_io_init(void);
+void dm_io_exit(void);
+
+int dm_kcopyd_init(void);
+void dm_kcopyd_exit(void);
+
+/*
+ * Mempool operations
+ */
+struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, unsigned type,
+ unsigned integrity, unsigned per_bio_data_size);
+void dm_free_md_mempools(struct dm_md_mempools *pools);
+
+/*
+ * Helpers that are used by DM core
+ */
+unsigned dm_get_reserved_bio_based_ios(void);
+unsigned dm_get_reserved_rq_based_ios(void);
+
+static inline bool dm_message_test_buffer_overflow(char *result, unsigned maxlen)
+{
+ return !maxlen || strlen(result) + 1 >= maxlen;
+}
+
+ssize_t dm_attr_rq_based_seq_io_merge_deadline_show(struct mapped_device *md, char *buf);
+ssize_t dm_attr_rq_based_seq_io_merge_deadline_store(struct mapped_device *md,
+ const char *buf, size_t count);
+
+#endif
diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c
new file mode 100644
index 000000000..1277eb26b
--- /dev/null
+++ b/drivers/md/faulty.c
@@ -0,0 +1,371 @@
+/*
+ * faulty.c : Multiple Devices driver for Linux
+ *
+ * Copyright (C) 2004 Neil Brown
+ *
+ * fautly-device-simulator personality for md
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * You should have received a copy of the GNU General Public License
+ * (for example /usr/src/linux/COPYING); if not, write to the Free
+ * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+
+/*
+ * The "faulty" personality causes some requests to fail.
+ *
+ * Possible failure modes are:
+ * reads fail "randomly" but succeed on retry
+ * writes fail "randomly" but succeed on retry
+ * reads for some address fail and then persist until a write
+ * reads for some address fail and then persist irrespective of write
+ * writes for some address fail and persist
+ * all writes fail
+ *
+ * Different modes can be active at a time, but only
+ * one can be set at array creation. Others can be added later.
+ * A mode can be one-shot or recurrent with the recurrence being
+ * once in every N requests.
+ * The bottom 5 bits of the "layout" indicate the mode. The
+ * remainder indicate a period, or 0 for one-shot.
+ *
+ * There is an implementation limit on the number of concurrently
+ * persisting-faulty blocks. When a new fault is requested that would
+ * exceed the limit, it is ignored.
+ * All current faults can be clear using a layout of "0".
+ *
+ * Requests are always sent to the device. If they are to fail,
+ * we clone the bio and insert a new b_end_io into the chain.
+ */
+
+#define WriteTransient 0
+#define ReadTransient 1
+#define WritePersistent 2
+#define ReadPersistent 3
+#define WriteAll 4 /* doesn't go to device */
+#define ReadFixable 5
+#define Modes 6
+
+#define ClearErrors 31
+#define ClearFaults 30
+
+#define AllPersist 100 /* internal use only */
+#define NoPersist 101
+
+#define ModeMask 0x1f
+#define ModeShift 5
+
+#define MaxFault 50
+#include <linux/blkdev.h>
+#include <linux/module.h>
+#include <linux/raid/md_u.h>
+#include <linux/slab.h>
+#include "md.h"
+#include <linux/seq_file.h>
+
+
+static void faulty_fail(struct bio *bio, int error)
+{
+ struct bio *b = bio->bi_private;
+
+ b->bi_iter.bi_size = bio->bi_iter.bi_size;
+ b->bi_iter.bi_sector = bio->bi_iter.bi_sector;
+
+ bio_put(bio);
+
+ bio_io_error(b);
+}
+
+struct faulty_conf {
+ int period[Modes];
+ atomic_t counters[Modes];
+ sector_t faults[MaxFault];
+ int modes[MaxFault];
+ int nfaults;
+ struct md_rdev *rdev;
+};
+
+static int check_mode(struct faulty_conf *conf, int mode)
+{
+ if (conf->period[mode] == 0 &&
+ atomic_read(&conf->counters[mode]) <= 0)
+ return 0; /* no failure, no decrement */
+
+
+ if (atomic_dec_and_test(&conf->counters[mode])) {
+ if (conf->period[mode])
+ atomic_set(&conf->counters[mode], conf->period[mode]);
+ return 1;
+ }
+ return 0;
+}
+
+static int check_sector(struct faulty_conf *conf, sector_t start, sector_t end, int dir)
+{
+ /* If we find a ReadFixable sector, we fix it ... */
+ int i;
+ for (i=0; i<conf->nfaults; i++)
+ if (conf->faults[i] >= start &&
+ conf->faults[i] < end) {
+ /* found it ... */
+ switch (conf->modes[i] * 2 + dir) {
+ case WritePersistent*2+WRITE: return 1;
+ case ReadPersistent*2+READ: return 1;
+ case ReadFixable*2+READ: return 1;
+ case ReadFixable*2+WRITE:
+ conf->modes[i] = NoPersist;
+ return 0;
+ case AllPersist*2+READ:
+ case AllPersist*2+WRITE: return 1;
+ default:
+ return 0;
+ }
+ }
+ return 0;
+}
+
+static void add_sector(struct faulty_conf *conf, sector_t start, int mode)
+{
+ int i;
+ int n = conf->nfaults;
+ for (i=0; i<conf->nfaults; i++)
+ if (conf->faults[i] == start) {
+ switch(mode) {
+ case NoPersist: conf->modes[i] = mode; return;
+ case WritePersistent:
+ if (conf->modes[i] == ReadPersistent ||
+ conf->modes[i] == ReadFixable)
+ conf->modes[i] = AllPersist;
+ else
+ conf->modes[i] = WritePersistent;
+ return;
+ case ReadPersistent:
+ if (conf->modes[i] == WritePersistent)
+ conf->modes[i] = AllPersist;
+ else
+ conf->modes[i] = ReadPersistent;
+ return;
+ case ReadFixable:
+ if (conf->modes[i] == WritePersistent ||
+ conf->modes[i] == ReadPersistent)
+ conf->modes[i] = AllPersist;
+ else
+ conf->modes[i] = ReadFixable;
+ return;
+ }
+ } else if (conf->modes[i] == NoPersist)
+ n = i;
+
+ if (n >= MaxFault)
+ return;
+ conf->faults[n] = start;
+ conf->modes[n] = mode;
+ if (conf->nfaults == n)
+ conf->nfaults = n+1;
+}
+
+static void make_request(struct mddev *mddev, struct bio *bio)
+{
+ struct faulty_conf *conf = mddev->private;
+ int failit = 0;
+
+ if (bio_data_dir(bio) == WRITE) {
+ /* write request */
+ if (atomic_read(&conf->counters[WriteAll])) {
+ /* special case - don't decrement, don't generic_make_request,
+ * just fail immediately
+ */
+ bio_endio(bio, -EIO);
+ return;
+ }
+
+ if (check_sector(conf, bio->bi_iter.bi_sector,
+ bio_end_sector(bio), WRITE))
+ failit = 1;
+ if (check_mode(conf, WritePersistent)) {
+ add_sector(conf, bio->bi_iter.bi_sector,
+ WritePersistent);
+ failit = 1;
+ }
+ if (check_mode(conf, WriteTransient))
+ failit = 1;
+ } else {
+ /* read request */
+ if (check_sector(conf, bio->bi_iter.bi_sector,
+ bio_end_sector(bio), READ))
+ failit = 1;
+ if (check_mode(conf, ReadTransient))
+ failit = 1;
+ if (check_mode(conf, ReadPersistent)) {
+ add_sector(conf, bio->bi_iter.bi_sector,
+ ReadPersistent);
+ failit = 1;
+ }
+ if (check_mode(conf, ReadFixable)) {
+ add_sector(conf, bio->bi_iter.bi_sector,
+ ReadFixable);
+ failit = 1;
+ }
+ }
+ if (failit) {
+ struct bio *b = bio_clone_mddev(bio, GFP_NOIO, mddev);
+
+ b->bi_bdev = conf->rdev->bdev;
+ b->bi_private = bio;
+ b->bi_end_io = faulty_fail;
+ bio = b;
+ } else
+ bio->bi_bdev = conf->rdev->bdev;
+
+ generic_make_request(bio);
+}
+
+static void status(struct seq_file *seq, struct mddev *mddev)
+{
+ struct faulty_conf *conf = mddev->private;
+ int n;
+
+ if ((n=atomic_read(&conf->counters[WriteTransient])) != 0)
+ seq_printf(seq, " WriteTransient=%d(%d)",
+ n, conf->period[WriteTransient]);
+
+ if ((n=atomic_read(&conf->counters[ReadTransient])) != 0)
+ seq_printf(seq, " ReadTransient=%d(%d)",
+ n, conf->period[ReadTransient]);
+
+ if ((n=atomic_read(&conf->counters[WritePersistent])) != 0)
+ seq_printf(seq, " WritePersistent=%d(%d)",
+ n, conf->period[WritePersistent]);
+
+ if ((n=atomic_read(&conf->counters[ReadPersistent])) != 0)
+ seq_printf(seq, " ReadPersistent=%d(%d)",
+ n, conf->period[ReadPersistent]);
+
+
+ if ((n=atomic_read(&conf->counters[ReadFixable])) != 0)
+ seq_printf(seq, " ReadFixable=%d(%d)",
+ n, conf->period[ReadFixable]);
+
+ if ((n=atomic_read(&conf->counters[WriteAll])) != 0)
+ seq_printf(seq, " WriteAll");
+
+ seq_printf(seq, " nfaults=%d", conf->nfaults);
+}
+
+
+static int reshape(struct mddev *mddev)
+{
+ int mode = mddev->new_layout & ModeMask;
+ int count = mddev->new_layout >> ModeShift;
+ struct faulty_conf *conf = mddev->private;
+
+ if (mddev->new_layout < 0)
+ return 0;
+
+ /* new layout */
+ if (mode == ClearFaults)
+ conf->nfaults = 0;
+ else if (mode == ClearErrors) {
+ int i;
+ for (i=0 ; i < Modes ; i++) {
+ conf->period[i] = 0;
+ atomic_set(&conf->counters[i], 0);
+ }
+ } else if (mode < Modes) {
+ conf->period[mode] = count;
+ if (!count) count++;
+ atomic_set(&conf->counters[mode], count);
+ } else
+ return -EINVAL;
+ mddev->new_layout = -1;
+ mddev->layout = -1; /* makes sure further changes come through */
+ return 0;
+}
+
+static sector_t faulty_size(struct mddev *mddev, sector_t sectors, int raid_disks)
+{
+ WARN_ONCE(raid_disks,
+ "%s does not support generic reshape\n", __func__);
+
+ if (sectors == 0)
+ return mddev->dev_sectors;
+
+ return sectors;
+}
+
+static int run(struct mddev *mddev)
+{
+ struct md_rdev *rdev;
+ int i;
+ struct faulty_conf *conf;
+
+ if (md_check_no_bitmap(mddev))
+ return -EINVAL;
+
+ conf = kmalloc(sizeof(*conf), GFP_KERNEL);
+ if (!conf)
+ return -ENOMEM;
+
+ for (i=0; i<Modes; i++) {
+ atomic_set(&conf->counters[i], 0);
+ conf->period[i] = 0;
+ }
+ conf->nfaults = 0;
+
+ rdev_for_each(rdev, mddev) {
+ conf->rdev = rdev;
+ disk_stack_limits(mddev->gendisk, rdev->bdev,
+ rdev->data_offset << 9);
+ }
+
+ md_set_array_sectors(mddev, faulty_size(mddev, 0, 0));
+ mddev->private = conf;
+
+ reshape(mddev);
+
+ return 0;
+}
+
+static void faulty_free(struct mddev *mddev, void *priv)
+{
+ struct faulty_conf *conf = priv;
+
+ kfree(conf);
+}
+
+static struct md_personality faulty_personality =
+{
+ .name = "faulty",
+ .level = LEVEL_FAULTY,
+ .owner = THIS_MODULE,
+ .make_request = make_request,
+ .run = run,
+ .free = faulty_free,
+ .status = status,
+ .check_reshape = reshape,
+ .size = faulty_size,
+};
+
+static int __init raid_init(void)
+{
+ return register_md_personality(&faulty_personality);
+}
+
+static void raid_exit(void)
+{
+ unregister_md_personality(&faulty_personality);
+}
+
+module_init(raid_init);
+module_exit(raid_exit);
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Fault injection personality for MD");
+MODULE_ALIAS("md-personality-10"); /* faulty */
+MODULE_ALIAS("md-faulty");
+MODULE_ALIAS("md-level--5");
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
new file mode 100644
index 000000000..fa7d577f3
--- /dev/null
+++ b/drivers/md/linear.c
@@ -0,0 +1,360 @@
+/*
+ linear.c : Multiple Devices driver for Linux
+ Copyright (C) 1994-96 Marc ZYNGIER
+ <zyngier@ufr-info-p7.ibp.fr> or
+ <maz@gloups.fdn.fr>
+
+ Linear mode management functions.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2, or (at your option)
+ any later version.
+
+ You should have received a copy of the GNU General Public License
+ (for example /usr/src/linux/COPYING); if not, write to the Free
+ Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+*/
+
+#include <linux/blkdev.h>
+#include <linux/raid/md_u.h>
+#include <linux/seq_file.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include "md.h"
+#include "linear.h"
+
+/*
+ * find which device holds a particular offset
+ */
+static inline struct dev_info *which_dev(struct mddev *mddev, sector_t sector)
+{
+ int lo, mid, hi;
+ struct linear_conf *conf;
+
+ lo = 0;
+ hi = mddev->raid_disks - 1;
+ conf = mddev->private;
+
+ /*
+ * Binary Search
+ */
+
+ while (hi > lo) {
+
+ mid = (hi + lo) / 2;
+ if (sector < conf->disks[mid].end_sector)
+ hi = mid;
+ else
+ lo = mid + 1;
+ }
+
+ return conf->disks + lo;
+}
+
+/**
+ * linear_mergeable_bvec -- tell bio layer if two requests can be merged
+ * @q: request queue
+ * @bvm: properties of new bio
+ * @biovec: the request that could be merged to it.
+ *
+ * Return amount of bytes we can take at this offset
+ */
+static int linear_mergeable_bvec(struct mddev *mddev,
+ struct bvec_merge_data *bvm,
+ struct bio_vec *biovec)
+{
+ struct dev_info *dev0;
+ unsigned long maxsectors, bio_sectors = bvm->bi_size >> 9;
+ sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
+ int maxbytes = biovec->bv_len;
+ struct request_queue *subq;
+
+ dev0 = which_dev(mddev, sector);
+ maxsectors = dev0->end_sector - sector;
+ subq = bdev_get_queue(dev0->rdev->bdev);
+ if (subq->merge_bvec_fn) {
+ bvm->bi_bdev = dev0->rdev->bdev;
+ bvm->bi_sector -= dev0->end_sector - dev0->rdev->sectors;
+ maxbytes = min(maxbytes, subq->merge_bvec_fn(subq, bvm,
+ biovec));
+ }
+
+ if (maxsectors < bio_sectors)
+ maxsectors = 0;
+ else
+ maxsectors -= bio_sectors;
+
+ if (maxsectors <= (PAGE_SIZE >> 9 ) && bio_sectors == 0)
+ return maxbytes;
+
+ if (maxsectors > (maxbytes >> 9))
+ return maxbytes;
+ else
+ return maxsectors << 9;
+}
+
+static int linear_congested(struct mddev *mddev, int bits)
+{
+ struct linear_conf *conf;
+ int i, ret = 0;
+
+ conf = mddev->private;
+
+ for (i = 0; i < mddev->raid_disks && !ret ; i++) {
+ struct request_queue *q = bdev_get_queue(conf->disks[i].rdev->bdev);
+ ret |= bdi_congested(&q->backing_dev_info, bits);
+ }
+
+ return ret;
+}
+
+static sector_t linear_size(struct mddev *mddev, sector_t sectors, int raid_disks)
+{
+ struct linear_conf *conf;
+ sector_t array_sectors;
+
+ conf = mddev->private;
+ WARN_ONCE(sectors || raid_disks,
+ "%s does not support generic reshape\n", __func__);
+ array_sectors = conf->array_sectors;
+
+ return array_sectors;
+}
+
+static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks)
+{
+ struct linear_conf *conf;
+ struct md_rdev *rdev;
+ int i, cnt;
+ bool discard_supported = false;
+
+ conf = kzalloc (sizeof (*conf) + raid_disks*sizeof(struct dev_info),
+ GFP_KERNEL);
+ if (!conf)
+ return NULL;
+
+ cnt = 0;
+ conf->array_sectors = 0;
+
+ rdev_for_each(rdev, mddev) {
+ int j = rdev->raid_disk;
+ struct dev_info *disk = conf->disks + j;
+ sector_t sectors;
+
+ if (j < 0 || j >= raid_disks || disk->rdev) {
+ printk(KERN_ERR "md/linear:%s: disk numbering problem. Aborting!\n",
+ mdname(mddev));
+ goto out;
+ }
+
+ disk->rdev = rdev;
+ if (mddev->chunk_sectors) {
+ sectors = rdev->sectors;
+ sector_div(sectors, mddev->chunk_sectors);
+ rdev->sectors = sectors * mddev->chunk_sectors;
+ }
+
+ disk_stack_limits(mddev->gendisk, rdev->bdev,
+ rdev->data_offset << 9);
+
+ conf->array_sectors += rdev->sectors;
+ cnt++;
+
+ if (blk_queue_discard(bdev_get_queue(rdev->bdev)))
+ discard_supported = true;
+ }
+ if (cnt != raid_disks) {
+ printk(KERN_ERR "md/linear:%s: not enough drives present. Aborting!\n",
+ mdname(mddev));
+ goto out;
+ }
+
+ if (!discard_supported)
+ queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
+ else
+ queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
+
+ /*
+ * Here we calculate the device offsets.
+ */
+ conf->disks[0].end_sector = conf->disks[0].rdev->sectors;
+
+ for (i = 1; i < raid_disks; i++)
+ conf->disks[i].end_sector =
+ conf->disks[i-1].end_sector +
+ conf->disks[i].rdev->sectors;
+
+ return conf;
+
+out:
+ kfree(conf);
+ return NULL;
+}
+
+static int linear_run (struct mddev *mddev)
+{
+ struct linear_conf *conf;
+ int ret;
+
+ if (md_check_no_bitmap(mddev))
+ return -EINVAL;
+ conf = linear_conf(mddev, mddev->raid_disks);
+
+ if (!conf)
+ return 1;
+ mddev->private = conf;
+ md_set_array_sectors(mddev, linear_size(mddev, 0, 0));
+
+ ret = md_integrity_register(mddev);
+ if (ret) {
+ kfree(conf);
+ mddev->private = NULL;
+ }
+ return ret;
+}
+
+static int linear_add(struct mddev *mddev, struct md_rdev *rdev)
+{
+ /* Adding a drive to a linear array allows the array to grow.
+ * It is permitted if the new drive has a matching superblock
+ * already on it, with raid_disk equal to raid_disks.
+ * It is achieved by creating a new linear_private_data structure
+ * and swapping it in in-place of the current one.
+ * The current one is never freed until the array is stopped.
+ * This avoids races.
+ */
+ struct linear_conf *newconf, *oldconf;
+
+ if (rdev->saved_raid_disk != mddev->raid_disks)
+ return -EINVAL;
+
+ rdev->raid_disk = rdev->saved_raid_disk;
+ rdev->saved_raid_disk = -1;
+
+ newconf = linear_conf(mddev,mddev->raid_disks+1);
+
+ if (!newconf)
+ return -ENOMEM;
+
+ mddev_suspend(mddev);
+ oldconf = mddev->private;
+ mddev->raid_disks++;
+ mddev->private = newconf;
+ md_set_array_sectors(mddev, linear_size(mddev, 0, 0));
+ set_capacity(mddev->gendisk, mddev->array_sectors);
+ mddev_resume(mddev);
+ revalidate_disk(mddev->gendisk);
+ kfree(oldconf);
+ return 0;
+}
+
+static void linear_free(struct mddev *mddev, void *priv)
+{
+ struct linear_conf *conf = priv;
+
+ kfree(conf);
+}
+
+static void linear_make_request(struct mddev *mddev, struct bio *bio)
+{
+ char b[BDEVNAME_SIZE];
+ struct dev_info *tmp_dev;
+ struct bio *split;
+ sector_t start_sector, end_sector, data_offset;
+
+ if (unlikely(bio->bi_rw & REQ_FLUSH)) {
+ md_flush_request(mddev, bio);
+ return;
+ }
+
+ do {
+ tmp_dev = which_dev(mddev, bio->bi_iter.bi_sector);
+ start_sector = tmp_dev->end_sector - tmp_dev->rdev->sectors;
+ end_sector = tmp_dev->end_sector;
+ data_offset = tmp_dev->rdev->data_offset;
+ bio->bi_bdev = tmp_dev->rdev->bdev;
+
+ if (unlikely(bio->bi_iter.bi_sector >= end_sector ||
+ bio->bi_iter.bi_sector < start_sector))
+ goto out_of_bounds;
+
+ if (unlikely(bio_end_sector(bio) > end_sector)) {
+ /* This bio crosses a device boundary, so we have to
+ * split it.
+ */
+ split = bio_split(bio, end_sector -
+ bio->bi_iter.bi_sector,
+ GFP_NOIO, fs_bio_set);
+ bio_chain(split, bio);
+ } else {
+ split = bio;
+ }
+
+ split->bi_iter.bi_sector = split->bi_iter.bi_sector -
+ start_sector + data_offset;
+
+ if (unlikely((split->bi_rw & REQ_DISCARD) &&
+ !blk_queue_discard(bdev_get_queue(split->bi_bdev)))) {
+ /* Just ignore it */
+ bio_endio(split, 0);
+ } else
+ generic_make_request(split);
+ } while (split != bio);
+ return;
+
+out_of_bounds:
+ printk(KERN_ERR
+ "md/linear:%s: make_request: Sector %llu out of bounds on "
+ "dev %s: %llu sectors, offset %llu\n",
+ mdname(mddev),
+ (unsigned long long)bio->bi_iter.bi_sector,
+ bdevname(tmp_dev->rdev->bdev, b),
+ (unsigned long long)tmp_dev->rdev->sectors,
+ (unsigned long long)start_sector);
+ bio_io_error(bio);
+}
+
+static void linear_status (struct seq_file *seq, struct mddev *mddev)
+{
+
+ seq_printf(seq, " %dk rounding", mddev->chunk_sectors / 2);
+}
+
+static void linear_quiesce(struct mddev *mddev, int state)
+{
+}
+
+static struct md_personality linear_personality =
+{
+ .name = "linear",
+ .level = LEVEL_LINEAR,
+ .owner = THIS_MODULE,
+ .make_request = linear_make_request,
+ .run = linear_run,
+ .free = linear_free,
+ .status = linear_status,
+ .hot_add_disk = linear_add,
+ .size = linear_size,
+ .quiesce = linear_quiesce,
+ .congested = linear_congested,
+ .mergeable_bvec = linear_mergeable_bvec,
+};
+
+static int __init linear_init (void)
+{
+ return register_md_personality (&linear_personality);
+}
+
+static void linear_exit (void)
+{
+ unregister_md_personality (&linear_personality);
+}
+
+module_init(linear_init);
+module_exit(linear_exit);
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Linear device concatenation personality for MD");
+MODULE_ALIAS("md-personality-1"); /* LINEAR - deprecated*/
+MODULE_ALIAS("md-linear");
+MODULE_ALIAS("md-level--1");
diff --git a/drivers/md/linear.h b/drivers/md/linear.h
new file mode 100644
index 000000000..b685ddd7d
--- /dev/null
+++ b/drivers/md/linear.h
@@ -0,0 +1,15 @@
+#ifndef _LINEAR_H
+#define _LINEAR_H
+
+struct dev_info {
+ struct md_rdev *rdev;
+ sector_t end_sector;
+};
+
+struct linear_conf
+{
+ struct rcu_head rcu;
+ sector_t array_sectors;
+ struct dev_info disks[0];
+};
+#endif
diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
new file mode 100644
index 000000000..fcfc4b9b2
--- /dev/null
+++ b/drivers/md/md-cluster.c
@@ -0,0 +1,965 @@
+/*
+ * Copyright (C) 2015, SUSE
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ */
+
+
+#include <linux/module.h>
+#include <linux/dlm.h>
+#include <linux/sched.h>
+#include <linux/raid/md_p.h>
+#include "md.h"
+#include "bitmap.h"
+#include "md-cluster.h"
+
+#define LVB_SIZE 64
+#define NEW_DEV_TIMEOUT 5000
+
+struct dlm_lock_resource {
+ dlm_lockspace_t *ls;
+ struct dlm_lksb lksb;
+ char *name; /* lock name. */
+ uint32_t flags; /* flags to pass to dlm_lock() */
+ struct completion completion; /* completion for synchronized locking */
+ void (*bast)(void *arg, int mode); /* blocking AST function pointer*/
+ struct mddev *mddev; /* pointing back to mddev. */
+};
+
+struct suspend_info {
+ int slot;
+ sector_t lo;
+ sector_t hi;
+ struct list_head list;
+};
+
+struct resync_info {
+ __le64 lo;
+ __le64 hi;
+};
+
+/* md_cluster_info flags */
+#define MD_CLUSTER_WAITING_FOR_NEWDISK 1
+
+
+struct md_cluster_info {
+ /* dlm lock space and resources for clustered raid. */
+ dlm_lockspace_t *lockspace;
+ int slot_number;
+ struct completion completion;
+ struct dlm_lock_resource *sb_lock;
+ struct mutex sb_mutex;
+ struct dlm_lock_resource *bitmap_lockres;
+ struct list_head suspend_list;
+ spinlock_t suspend_lock;
+ struct md_thread *recovery_thread;
+ unsigned long recovery_map;
+ /* communication loc resources */
+ struct dlm_lock_resource *ack_lockres;
+ struct dlm_lock_resource *message_lockres;
+ struct dlm_lock_resource *token_lockres;
+ struct dlm_lock_resource *no_new_dev_lockres;
+ struct md_thread *recv_thread;
+ struct completion newdisk_completion;
+ unsigned long state;
+};
+
+enum msg_type {
+ METADATA_UPDATED = 0,
+ RESYNCING,
+ NEWDISK,
+ REMOVE,
+ RE_ADD,
+};
+
+struct cluster_msg {
+ int type;
+ int slot;
+ /* TODO: Unionize this for smaller footprint */
+ sector_t low;
+ sector_t high;
+ char uuid[16];
+ int raid_slot;
+};
+
+static void sync_ast(void *arg)
+{
+ struct dlm_lock_resource *res;
+
+ res = (struct dlm_lock_resource *) arg;
+ complete(&res->completion);
+}
+
+static int dlm_lock_sync(struct dlm_lock_resource *res, int mode)
+{
+ int ret = 0;
+
+ init_completion(&res->completion);
+ ret = dlm_lock(res->ls, mode, &res->lksb,
+ res->flags, res->name, strlen(res->name),
+ 0, sync_ast, res, res->bast);
+ if (ret)
+ return ret;
+ wait_for_completion(&res->completion);
+ return res->lksb.sb_status;
+}
+
+static int dlm_unlock_sync(struct dlm_lock_resource *res)
+{
+ return dlm_lock_sync(res, DLM_LOCK_NL);
+}
+
+static struct dlm_lock_resource *lockres_init(struct mddev *mddev,
+ char *name, void (*bastfn)(void *arg, int mode), int with_lvb)
+{
+ struct dlm_lock_resource *res = NULL;
+ int ret, namelen;
+ struct md_cluster_info *cinfo = mddev->cluster_info;
+
+ res = kzalloc(sizeof(struct dlm_lock_resource), GFP_KERNEL);
+ if (!res)
+ return NULL;
+ res->ls = cinfo->lockspace;
+ res->mddev = mddev;
+ namelen = strlen(name);
+ res->name = kzalloc(namelen + 1, GFP_KERNEL);
+ if (!res->name) {
+ pr_err("md-cluster: Unable to allocate resource name for resource %s\n", name);
+ goto out_err;
+ }
+ strlcpy(res->name, name, namelen + 1);
+ if (with_lvb) {
+ res->lksb.sb_lvbptr = kzalloc(LVB_SIZE, GFP_KERNEL);
+ if (!res->lksb.sb_lvbptr) {
+ pr_err("md-cluster: Unable to allocate LVB for resource %s\n", name);
+ goto out_err;
+ }
+ res->flags = DLM_LKF_VALBLK;
+ }
+
+ if (bastfn)
+ res->bast = bastfn;
+
+ res->flags |= DLM_LKF_EXPEDITE;
+
+ ret = dlm_lock_sync(res, DLM_LOCK_NL);
+ if (ret) {
+ pr_err("md-cluster: Unable to lock NL on new lock resource %s\n", name);
+ goto out_err;
+ }
+ res->flags &= ~DLM_LKF_EXPEDITE;
+ res->flags |= DLM_LKF_CONVERT;
+
+ return res;
+out_err:
+ kfree(res->lksb.sb_lvbptr);
+ kfree(res->name);
+ kfree(res);
+ return NULL;
+}
+
+static void lockres_free(struct dlm_lock_resource *res)
+{
+ if (!res)
+ return;
+
+ init_completion(&res->completion);
+ dlm_unlock(res->ls, res->lksb.sb_lkid, 0, &res->lksb, res);
+ wait_for_completion(&res->completion);
+
+ kfree(res->name);
+ kfree(res->lksb.sb_lvbptr);
+ kfree(res);
+}
+
+static char *pretty_uuid(char *dest, char *src)
+{
+ int i, len = 0;
+
+ for (i = 0; i < 16; i++) {
+ if (i == 4 || i == 6 || i == 8 || i == 10)
+ len += sprintf(dest + len, "-");
+ len += sprintf(dest + len, "%02x", (__u8)src[i]);
+ }
+ return dest;
+}
+
+static void add_resync_info(struct mddev *mddev, struct dlm_lock_resource *lockres,
+ sector_t lo, sector_t hi)
+{
+ struct resync_info *ri;
+
+ ri = (struct resync_info *)lockres->lksb.sb_lvbptr;
+ ri->lo = cpu_to_le64(lo);
+ ri->hi = cpu_to_le64(hi);
+}
+
+static struct suspend_info *read_resync_info(struct mddev *mddev, struct dlm_lock_resource *lockres)
+{
+ struct resync_info ri;
+ struct suspend_info *s = NULL;
+ sector_t hi = 0;
+
+ dlm_lock_sync(lockres, DLM_LOCK_CR);
+ memcpy(&ri, lockres->lksb.sb_lvbptr, sizeof(struct resync_info));
+ hi = le64_to_cpu(ri.hi);
+ if (ri.hi > 0) {
+ s = kzalloc(sizeof(struct suspend_info), GFP_KERNEL);
+ if (!s)
+ goto out;
+ s->hi = hi;
+ s->lo = le64_to_cpu(ri.lo);
+ }
+ dlm_unlock_sync(lockres);
+out:
+ return s;
+}
+
+static void recover_bitmaps(struct md_thread *thread)
+{
+ struct mddev *mddev = thread->mddev;
+ struct md_cluster_info *cinfo = mddev->cluster_info;
+ struct dlm_lock_resource *bm_lockres;
+ char str[64];
+ int slot, ret;
+ struct suspend_info *s, *tmp;
+ sector_t lo, hi;
+
+ while (cinfo->recovery_map) {
+ slot = fls64((u64)cinfo->recovery_map) - 1;
+
+ /* Clear suspend_area associated with the bitmap */
+ spin_lock_irq(&cinfo->suspend_lock);
+ list_for_each_entry_safe(s, tmp, &cinfo->suspend_list, list)
+ if (slot == s->slot) {
+ list_del(&s->list);
+ kfree(s);
+ }
+ spin_unlock_irq(&cinfo->suspend_lock);
+
+ snprintf(str, 64, "bitmap%04d", slot);
+ bm_lockres = lockres_init(mddev, str, NULL, 1);
+ if (!bm_lockres) {
+ pr_err("md-cluster: Cannot initialize bitmaps\n");
+ goto clear_bit;
+ }
+
+ ret = dlm_lock_sync(bm_lockres, DLM_LOCK_PW);
+ if (ret) {
+ pr_err("md-cluster: Could not DLM lock %s: %d\n",
+ str, ret);
+ goto clear_bit;
+ }
+ ret = bitmap_copy_from_slot(mddev, slot, &lo, &hi, true);
+ if (ret) {
+ pr_err("md-cluster: Could not copy data from bitmap %d\n", slot);
+ goto dlm_unlock;
+ }
+ if (hi > 0) {
+ /* TODO:Wait for current resync to get over */
+ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+ if (lo < mddev->recovery_cp)
+ mddev->recovery_cp = lo;
+ md_check_recovery(mddev);
+ }
+dlm_unlock:
+ dlm_unlock_sync(bm_lockres);
+clear_bit:
+ clear_bit(slot, &cinfo->recovery_map);
+ }
+}
+
+static void recover_prep(void *arg)
+{
+}
+
+static void recover_slot(void *arg, struct dlm_slot *slot)
+{
+ struct mddev *mddev = arg;
+ struct md_cluster_info *cinfo = mddev->cluster_info;
+
+ pr_info("md-cluster: %s Node %d/%d down. My slot: %d. Initiating recovery.\n",
+ mddev->bitmap_info.cluster_name,
+ slot->nodeid, slot->slot,
+ cinfo->slot_number);
+ set_bit(slot->slot - 1, &cinfo->recovery_map);
+ if (!cinfo->recovery_thread) {
+ cinfo->recovery_thread = md_register_thread(recover_bitmaps,
+ mddev, "recover");
+ if (!cinfo->recovery_thread) {
+ pr_warn("md-cluster: Could not create recovery thread\n");
+ return;
+ }
+ }
+ md_wakeup_thread(cinfo->recovery_thread);
+}
+
+static void recover_done(void *arg, struct dlm_slot *slots,
+ int num_slots, int our_slot,
+ uint32_t generation)
+{
+ struct mddev *mddev = arg;
+ struct md_cluster_info *cinfo = mddev->cluster_info;
+
+ cinfo->slot_number = our_slot;
+ complete(&cinfo->completion);
+}
+
+static const struct dlm_lockspace_ops md_ls_ops = {
+ .recover_prep = recover_prep,
+ .recover_slot = recover_slot,
+ .recover_done = recover_done,
+};
+
+/*
+ * The BAST function for the ack lock resource
+ * This function wakes up the receive thread in
+ * order to receive and process the message.
+ */
+static void ack_bast(void *arg, int mode)
+{
+ struct dlm_lock_resource *res = (struct dlm_lock_resource *)arg;
+ struct md_cluster_info *cinfo = res->mddev->cluster_info;
+
+ if (mode == DLM_LOCK_EX)
+ md_wakeup_thread(cinfo->recv_thread);
+}
+
+static void __remove_suspend_info(struct md_cluster_info *cinfo, int slot)
+{
+ struct suspend_info *s, *tmp;
+
+ list_for_each_entry_safe(s, tmp, &cinfo->suspend_list, list)
+ if (slot == s->slot) {
+ pr_info("%s:%d Deleting suspend_info: %d\n",
+ __func__, __LINE__, slot);
+ list_del(&s->list);
+ kfree(s);
+ break;
+ }
+}
+
+static void remove_suspend_info(struct md_cluster_info *cinfo, int slot)
+{
+ spin_lock_irq(&cinfo->suspend_lock);
+ __remove_suspend_info(cinfo, slot);
+ spin_unlock_irq(&cinfo->suspend_lock);
+}
+
+
+static void process_suspend_info(struct md_cluster_info *cinfo,
+ int slot, sector_t lo, sector_t hi)
+{
+ struct suspend_info *s;
+
+ if (!hi) {
+ remove_suspend_info(cinfo, slot);
+ return;
+ }
+ s = kzalloc(sizeof(struct suspend_info), GFP_KERNEL);
+ if (!s)
+ return;
+ s->slot = slot;
+ s->lo = lo;
+ s->hi = hi;
+ spin_lock_irq(&cinfo->suspend_lock);
+ /* Remove existing entry (if exists) before adding */
+ __remove_suspend_info(cinfo, slot);
+ list_add(&s->list, &cinfo->suspend_list);
+ spin_unlock_irq(&cinfo->suspend_lock);
+}
+
+static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg)
+{
+ char disk_uuid[64];
+ struct md_cluster_info *cinfo = mddev->cluster_info;
+ char event_name[] = "EVENT=ADD_DEVICE";
+ char raid_slot[16];
+ char *envp[] = {event_name, disk_uuid, raid_slot, NULL};
+ int len;
+
+ len = snprintf(disk_uuid, 64, "DEVICE_UUID=");
+ pretty_uuid(disk_uuid + len, cmsg->uuid);
+ snprintf(raid_slot, 16, "RAID_DISK=%d", cmsg->raid_slot);
+ pr_info("%s:%d Sending kobject change with %s and %s\n", __func__, __LINE__, disk_uuid, raid_slot);
+ init_completion(&cinfo->newdisk_completion);
+ set_bit(MD_CLUSTER_WAITING_FOR_NEWDISK, &cinfo->state);
+ kobject_uevent_env(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE, envp);
+ wait_for_completion_timeout(&cinfo->newdisk_completion,
+ NEW_DEV_TIMEOUT);
+ clear_bit(MD_CLUSTER_WAITING_FOR_NEWDISK, &cinfo->state);
+}
+
+
+static void process_metadata_update(struct mddev *mddev, struct cluster_msg *msg)
+{
+ struct md_cluster_info *cinfo = mddev->cluster_info;
+
+ md_reload_sb(mddev);
+ dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR);
+}
+
+static void process_remove_disk(struct mddev *mddev, struct cluster_msg *msg)
+{
+ struct md_rdev *rdev = md_find_rdev_nr_rcu(mddev, msg->raid_slot);
+
+ if (rdev)
+ md_kick_rdev_from_array(rdev);
+ else
+ pr_warn("%s: %d Could not find disk(%d) to REMOVE\n", __func__, __LINE__, msg->raid_slot);
+}
+
+static void process_readd_disk(struct mddev *mddev, struct cluster_msg *msg)
+{
+ struct md_rdev *rdev = md_find_rdev_nr_rcu(mddev, msg->raid_slot);
+
+ if (rdev && test_bit(Faulty, &rdev->flags))
+ clear_bit(Faulty, &rdev->flags);
+ else
+ pr_warn("%s: %d Could not find disk(%d) which is faulty", __func__, __LINE__, msg->raid_slot);
+}
+
+static void process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg)
+{
+ switch (msg->type) {
+ case METADATA_UPDATED:
+ pr_info("%s: %d Received message: METADATA_UPDATE from %d\n",
+ __func__, __LINE__, msg->slot);
+ process_metadata_update(mddev, msg);
+ break;
+ case RESYNCING:
+ pr_info("%s: %d Received message: RESYNCING from %d\n",
+ __func__, __LINE__, msg->slot);
+ process_suspend_info(mddev->cluster_info, msg->slot,
+ msg->low, msg->high);
+ break;
+ case NEWDISK:
+ pr_info("%s: %d Received message: NEWDISK from %d\n",
+ __func__, __LINE__, msg->slot);
+ process_add_new_disk(mddev, msg);
+ break;
+ case REMOVE:
+ pr_info("%s: %d Received REMOVE from %d\n",
+ __func__, __LINE__, msg->slot);
+ process_remove_disk(mddev, msg);
+ break;
+ case RE_ADD:
+ pr_info("%s: %d Received RE_ADD from %d\n",
+ __func__, __LINE__, msg->slot);
+ process_readd_disk(mddev, msg);
+ break;
+ default:
+ pr_warn("%s:%d Received unknown message from %d\n",
+ __func__, __LINE__, msg->slot);
+ }
+}
+
+/*
+ * thread for receiving message
+ */
+static void recv_daemon(struct md_thread *thread)
+{
+ struct md_cluster_info *cinfo = thread->mddev->cluster_info;
+ struct dlm_lock_resource *ack_lockres = cinfo->ack_lockres;
+ struct dlm_lock_resource *message_lockres = cinfo->message_lockres;
+ struct cluster_msg msg;
+
+ /*get CR on Message*/
+ if (dlm_lock_sync(message_lockres, DLM_LOCK_CR)) {
+ pr_err("md/raid1:failed to get CR on MESSAGE\n");
+ return;
+ }
+
+ /* read lvb and wake up thread to process this message_lockres */
+ memcpy(&msg, message_lockres->lksb.sb_lvbptr, sizeof(struct cluster_msg));
+ process_recvd_msg(thread->mddev, &msg);
+
+ /*release CR on ack_lockres*/
+ dlm_unlock_sync(ack_lockres);
+ /*up-convert to EX on message_lockres*/
+ dlm_lock_sync(message_lockres, DLM_LOCK_EX);
+ /*get CR on ack_lockres again*/
+ dlm_lock_sync(ack_lockres, DLM_LOCK_CR);
+ /*release CR on message_lockres*/
+ dlm_unlock_sync(message_lockres);
+}
+
+/* lock_comm()
+ * Takes the lock on the TOKEN lock resource so no other
+ * node can communicate while the operation is underway.
+ */
+static int lock_comm(struct md_cluster_info *cinfo)
+{
+ int error;
+
+ error = dlm_lock_sync(cinfo->token_lockres, DLM_LOCK_EX);
+ if (error)
+ pr_err("md-cluster(%s:%d): failed to get EX on TOKEN (%d)\n",
+ __func__, __LINE__, error);
+ return error;
+}
+
+static void unlock_comm(struct md_cluster_info *cinfo)
+{
+ dlm_unlock_sync(cinfo->token_lockres);
+}
+
+/* __sendmsg()
+ * This function performs the actual sending of the message. This function is
+ * usually called after performing the encompassing operation
+ * The function:
+ * 1. Grabs the message lockresource in EX mode
+ * 2. Copies the message to the message LVB
+ * 3. Downconverts message lockresource to CR
+ * 4. Upconverts ack lock resource from CR to EX. This forces the BAST on other nodes
+ * and the other nodes read the message. The thread will wait here until all other
+ * nodes have released ack lock resource.
+ * 5. Downconvert ack lockresource to CR
+ */
+static int __sendmsg(struct md_cluster_info *cinfo, struct cluster_msg *cmsg)
+{
+ int error;
+ int slot = cinfo->slot_number - 1;
+
+ cmsg->slot = cpu_to_le32(slot);
+ /*get EX on Message*/
+ error = dlm_lock_sync(cinfo->message_lockres, DLM_LOCK_EX);
+ if (error) {
+ pr_err("md-cluster: failed to get EX on MESSAGE (%d)\n", error);
+ goto failed_message;
+ }
+
+ memcpy(cinfo->message_lockres->lksb.sb_lvbptr, (void *)cmsg,
+ sizeof(struct cluster_msg));
+ /*down-convert EX to CR on Message*/
+ error = dlm_lock_sync(cinfo->message_lockres, DLM_LOCK_CR);
+ if (error) {
+ pr_err("md-cluster: failed to convert EX to CR on MESSAGE(%d)\n",
+ error);
+ goto failed_message;
+ }
+
+ /*up-convert CR to EX on Ack*/
+ error = dlm_lock_sync(cinfo->ack_lockres, DLM_LOCK_EX);
+ if (error) {
+ pr_err("md-cluster: failed to convert CR to EX on ACK(%d)\n",
+ error);
+ goto failed_ack;
+ }
+
+ /*down-convert EX to CR on Ack*/
+ error = dlm_lock_sync(cinfo->ack_lockres, DLM_LOCK_CR);
+ if (error) {
+ pr_err("md-cluster: failed to convert EX to CR on ACK(%d)\n",
+ error);
+ goto failed_ack;
+ }
+
+failed_ack:
+ dlm_unlock_sync(cinfo->message_lockres);
+failed_message:
+ return error;
+}
+
+static int sendmsg(struct md_cluster_info *cinfo, struct cluster_msg *cmsg)
+{
+ int ret;
+
+ lock_comm(cinfo);
+ ret = __sendmsg(cinfo, cmsg);
+ unlock_comm(cinfo);
+ return ret;
+}
+
+static int gather_all_resync_info(struct mddev *mddev, int total_slots)
+{
+ struct md_cluster_info *cinfo = mddev->cluster_info;
+ int i, ret = 0;
+ struct dlm_lock_resource *bm_lockres;
+ struct suspend_info *s;
+ char str[64];
+
+
+ for (i = 0; i < total_slots; i++) {
+ memset(str, '\0', 64);
+ snprintf(str, 64, "bitmap%04d", i);
+ bm_lockres = lockres_init(mddev, str, NULL, 1);
+ if (!bm_lockres)
+ return -ENOMEM;
+ if (i == (cinfo->slot_number - 1))
+ continue;
+
+ bm_lockres->flags |= DLM_LKF_NOQUEUE;
+ ret = dlm_lock_sync(bm_lockres, DLM_LOCK_PW);
+ if (ret == -EAGAIN) {
+ memset(bm_lockres->lksb.sb_lvbptr, '\0', LVB_SIZE);
+ s = read_resync_info(mddev, bm_lockres);
+ if (s) {
+ pr_info("%s:%d Resync[%llu..%llu] in progress on %d\n",
+ __func__, __LINE__,
+ (unsigned long long) s->lo,
+ (unsigned long long) s->hi, i);
+ spin_lock_irq(&cinfo->suspend_lock);
+ s->slot = i;
+ list_add(&s->list, &cinfo->suspend_list);
+ spin_unlock_irq(&cinfo->suspend_lock);
+ }
+ ret = 0;
+ lockres_free(bm_lockres);
+ continue;
+ }
+ if (ret)
+ goto out;
+ /* TODO: Read the disk bitmap sb and check if it needs recovery */
+ dlm_unlock_sync(bm_lockres);
+ lockres_free(bm_lockres);
+ }
+out:
+ return ret;
+}
+
+static int join(struct mddev *mddev, int nodes)
+{
+ struct md_cluster_info *cinfo;
+ int ret, ops_rv;
+ char str[64];
+
+ if (!try_module_get(THIS_MODULE))
+ return -ENOENT;
+
+ cinfo = kzalloc(sizeof(struct md_cluster_info), GFP_KERNEL);
+ if (!cinfo)
+ return -ENOMEM;
+
+ init_completion(&cinfo->completion);
+
+ mutex_init(&cinfo->sb_mutex);
+ mddev->cluster_info = cinfo;
+
+ memset(str, 0, 64);
+ pretty_uuid(str, mddev->uuid);
+ ret = dlm_new_lockspace(str, mddev->bitmap_info.cluster_name,
+ DLM_LSFL_FS, LVB_SIZE,
+ &md_ls_ops, mddev, &ops_rv, &cinfo->lockspace);
+ if (ret)
+ goto err;
+ wait_for_completion(&cinfo->completion);
+ if (nodes < cinfo->slot_number) {
+ pr_err("md-cluster: Slot allotted(%d) is greater than available slots(%d).",
+ cinfo->slot_number, nodes);
+ ret = -ERANGE;
+ goto err;
+ }
+ cinfo->sb_lock = lockres_init(mddev, "cmd-super",
+ NULL, 0);
+ if (!cinfo->sb_lock) {
+ ret = -ENOMEM;
+ goto err;
+ }
+ /* Initiate the communication resources */
+ ret = -ENOMEM;
+ cinfo->recv_thread = md_register_thread(recv_daemon, mddev, "cluster_recv");
+ if (!cinfo->recv_thread) {
+ pr_err("md-cluster: cannot allocate memory for recv_thread!\n");
+ goto err;
+ }
+ cinfo->message_lockres = lockres_init(mddev, "message", NULL, 1);
+ if (!cinfo->message_lockres)
+ goto err;
+ cinfo->token_lockres = lockres_init(mddev, "token", NULL, 0);
+ if (!cinfo->token_lockres)
+ goto err;
+ cinfo->ack_lockres = lockres_init(mddev, "ack", ack_bast, 0);
+ if (!cinfo->ack_lockres)
+ goto err;
+ cinfo->no_new_dev_lockres = lockres_init(mddev, "no-new-dev", NULL, 0);
+ if (!cinfo->no_new_dev_lockres)
+ goto err;
+
+ /* get sync CR lock on ACK. */
+ if (dlm_lock_sync(cinfo->ack_lockres, DLM_LOCK_CR))
+ pr_err("md-cluster: failed to get a sync CR lock on ACK!(%d)\n",
+ ret);
+ /* get sync CR lock on no-new-dev. */
+ if (dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR))
+ pr_err("md-cluster: failed to get a sync CR lock on no-new-dev!(%d)\n", ret);
+
+
+ pr_info("md-cluster: Joined cluster %s slot %d\n", str, cinfo->slot_number);
+ snprintf(str, 64, "bitmap%04d", cinfo->slot_number - 1);
+ cinfo->bitmap_lockres = lockres_init(mddev, str, NULL, 1);
+ if (!cinfo->bitmap_lockres)
+ goto err;
+ if (dlm_lock_sync(cinfo->bitmap_lockres, DLM_LOCK_PW)) {
+ pr_err("Failed to get bitmap lock\n");
+ ret = -EINVAL;
+ goto err;
+ }
+
+ INIT_LIST_HEAD(&cinfo->suspend_list);
+ spin_lock_init(&cinfo->suspend_lock);
+
+ ret = gather_all_resync_info(mddev, nodes);
+ if (ret)
+ goto err;
+
+ return 0;
+err:
+ lockres_free(cinfo->message_lockres);
+ lockres_free(cinfo->token_lockres);
+ lockres_free(cinfo->ack_lockres);
+ lockres_free(cinfo->no_new_dev_lockres);
+ lockres_free(cinfo->bitmap_lockres);
+ lockres_free(cinfo->sb_lock);
+ if (cinfo->lockspace)
+ dlm_release_lockspace(cinfo->lockspace, 2);
+ mddev->cluster_info = NULL;
+ kfree(cinfo);
+ module_put(THIS_MODULE);
+ return ret;
+}
+
+static int leave(struct mddev *mddev)
+{
+ struct md_cluster_info *cinfo = mddev->cluster_info;
+
+ if (!cinfo)
+ return 0;
+ md_unregister_thread(&cinfo->recovery_thread);
+ md_unregister_thread(&cinfo->recv_thread);
+ lockres_free(cinfo->message_lockres);
+ lockres_free(cinfo->token_lockres);
+ lockres_free(cinfo->ack_lockres);
+ lockres_free(cinfo->no_new_dev_lockres);
+ lockres_free(cinfo->sb_lock);
+ lockres_free(cinfo->bitmap_lockres);
+ dlm_release_lockspace(cinfo->lockspace, 2);
+ return 0;
+}
+
+/* slot_number(): Returns the MD slot number to use
+ * DLM starts the slot numbers from 1, wheras cluster-md
+ * wants the number to be from zero, so we deduct one
+ */
+static int slot_number(struct mddev *mddev)
+{
+ struct md_cluster_info *cinfo = mddev->cluster_info;
+
+ return cinfo->slot_number - 1;
+}
+
+static void resync_info_update(struct mddev *mddev, sector_t lo, sector_t hi)
+{
+ struct md_cluster_info *cinfo = mddev->cluster_info;
+
+ add_resync_info(mddev, cinfo->bitmap_lockres, lo, hi);
+ /* Re-acquire the lock to refresh LVB */
+ dlm_lock_sync(cinfo->bitmap_lockres, DLM_LOCK_PW);
+}
+
+static int metadata_update_start(struct mddev *mddev)
+{
+ return lock_comm(mddev->cluster_info);
+}
+
+static int metadata_update_finish(struct mddev *mddev)
+{
+ struct md_cluster_info *cinfo = mddev->cluster_info;
+ struct cluster_msg cmsg;
+ int ret;
+
+ memset(&cmsg, 0, sizeof(cmsg));
+ cmsg.type = cpu_to_le32(METADATA_UPDATED);
+ ret = __sendmsg(cinfo, &cmsg);
+ unlock_comm(cinfo);
+ return ret;
+}
+
+static int metadata_update_cancel(struct mddev *mddev)
+{
+ struct md_cluster_info *cinfo = mddev->cluster_info;
+
+ return dlm_unlock_sync(cinfo->token_lockres);
+}
+
+static int resync_send(struct mddev *mddev, enum msg_type type,
+ sector_t lo, sector_t hi)
+{
+ struct md_cluster_info *cinfo = mddev->cluster_info;
+ struct cluster_msg cmsg;
+ int slot = cinfo->slot_number - 1;
+
+ pr_info("%s:%d lo: %llu hi: %llu\n", __func__, __LINE__,
+ (unsigned long long)lo,
+ (unsigned long long)hi);
+ resync_info_update(mddev, lo, hi);
+ cmsg.type = cpu_to_le32(type);
+ cmsg.slot = cpu_to_le32(slot);
+ cmsg.low = cpu_to_le64(lo);
+ cmsg.high = cpu_to_le64(hi);
+ return sendmsg(cinfo, &cmsg);
+}
+
+static int resync_start(struct mddev *mddev, sector_t lo, sector_t hi)
+{
+ pr_info("%s:%d\n", __func__, __LINE__);
+ return resync_send(mddev, RESYNCING, lo, hi);
+}
+
+static void resync_finish(struct mddev *mddev)
+{
+ pr_info("%s:%d\n", __func__, __LINE__);
+ resync_send(mddev, RESYNCING, 0, 0);
+}
+
+static int area_resyncing(struct mddev *mddev, sector_t lo, sector_t hi)
+{
+ struct md_cluster_info *cinfo = mddev->cluster_info;
+ int ret = 0;
+ struct suspend_info *s;
+
+ spin_lock_irq(&cinfo->suspend_lock);
+ if (list_empty(&cinfo->suspend_list))
+ goto out;
+ list_for_each_entry(s, &cinfo->suspend_list, list)
+ if (hi > s->lo && lo < s->hi) {
+ ret = 1;
+ break;
+ }
+out:
+ spin_unlock_irq(&cinfo->suspend_lock);
+ return ret;
+}
+
+static int add_new_disk_start(struct mddev *mddev, struct md_rdev *rdev)
+{
+ struct md_cluster_info *cinfo = mddev->cluster_info;
+ struct cluster_msg cmsg;
+ int ret = 0;
+ struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
+ char *uuid = sb->device_uuid;
+
+ memset(&cmsg, 0, sizeof(cmsg));
+ cmsg.type = cpu_to_le32(NEWDISK);
+ memcpy(cmsg.uuid, uuid, 16);
+ cmsg.raid_slot = rdev->desc_nr;
+ lock_comm(cinfo);
+ ret = __sendmsg(cinfo, &cmsg);
+ if (ret)
+ return ret;
+ cinfo->no_new_dev_lockres->flags |= DLM_LKF_NOQUEUE;
+ ret = dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_EX);
+ cinfo->no_new_dev_lockres->flags &= ~DLM_LKF_NOQUEUE;
+ /* Some node does not "see" the device */
+ if (ret == -EAGAIN)
+ ret = -ENOENT;
+ else
+ dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR);
+ return ret;
+}
+
+static int add_new_disk_finish(struct mddev *mddev)
+{
+ struct cluster_msg cmsg;
+ struct md_cluster_info *cinfo = mddev->cluster_info;
+ int ret;
+ /* Write sb and inform others */
+ md_update_sb(mddev, 1);
+ cmsg.type = METADATA_UPDATED;
+ ret = __sendmsg(cinfo, &cmsg);
+ unlock_comm(cinfo);
+ return ret;
+}
+
+static int new_disk_ack(struct mddev *mddev, bool ack)
+{
+ struct md_cluster_info *cinfo = mddev->cluster_info;
+
+ if (!test_bit(MD_CLUSTER_WAITING_FOR_NEWDISK, &cinfo->state)) {
+ pr_warn("md-cluster(%s): Spurious cluster confirmation\n", mdname(mddev));
+ return -EINVAL;
+ }
+
+ if (ack)
+ dlm_unlock_sync(cinfo->no_new_dev_lockres);
+ complete(&cinfo->newdisk_completion);
+ return 0;
+}
+
+static int remove_disk(struct mddev *mddev, struct md_rdev *rdev)
+{
+ struct cluster_msg cmsg;
+ struct md_cluster_info *cinfo = mddev->cluster_info;
+ cmsg.type = REMOVE;
+ cmsg.raid_slot = rdev->desc_nr;
+ return __sendmsg(cinfo, &cmsg);
+}
+
+static int gather_bitmaps(struct md_rdev *rdev)
+{
+ int sn, err;
+ sector_t lo, hi;
+ struct cluster_msg cmsg;
+ struct mddev *mddev = rdev->mddev;
+ struct md_cluster_info *cinfo = mddev->cluster_info;
+
+ cmsg.type = RE_ADD;
+ cmsg.raid_slot = rdev->desc_nr;
+ err = sendmsg(cinfo, &cmsg);
+ if (err)
+ goto out;
+
+ for (sn = 0; sn < mddev->bitmap_info.nodes; sn++) {
+ if (sn == (cinfo->slot_number - 1))
+ continue;
+ err = bitmap_copy_from_slot(mddev, sn, &lo, &hi, false);
+ if (err) {
+ pr_warn("md-cluster: Could not gather bitmaps from slot %d", sn);
+ goto out;
+ }
+ if ((hi > 0) && (lo < mddev->recovery_cp))
+ mddev->recovery_cp = lo;
+ }
+out:
+ return err;
+}
+
+static struct md_cluster_operations cluster_ops = {
+ .join = join,
+ .leave = leave,
+ .slot_number = slot_number,
+ .resync_info_update = resync_info_update,
+ .resync_start = resync_start,
+ .resync_finish = resync_finish,
+ .metadata_update_start = metadata_update_start,
+ .metadata_update_finish = metadata_update_finish,
+ .metadata_update_cancel = metadata_update_cancel,
+ .area_resyncing = area_resyncing,
+ .add_new_disk_start = add_new_disk_start,
+ .add_new_disk_finish = add_new_disk_finish,
+ .new_disk_ack = new_disk_ack,
+ .remove_disk = remove_disk,
+ .gather_bitmaps = gather_bitmaps,
+};
+
+static int __init cluster_init(void)
+{
+ pr_warn("md-cluster: EXPERIMENTAL. Use with caution\n");
+ pr_info("Registering Cluster MD functions\n");
+ register_md_cluster_operations(&cluster_ops, THIS_MODULE);
+ return 0;
+}
+
+static void cluster_exit(void)
+{
+ unregister_md_cluster_operations();
+}
+
+module_init(cluster_init);
+module_exit(cluster_exit);
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Clustering support for MD");
diff --git a/drivers/md/md-cluster.h b/drivers/md/md-cluster.h
new file mode 100644
index 000000000..6817ee00e
--- /dev/null
+++ b/drivers/md/md-cluster.h
@@ -0,0 +1,29 @@
+
+
+#ifndef _MD_CLUSTER_H
+#define _MD_CLUSTER_H
+
+#include "md.h"
+
+struct mddev;
+struct md_rdev;
+
+struct md_cluster_operations {
+ int (*join)(struct mddev *mddev, int nodes);
+ int (*leave)(struct mddev *mddev);
+ int (*slot_number)(struct mddev *mddev);
+ void (*resync_info_update)(struct mddev *mddev, sector_t lo, sector_t hi);
+ int (*resync_start)(struct mddev *mddev, sector_t lo, sector_t hi);
+ void (*resync_finish)(struct mddev *mddev);
+ int (*metadata_update_start)(struct mddev *mddev);
+ int (*metadata_update_finish)(struct mddev *mddev);
+ int (*metadata_update_cancel)(struct mddev *mddev);
+ int (*area_resyncing)(struct mddev *mddev, sector_t lo, sector_t hi);
+ int (*add_new_disk_start)(struct mddev *mddev, struct md_rdev *rdev);
+ int (*add_new_disk_finish)(struct mddev *mddev);
+ int (*new_disk_ack)(struct mddev *mddev, bool ack);
+ int (*remove_disk)(struct mddev *mddev, struct md_rdev *rdev);
+ int (*gather_bitmaps)(struct md_rdev *rdev);
+};
+
+#endif /* _MD_CLUSTER_H */
diff --git a/drivers/md/md.c b/drivers/md/md.c
new file mode 100644
index 000000000..84f9dead6
--- /dev/null
+++ b/drivers/md/md.c
@@ -0,0 +1,9037 @@
+/*
+ md.c : Multiple Devices driver for Linux
+ Copyright (C) 1998, 1999, 2000 Ingo Molnar
+
+ completely rewritten, based on the MD driver code from Marc Zyngier
+
+ Changes:
+
+ - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
+ - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com>
+ - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
+ - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
+ - kmod support by: Cyrus Durgin
+ - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
+ - Devfs support by Richard Gooch <rgooch@atnf.csiro.au>
+
+ - lots of fixes and improvements to the RAID1/RAID5 and generic
+ RAID code (such as request based resynchronization):
+
+ Neil Brown <neilb@cse.unsw.edu.au>.
+
+ - persistent bitmap code
+ Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2, or (at your option)
+ any later version.
+
+ You should have received a copy of the GNU General Public License
+ (for example /usr/src/linux/COPYING); if not, write to the Free
+ Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+*/
+
+#include <linux/kthread.h>
+#include <linux/blkdev.h>
+#include <linux/sysctl.h>
+#include <linux/seq_file.h>
+#include <linux/fs.h>
+#include <linux/poll.h>
+#include <linux/ctype.h>
+#include <linux/string.h>
+#include <linux/hdreg.h>
+#include <linux/proc_fs.h>
+#include <linux/random.h>
+#include <linux/module.h>
+#include <linux/reboot.h>
+#include <linux/file.h>
+#include <linux/compat.h>
+#include <linux/delay.h>
+#include <linux/raid/md_p.h>
+#include <linux/raid/md_u.h>
+#include <linux/slab.h>
+#include "md.h"
+#include "bitmap.h"
+#include "md-cluster.h"
+
+#ifndef MODULE
+static void autostart_arrays(int part);
+#endif
+
+/* pers_list is a list of registered personalities protected
+ * by pers_lock.
+ * pers_lock does extra service to protect accesses to
+ * mddev->thread when the mutex cannot be held.
+ */
+static LIST_HEAD(pers_list);
+static DEFINE_SPINLOCK(pers_lock);
+
+struct md_cluster_operations *md_cluster_ops;
+EXPORT_SYMBOL(md_cluster_ops);
+struct module *md_cluster_mod;
+EXPORT_SYMBOL(md_cluster_mod);
+
+static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
+static struct workqueue_struct *md_wq;
+static struct workqueue_struct *md_misc_wq;
+
+static int remove_and_add_spares(struct mddev *mddev,
+ struct md_rdev *this);
+static void mddev_detach(struct mddev *mddev);
+
+/*
+ * Default number of read corrections we'll attempt on an rdev
+ * before ejecting it from the array. We divide the read error
+ * count by 2 for every hour elapsed between read errors.
+ */
+#define MD_DEFAULT_MAX_CORRECTED_READ_ERRORS 20
+/*
+ * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
+ * is 1000 KB/sec, so the extra system load does not show up that much.
+ * Increase it if you want to have more _guaranteed_ speed. Note that
+ * the RAID driver will use the maximum available bandwidth if the IO
+ * subsystem is idle. There is also an 'absolute maximum' reconstruction
+ * speed limit - in case reconstruction slows down your system despite
+ * idle IO detection.
+ *
+ * you can change it via /proc/sys/dev/raid/speed_limit_min and _max.
+ * or /sys/block/mdX/md/sync_speed_{min,max}
+ */
+
+static int sysctl_speed_limit_min = 1000;
+static int sysctl_speed_limit_max = 200000;
+static inline int speed_min(struct mddev *mddev)
+{
+ return mddev->sync_speed_min ?
+ mddev->sync_speed_min : sysctl_speed_limit_min;
+}
+
+static inline int speed_max(struct mddev *mddev)
+{
+ return mddev->sync_speed_max ?
+ mddev->sync_speed_max : sysctl_speed_limit_max;
+}
+
+static struct ctl_table_header *raid_table_header;
+
+static struct ctl_table raid_table[] = {
+ {
+ .procname = "speed_limit_min",
+ .data = &sysctl_speed_limit_min,
+ .maxlen = sizeof(int),
+ .mode = S_IRUGO|S_IWUSR,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "speed_limit_max",
+ .data = &sysctl_speed_limit_max,
+ .maxlen = sizeof(int),
+ .mode = S_IRUGO|S_IWUSR,
+ .proc_handler = proc_dointvec,
+ },
+ { }
+};
+
+static struct ctl_table raid_dir_table[] = {
+ {
+ .procname = "raid",
+ .maxlen = 0,
+ .mode = S_IRUGO|S_IXUGO,
+ .child = raid_table,
+ },
+ { }
+};
+
+static struct ctl_table raid_root_table[] = {
+ {
+ .procname = "dev",
+ .maxlen = 0,
+ .mode = 0555,
+ .child = raid_dir_table,
+ },
+ { }
+};
+
+static const struct block_device_operations md_fops;
+
+static int start_readonly;
+
+/* bio_clone_mddev
+ * like bio_clone, but with a local bio set
+ */
+
+struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
+ struct mddev *mddev)
+{
+ struct bio *b;
+
+ if (!mddev || !mddev->bio_set)
+ return bio_alloc(gfp_mask, nr_iovecs);
+
+ b = bio_alloc_bioset(gfp_mask, nr_iovecs, mddev->bio_set);
+ if (!b)
+ return NULL;
+ return b;
+}
+EXPORT_SYMBOL_GPL(bio_alloc_mddev);
+
+struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask,
+ struct mddev *mddev)
+{
+ if (!mddev || !mddev->bio_set)
+ return bio_clone(bio, gfp_mask);
+
+ return bio_clone_bioset(bio, gfp_mask, mddev->bio_set);
+}
+EXPORT_SYMBOL_GPL(bio_clone_mddev);
+
+/*
+ * We have a system wide 'event count' that is incremented
+ * on any 'interesting' event, and readers of /proc/mdstat
+ * can use 'poll' or 'select' to find out when the event
+ * count increases.
+ *
+ * Events are:
+ * start array, stop array, error, add device, remove device,
+ * start build, activate spare
+ */
+static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters);
+static atomic_t md_event_count;
+void md_new_event(struct mddev *mddev)
+{
+ atomic_inc(&md_event_count);
+ wake_up(&md_event_waiters);
+}
+EXPORT_SYMBOL_GPL(md_new_event);
+
+/* Alternate version that can be called from interrupts
+ * when calling sysfs_notify isn't needed.
+ */
+static void md_new_event_inintr(struct mddev *mddev)
+{
+ atomic_inc(&md_event_count);
+ wake_up(&md_event_waiters);
+}
+
+/*
+ * Enables to iterate over all existing md arrays
+ * all_mddevs_lock protects this list.
+ */
+static LIST_HEAD(all_mddevs);
+static DEFINE_SPINLOCK(all_mddevs_lock);
+
+/*
+ * iterates through all used mddevs in the system.
+ * We take care to grab the all_mddevs_lock whenever navigating
+ * the list, and to always hold a refcount when unlocked.
+ * Any code which breaks out of this loop while own
+ * a reference to the current mddev and must mddev_put it.
+ */
+#define for_each_mddev(_mddev,_tmp) \
+ \
+ for (({ spin_lock(&all_mddevs_lock); \
+ _tmp = all_mddevs.next; \
+ _mddev = NULL;}); \
+ ({ if (_tmp != &all_mddevs) \
+ mddev_get(list_entry(_tmp, struct mddev, all_mddevs));\
+ spin_unlock(&all_mddevs_lock); \
+ if (_mddev) mddev_put(_mddev); \
+ _mddev = list_entry(_tmp, struct mddev, all_mddevs); \
+ _tmp != &all_mddevs;}); \
+ ({ spin_lock(&all_mddevs_lock); \
+ _tmp = _tmp->next;}) \
+ )
+
+/* Rather than calling directly into the personality make_request function,
+ * IO requests come here first so that we can check if the device is
+ * being suspended pending a reconfiguration.
+ * We hold a refcount over the call to ->make_request. By the time that
+ * call has finished, the bio has been linked into some internal structure
+ * and so is visible to ->quiesce(), so we don't need the refcount any more.
+ */
+static void md_make_request(struct request_queue *q, struct bio *bio)
+{
+ const int rw = bio_data_dir(bio);
+ struct mddev *mddev = q->queuedata;
+ unsigned int sectors;
+ int cpu;
+
+ if (mddev == NULL || mddev->pers == NULL
+ || !mddev->ready) {
+ bio_io_error(bio);
+ return;
+ }
+ if (mddev->ro == 1 && unlikely(rw == WRITE)) {
+ bio_endio(bio, bio_sectors(bio) == 0 ? 0 : -EROFS);
+ return;
+ }
+ smp_rmb(); /* Ensure implications of 'active' are visible */
+ rcu_read_lock();
+ if (mddev->suspended) {
+ DEFINE_WAIT(__wait);
+ for (;;) {
+ prepare_to_wait(&mddev->sb_wait, &__wait,
+ TASK_UNINTERRUPTIBLE);
+ if (!mddev->suspended)
+ break;
+ rcu_read_unlock();
+ schedule();
+ rcu_read_lock();
+ }
+ finish_wait(&mddev->sb_wait, &__wait);
+ }
+ atomic_inc(&mddev->active_io);
+ rcu_read_unlock();
+
+ /*
+ * save the sectors now since our bio can
+ * go away inside make_request
+ */
+ sectors = bio_sectors(bio);
+ mddev->pers->make_request(mddev, bio);
+
+ cpu = part_stat_lock();
+ part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]);
+ part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], sectors);
+ part_stat_unlock();
+
+ if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended)
+ wake_up(&mddev->sb_wait);
+}
+
+/* mddev_suspend makes sure no new requests are submitted
+ * to the device, and that any requests that have been submitted
+ * are completely handled.
+ * Once mddev_detach() is called and completes, the module will be
+ * completely unused.
+ */
+void mddev_suspend(struct mddev *mddev)
+{
+ BUG_ON(mddev->suspended);
+ mddev->suspended = 1;
+ synchronize_rcu();
+ wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0);
+ mddev->pers->quiesce(mddev, 1);
+
+ del_timer_sync(&mddev->safemode_timer);
+}
+EXPORT_SYMBOL_GPL(mddev_suspend);
+
+void mddev_resume(struct mddev *mddev)
+{
+ mddev->suspended = 0;
+ wake_up(&mddev->sb_wait);
+ mddev->pers->quiesce(mddev, 0);
+
+ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+ md_wakeup_thread(mddev->thread);
+ md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
+}
+EXPORT_SYMBOL_GPL(mddev_resume);
+
+int mddev_congested(struct mddev *mddev, int bits)
+{
+ struct md_personality *pers = mddev->pers;
+ int ret = 0;
+
+ rcu_read_lock();
+ if (mddev->suspended)
+ ret = 1;
+ else if (pers && pers->congested)
+ ret = pers->congested(mddev, bits);
+ rcu_read_unlock();
+ return ret;
+}
+EXPORT_SYMBOL_GPL(mddev_congested);
+static int md_congested(void *data, int bits)
+{
+ struct mddev *mddev = data;
+ return mddev_congested(mddev, bits);
+}
+
+static int md_mergeable_bvec(struct request_queue *q,
+ struct bvec_merge_data *bvm,
+ struct bio_vec *biovec)
+{
+ struct mddev *mddev = q->queuedata;
+ int ret;
+ rcu_read_lock();
+ if (mddev->suspended) {
+ /* Must always allow one vec */
+ if (bvm->bi_size == 0)
+ ret = biovec->bv_len;
+ else
+ ret = 0;
+ } else {
+ struct md_personality *pers = mddev->pers;
+ if (pers && pers->mergeable_bvec)
+ ret = pers->mergeable_bvec(mddev, bvm, biovec);
+ else
+ ret = biovec->bv_len;
+ }
+ rcu_read_unlock();
+ return ret;
+}
+/*
+ * Generic flush handling for md
+ */
+
+static void md_end_flush(struct bio *bio, int err)
+{
+ struct md_rdev *rdev = bio->bi_private;
+ struct mddev *mddev = rdev->mddev;
+
+ rdev_dec_pending(rdev, mddev);
+
+ if (atomic_dec_and_test(&mddev->flush_pending)) {
+ /* The pre-request flush has finished */
+ queue_work(md_wq, &mddev->flush_work);
+ }
+ bio_put(bio);
+}
+
+static void md_submit_flush_data(struct work_struct *ws);
+
+static void submit_flushes(struct work_struct *ws)
+{
+ struct mddev *mddev = container_of(ws, struct mddev, flush_work);
+ struct md_rdev *rdev;
+
+ INIT_WORK(&mddev->flush_work, md_submit_flush_data);
+ atomic_set(&mddev->flush_pending, 1);
+ rcu_read_lock();
+ rdev_for_each_rcu(rdev, mddev)
+ if (rdev->raid_disk >= 0 &&
+ !test_bit(Faulty, &rdev->flags)) {
+ /* Take two references, one is dropped
+ * when request finishes, one after
+ * we reclaim rcu_read_lock
+ */
+ struct bio *bi;
+ atomic_inc(&rdev->nr_pending);
+ atomic_inc(&rdev->nr_pending);
+ rcu_read_unlock();
+ bi = bio_alloc_mddev(GFP_NOIO, 0, mddev);
+ bi->bi_end_io = md_end_flush;
+ bi->bi_private = rdev;
+ bi->bi_bdev = rdev->bdev;
+ atomic_inc(&mddev->flush_pending);
+ submit_bio(WRITE_FLUSH, bi);
+ rcu_read_lock();
+ rdev_dec_pending(rdev, mddev);
+ }
+ rcu_read_unlock();
+ if (atomic_dec_and_test(&mddev->flush_pending))
+ queue_work(md_wq, &mddev->flush_work);
+}
+
+static void md_submit_flush_data(struct work_struct *ws)
+{
+ struct mddev *mddev = container_of(ws, struct mddev, flush_work);
+ struct bio *bio = mddev->flush_bio;
+
+ if (bio->bi_iter.bi_size == 0)
+ /* an empty barrier - all done */
+ bio_endio(bio, 0);
+ else {
+ bio->bi_rw &= ~REQ_FLUSH;
+ mddev->pers->make_request(mddev, bio);
+ }
+
+ mddev->flush_bio = NULL;
+ wake_up(&mddev->sb_wait);
+}
+
+void md_flush_request(struct mddev *mddev, struct bio *bio)
+{
+ spin_lock_irq(&mddev->lock);
+ wait_event_lock_irq(mddev->sb_wait,
+ !mddev->flush_bio,
+ mddev->lock);
+ mddev->flush_bio = bio;
+ spin_unlock_irq(&mddev->lock);
+
+ INIT_WORK(&mddev->flush_work, submit_flushes);
+ queue_work(md_wq, &mddev->flush_work);
+}
+EXPORT_SYMBOL(md_flush_request);
+
+void md_unplug(struct blk_plug_cb *cb, bool from_schedule)
+{
+ struct mddev *mddev = cb->data;
+ md_wakeup_thread(mddev->thread);
+ kfree(cb);
+}
+EXPORT_SYMBOL(md_unplug);
+
+static inline struct mddev *mddev_get(struct mddev *mddev)
+{
+ atomic_inc(&mddev->active);
+ return mddev;
+}
+
+static void mddev_delayed_delete(struct work_struct *ws);
+
+static void mddev_put(struct mddev *mddev)
+{
+ struct bio_set *bs = NULL;
+
+ if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
+ return;
+ if (!mddev->raid_disks && list_empty(&mddev->disks) &&
+ mddev->ctime == 0 && !mddev->hold_active) {
+ /* Array is not configured at all, and not held active,
+ * so destroy it */
+ list_del_init(&mddev->all_mddevs);
+ bs = mddev->bio_set;
+ mddev->bio_set = NULL;
+ if (mddev->gendisk) {
+ /* We did a probe so need to clean up. Call
+ * queue_work inside the spinlock so that
+ * flush_workqueue() after mddev_find will
+ * succeed in waiting for the work to be done.
+ */
+ INIT_WORK(&mddev->del_work, mddev_delayed_delete);
+ queue_work(md_misc_wq, &mddev->del_work);
+ } else
+ kfree(mddev);
+ }
+ spin_unlock(&all_mddevs_lock);
+ if (bs)
+ bioset_free(bs);
+}
+
+void mddev_init(struct mddev *mddev)
+{
+ mutex_init(&mddev->open_mutex);
+ mutex_init(&mddev->reconfig_mutex);
+ mutex_init(&mddev->bitmap_info.mutex);
+ INIT_LIST_HEAD(&mddev->disks);
+ INIT_LIST_HEAD(&mddev->all_mddevs);
+ init_timer(&mddev->safemode_timer);
+ atomic_set(&mddev->active, 1);
+ atomic_set(&mddev->openers, 0);
+ atomic_set(&mddev->active_io, 0);
+ spin_lock_init(&mddev->lock);
+ atomic_set(&mddev->flush_pending, 0);
+ init_waitqueue_head(&mddev->sb_wait);
+ init_waitqueue_head(&mddev->recovery_wait);
+ mddev->reshape_position = MaxSector;
+ mddev->reshape_backwards = 0;
+ mddev->last_sync_action = "none";
+ mddev->resync_min = 0;
+ mddev->resync_max = MaxSector;
+ mddev->level = LEVEL_NONE;
+}
+EXPORT_SYMBOL_GPL(mddev_init);
+
+static struct mddev *mddev_find(dev_t unit)
+{
+ struct mddev *mddev, *new = NULL;
+
+ if (unit && MAJOR(unit) != MD_MAJOR)
+ unit &= ~((1<<MdpMinorShift)-1);
+
+ retry:
+ spin_lock(&all_mddevs_lock);
+
+ if (unit) {
+ list_for_each_entry(mddev, &all_mddevs, all_mddevs)
+ if (mddev->unit == unit) {
+ mddev_get(mddev);
+ spin_unlock(&all_mddevs_lock);
+ kfree(new);
+ return mddev;
+ }
+
+ if (new) {
+ list_add(&new->all_mddevs, &all_mddevs);
+ spin_unlock(&all_mddevs_lock);
+ new->hold_active = UNTIL_IOCTL;
+ return new;
+ }
+ } else if (new) {
+ /* find an unused unit number */
+ static int next_minor = 512;
+ int start = next_minor;
+ int is_free = 0;
+ int dev = 0;
+ while (!is_free) {
+ dev = MKDEV(MD_MAJOR, next_minor);
+ next_minor++;
+ if (next_minor > MINORMASK)
+ next_minor = 0;
+ if (next_minor == start) {
+ /* Oh dear, all in use. */
+ spin_unlock(&all_mddevs_lock);
+ kfree(new);
+ return NULL;
+ }
+
+ is_free = 1;
+ list_for_each_entry(mddev, &all_mddevs, all_mddevs)
+ if (mddev->unit == dev) {
+ is_free = 0;
+ break;
+ }
+ }
+ new->unit = dev;
+ new->md_minor = MINOR(dev);
+ new->hold_active = UNTIL_STOP;
+ list_add(&new->all_mddevs, &all_mddevs);
+ spin_unlock(&all_mddevs_lock);
+ return new;
+ }
+ spin_unlock(&all_mddevs_lock);
+
+ new = kzalloc(sizeof(*new), GFP_KERNEL);
+ if (!new)
+ return NULL;
+
+ new->unit = unit;
+ if (MAJOR(unit) == MD_MAJOR)
+ new->md_minor = MINOR(unit);
+ else
+ new->md_minor = MINOR(unit) >> MdpMinorShift;
+
+ mddev_init(new);
+
+ goto retry;
+}
+
+static struct attribute_group md_redundancy_group;
+
+void mddev_unlock(struct mddev *mddev)
+{
+ if (mddev->to_remove) {
+ /* These cannot be removed under reconfig_mutex as
+ * an access to the files will try to take reconfig_mutex
+ * while holding the file unremovable, which leads to
+ * a deadlock.
+ * So hold set sysfs_active while the remove in happeing,
+ * and anything else which might set ->to_remove or my
+ * otherwise change the sysfs namespace will fail with
+ * -EBUSY if sysfs_active is still set.
+ * We set sysfs_active under reconfig_mutex and elsewhere
+ * test it under the same mutex to ensure its correct value
+ * is seen.
+ */
+ struct attribute_group *to_remove = mddev->to_remove;
+ mddev->to_remove = NULL;
+ mddev->sysfs_active = 1;
+ mutex_unlock(&mddev->reconfig_mutex);
+
+ if (mddev->kobj.sd) {
+ if (to_remove != &md_redundancy_group)
+ sysfs_remove_group(&mddev->kobj, to_remove);
+ if (mddev->pers == NULL ||
+ mddev->pers->sync_request == NULL) {
+ sysfs_remove_group(&mddev->kobj, &md_redundancy_group);
+ if (mddev->sysfs_action)
+ sysfs_put(mddev->sysfs_action);
+ mddev->sysfs_action = NULL;
+ }
+ }
+ mddev->sysfs_active = 0;
+ } else
+ mutex_unlock(&mddev->reconfig_mutex);
+
+ /* As we've dropped the mutex we need a spinlock to
+ * make sure the thread doesn't disappear
+ */
+ spin_lock(&pers_lock);
+ md_wakeup_thread(mddev->thread);
+ spin_unlock(&pers_lock);
+}
+EXPORT_SYMBOL_GPL(mddev_unlock);
+
+struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr)
+{
+ struct md_rdev *rdev;
+
+ rdev_for_each_rcu(rdev, mddev)
+ if (rdev->desc_nr == nr)
+ return rdev;
+
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(md_find_rdev_nr_rcu);
+
+static struct md_rdev *find_rdev(struct mddev *mddev, dev_t dev)
+{
+ struct md_rdev *rdev;
+
+ rdev_for_each(rdev, mddev)
+ if (rdev->bdev->bd_dev == dev)
+ return rdev;
+
+ return NULL;
+}
+
+static struct md_rdev *find_rdev_rcu(struct mddev *mddev, dev_t dev)
+{
+ struct md_rdev *rdev;
+
+ rdev_for_each_rcu(rdev, mddev)
+ if (rdev->bdev->bd_dev == dev)
+ return rdev;
+
+ return NULL;
+}
+
+static struct md_personality *find_pers(int level, char *clevel)
+{
+ struct md_personality *pers;
+ list_for_each_entry(pers, &pers_list, list) {
+ if (level != LEVEL_NONE && pers->level == level)
+ return pers;
+ if (strcmp(pers->name, clevel)==0)
+ return pers;
+ }
+ return NULL;
+}
+
+/* return the offset of the super block in 512byte sectors */
+static inline sector_t calc_dev_sboffset(struct md_rdev *rdev)
+{
+ sector_t num_sectors = i_size_read(rdev->bdev->bd_inode) / 512;
+ return MD_NEW_SIZE_SECTORS(num_sectors);
+}
+
+static int alloc_disk_sb(struct md_rdev *rdev)
+{
+ rdev->sb_page = alloc_page(GFP_KERNEL);
+ if (!rdev->sb_page) {
+ printk(KERN_ALERT "md: out of memory.\n");
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+void md_rdev_clear(struct md_rdev *rdev)
+{
+ if (rdev->sb_page) {
+ put_page(rdev->sb_page);
+ rdev->sb_loaded = 0;
+ rdev->sb_page = NULL;
+ rdev->sb_start = 0;
+ rdev->sectors = 0;
+ }
+ if (rdev->bb_page) {
+ put_page(rdev->bb_page);
+ rdev->bb_page = NULL;
+ }
+ kfree(rdev->badblocks.page);
+ rdev->badblocks.page = NULL;
+}
+EXPORT_SYMBOL_GPL(md_rdev_clear);
+
+static void super_written(struct bio *bio, int error)
+{
+ struct md_rdev *rdev = bio->bi_private;
+ struct mddev *mddev = rdev->mddev;
+
+ if (error || !test_bit(BIO_UPTODATE, &bio->bi_flags)) {
+ printk("md: super_written gets error=%d, uptodate=%d\n",
+ error, test_bit(BIO_UPTODATE, &bio->bi_flags));
+ WARN_ON(test_bit(BIO_UPTODATE, &bio->bi_flags));
+ md_error(mddev, rdev);
+ }
+
+ if (atomic_dec_and_test(&mddev->pending_writes))
+ wake_up(&mddev->sb_wait);
+ bio_put(bio);
+}
+
+void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
+ sector_t sector, int size, struct page *page)
+{
+ /* write first size bytes of page to sector of rdev
+ * Increment mddev->pending_writes before returning
+ * and decrement it on completion, waking up sb_wait
+ * if zero is reached.
+ * If an error occurred, call md_error
+ */
+ struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, mddev);
+
+ bio->bi_bdev = rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev;
+ bio->bi_iter.bi_sector = sector;
+ bio_add_page(bio, page, size, 0);
+ bio->bi_private = rdev;
+ bio->bi_end_io = super_written;
+
+ atomic_inc(&mddev->pending_writes);
+ submit_bio(WRITE_FLUSH_FUA, bio);
+}
+
+void md_super_wait(struct mddev *mddev)
+{
+ /* wait for all superblock writes that were scheduled to complete */
+ wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes)==0);
+}
+
+int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
+ struct page *page, int rw, bool metadata_op)
+{
+ struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, rdev->mddev);
+ int ret;
+
+ bio->bi_bdev = (metadata_op && rdev->meta_bdev) ?
+ rdev->meta_bdev : rdev->bdev;
+ if (metadata_op)
+ bio->bi_iter.bi_sector = sector + rdev->sb_start;
+ else if (rdev->mddev->reshape_position != MaxSector &&
+ (rdev->mddev->reshape_backwards ==
+ (sector >= rdev->mddev->reshape_position)))
+ bio->bi_iter.bi_sector = sector + rdev->new_data_offset;
+ else
+ bio->bi_iter.bi_sector = sector + rdev->data_offset;
+ bio_add_page(bio, page, size, 0);
+ submit_bio_wait(rw, bio);
+
+ ret = test_bit(BIO_UPTODATE, &bio->bi_flags);
+ bio_put(bio);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(sync_page_io);
+
+static int read_disk_sb(struct md_rdev *rdev, int size)
+{
+ char b[BDEVNAME_SIZE];
+
+ if (rdev->sb_loaded)
+ return 0;
+
+ if (!sync_page_io(rdev, 0, size, rdev->sb_page, READ, true))
+ goto fail;
+ rdev->sb_loaded = 1;
+ return 0;
+
+fail:
+ printk(KERN_WARNING "md: disabled device %s, could not read superblock.\n",
+ bdevname(rdev->bdev,b));
+ return -EINVAL;
+}
+
+static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2)
+{
+ return sb1->set_uuid0 == sb2->set_uuid0 &&
+ sb1->set_uuid1 == sb2->set_uuid1 &&
+ sb1->set_uuid2 == sb2->set_uuid2 &&
+ sb1->set_uuid3 == sb2->set_uuid3;
+}
+
+static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
+{
+ int ret;
+ mdp_super_t *tmp1, *tmp2;
+
+ tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL);
+ tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL);
+
+ if (!tmp1 || !tmp2) {
+ ret = 0;
+ printk(KERN_INFO "md.c sb_equal(): failed to allocate memory!\n");
+ goto abort;
+ }
+
+ *tmp1 = *sb1;
+ *tmp2 = *sb2;
+
+ /*
+ * nr_disks is not constant
+ */
+ tmp1->nr_disks = 0;
+ tmp2->nr_disks = 0;
+
+ ret = (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4) == 0);
+abort:
+ kfree(tmp1);
+ kfree(tmp2);
+ return ret;
+}
+
+static u32 md_csum_fold(u32 csum)
+{
+ csum = (csum & 0xffff) + (csum >> 16);
+ return (csum & 0xffff) + (csum >> 16);
+}
+
+static unsigned int calc_sb_csum(mdp_super_t *sb)
+{
+ u64 newcsum = 0;
+ u32 *sb32 = (u32*)sb;
+ int i;
+ unsigned int disk_csum, csum;
+
+ disk_csum = sb->sb_csum;
+ sb->sb_csum = 0;
+
+ for (i = 0; i < MD_SB_BYTES/4 ; i++)
+ newcsum += sb32[i];
+ csum = (newcsum & 0xffffffff) + (newcsum>>32);
+
+#ifdef CONFIG_ALPHA
+ /* This used to use csum_partial, which was wrong for several
+ * reasons including that different results are returned on
+ * different architectures. It isn't critical that we get exactly
+ * the same return value as before (we always csum_fold before
+ * testing, and that removes any differences). However as we
+ * know that csum_partial always returned a 16bit value on
+ * alphas, do a fold to maximise conformity to previous behaviour.
+ */
+ sb->sb_csum = md_csum_fold(disk_csum);
+#else
+ sb->sb_csum = disk_csum;
+#endif
+ return csum;
+}
+
+/*
+ * Handle superblock details.
+ * We want to be able to handle multiple superblock formats
+ * so we have a common interface to them all, and an array of
+ * different handlers.
+ * We rely on user-space to write the initial superblock, and support
+ * reading and updating of superblocks.
+ * Interface methods are:
+ * int load_super(struct md_rdev *dev, struct md_rdev *refdev, int minor_version)
+ * loads and validates a superblock on dev.
+ * if refdev != NULL, compare superblocks on both devices
+ * Return:
+ * 0 - dev has a superblock that is compatible with refdev
+ * 1 - dev has a superblock that is compatible and newer than refdev
+ * so dev should be used as the refdev in future
+ * -EINVAL superblock incompatible or invalid
+ * -othererror e.g. -EIO
+ *
+ * int validate_super(struct mddev *mddev, struct md_rdev *dev)
+ * Verify that dev is acceptable into mddev.
+ * The first time, mddev->raid_disks will be 0, and data from
+ * dev should be merged in. Subsequent calls check that dev
+ * is new enough. Return 0 or -EINVAL
+ *
+ * void sync_super(struct mddev *mddev, struct md_rdev *dev)
+ * Update the superblock for rdev with data in mddev
+ * This does not write to disc.
+ *
+ */
+
+struct super_type {
+ char *name;
+ struct module *owner;
+ int (*load_super)(struct md_rdev *rdev,
+ struct md_rdev *refdev,
+ int minor_version);
+ int (*validate_super)(struct mddev *mddev,
+ struct md_rdev *rdev);
+ void (*sync_super)(struct mddev *mddev,
+ struct md_rdev *rdev);
+ unsigned long long (*rdev_size_change)(struct md_rdev *rdev,
+ sector_t num_sectors);
+ int (*allow_new_offset)(struct md_rdev *rdev,
+ unsigned long long new_offset);
+};
+
+/*
+ * Check that the given mddev has no bitmap.
+ *
+ * This function is called from the run method of all personalities that do not
+ * support bitmaps. It prints an error message and returns non-zero if mddev
+ * has a bitmap. Otherwise, it returns 0.
+ *
+ */
+int md_check_no_bitmap(struct mddev *mddev)
+{
+ if (!mddev->bitmap_info.file && !mddev->bitmap_info.offset)
+ return 0;
+ printk(KERN_ERR "%s: bitmaps are not supported for %s\n",
+ mdname(mddev), mddev->pers->name);
+ return 1;
+}
+EXPORT_SYMBOL(md_check_no_bitmap);
+
+/*
+ * load_super for 0.90.0
+ */
+static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version)
+{
+ char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
+ mdp_super_t *sb;
+ int ret;
+
+ /*
+ * Calculate the position of the superblock (512byte sectors),
+ * it's at the end of the disk.
+ *
+ * It also happens to be a multiple of 4Kb.
+ */
+ rdev->sb_start = calc_dev_sboffset(rdev);
+
+ ret = read_disk_sb(rdev, MD_SB_BYTES);
+ if (ret) return ret;
+
+ ret = -EINVAL;
+
+ bdevname(rdev->bdev, b);
+ sb = page_address(rdev->sb_page);
+
+ if (sb->md_magic != MD_SB_MAGIC) {
+ printk(KERN_ERR "md: invalid raid superblock magic on %s\n",
+ b);
+ goto abort;
+ }
+
+ if (sb->major_version != 0 ||
+ sb->minor_version < 90 ||
+ sb->minor_version > 91) {
+ printk(KERN_WARNING "Bad version number %d.%d on %s\n",
+ sb->major_version, sb->minor_version,
+ b);
+ goto abort;
+ }
+
+ if (sb->raid_disks <= 0)
+ goto abort;
+
+ if (md_csum_fold(calc_sb_csum(sb)) != md_csum_fold(sb->sb_csum)) {
+ printk(KERN_WARNING "md: invalid superblock checksum on %s\n",
+ b);
+ goto abort;
+ }
+
+ rdev->preferred_minor = sb->md_minor;
+ rdev->data_offset = 0;
+ rdev->new_data_offset = 0;
+ rdev->sb_size = MD_SB_BYTES;
+ rdev->badblocks.shift = -1;
+
+ if (sb->level == LEVEL_MULTIPATH)
+ rdev->desc_nr = -1;
+ else
+ rdev->desc_nr = sb->this_disk.number;
+
+ if (!refdev) {
+ ret = 1;
+ } else {
+ __u64 ev1, ev2;
+ mdp_super_t *refsb = page_address(refdev->sb_page);
+ if (!uuid_equal(refsb, sb)) {
+ printk(KERN_WARNING "md: %s has different UUID to %s\n",
+ b, bdevname(refdev->bdev,b2));
+ goto abort;
+ }
+ if (!sb_equal(refsb, sb)) {
+ printk(KERN_WARNING "md: %s has same UUID"
+ " but different superblock to %s\n",
+ b, bdevname(refdev->bdev, b2));
+ goto abort;
+ }
+ ev1 = md_event(sb);
+ ev2 = md_event(refsb);
+ if (ev1 > ev2)
+ ret = 1;
+ else
+ ret = 0;
+ }
+ rdev->sectors = rdev->sb_start;
+ /* Limit to 4TB as metadata cannot record more than that.
+ * (not needed for Linear and RAID0 as metadata doesn't
+ * record this size)
+ */
+ if (rdev->sectors >= (2ULL << 32) && sb->level >= 1)
+ rdev->sectors = (2ULL << 32) - 2;
+
+ if (rdev->sectors < ((sector_t)sb->size) * 2 && sb->level >= 1)
+ /* "this cannot possibly happen" ... */
+ ret = -EINVAL;
+
+ abort:
+ return ret;
+}
+
+/*
+ * validate_super for 0.90.0
+ */
+static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev)
+{
+ mdp_disk_t *desc;
+ mdp_super_t *sb = page_address(rdev->sb_page);
+ __u64 ev1 = md_event(sb);
+
+ rdev->raid_disk = -1;
+ clear_bit(Faulty, &rdev->flags);
+ clear_bit(In_sync, &rdev->flags);
+ clear_bit(Bitmap_sync, &rdev->flags);
+ clear_bit(WriteMostly, &rdev->flags);
+
+ if (mddev->raid_disks == 0) {
+ mddev->major_version = 0;
+ mddev->minor_version = sb->minor_version;
+ mddev->patch_version = sb->patch_version;
+ mddev->external = 0;
+ mddev->chunk_sectors = sb->chunk_size >> 9;
+ mddev->ctime = sb->ctime;
+ mddev->utime = sb->utime;
+ mddev->level = sb->level;
+ mddev->clevel[0] = 0;
+ mddev->layout = sb->layout;
+ mddev->raid_disks = sb->raid_disks;
+ mddev->dev_sectors = ((sector_t)sb->size) * 2;
+ mddev->events = ev1;
+ mddev->bitmap_info.offset = 0;
+ mddev->bitmap_info.space = 0;
+ /* bitmap can use 60 K after the 4K superblocks */
+ mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9;
+ mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9);
+ mddev->reshape_backwards = 0;
+
+ if (mddev->minor_version >= 91) {
+ mddev->reshape_position = sb->reshape_position;
+ mddev->delta_disks = sb->delta_disks;
+ mddev->new_level = sb->new_level;
+ mddev->new_layout = sb->new_layout;
+ mddev->new_chunk_sectors = sb->new_chunk >> 9;
+ if (mddev->delta_disks < 0)
+ mddev->reshape_backwards = 1;
+ } else {
+ mddev->reshape_position = MaxSector;
+ mddev->delta_disks = 0;
+ mddev->new_level = mddev->level;
+ mddev->new_layout = mddev->layout;
+ mddev->new_chunk_sectors = mddev->chunk_sectors;
+ }
+
+ if (sb->state & (1<<MD_SB_CLEAN))
+ mddev->recovery_cp = MaxSector;
+ else {
+ if (sb->events_hi == sb->cp_events_hi &&
+ sb->events_lo == sb->cp_events_lo) {
+ mddev->recovery_cp = sb->recovery_cp;
+ } else
+ mddev->recovery_cp = 0;
+ }
+
+ memcpy(mddev->uuid+0, &sb->set_uuid0, 4);
+ memcpy(mddev->uuid+4, &sb->set_uuid1, 4);
+ memcpy(mddev->uuid+8, &sb->set_uuid2, 4);
+ memcpy(mddev->uuid+12,&sb->set_uuid3, 4);
+
+ mddev->max_disks = MD_SB_DISKS;
+
+ if (sb->state & (1<<MD_SB_BITMAP_PRESENT) &&
+ mddev->bitmap_info.file == NULL) {
+ mddev->bitmap_info.offset =
+ mddev->bitmap_info.default_offset;
+ mddev->bitmap_info.space =
+ mddev->bitmap_info.default_space;
+ }
+
+ } else if (mddev->pers == NULL) {
+ /* Insist on good event counter while assembling, except
+ * for spares (which don't need an event count) */
+ ++ev1;
+ if (sb->disks[rdev->desc_nr].state & (
+ (1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE)))
+ if (ev1 < mddev->events)
+ return -EINVAL;
+ } else if (mddev->bitmap) {
+ /* if adding to array with a bitmap, then we can accept an
+ * older device ... but not too old.
+ */
+ if (ev1 < mddev->bitmap->events_cleared)
+ return 0;
+ if (ev1 < mddev->events)
+ set_bit(Bitmap_sync, &rdev->flags);
+ } else {
+ if (ev1 < mddev->events)
+ /* just a hot-add of a new device, leave raid_disk at -1 */
+ return 0;
+ }
+
+ if (mddev->level != LEVEL_MULTIPATH) {
+ desc = sb->disks + rdev->desc_nr;
+
+ if (desc->state & (1<<MD_DISK_FAULTY))
+ set_bit(Faulty, &rdev->flags);
+ else if (desc->state & (1<<MD_DISK_SYNC) /* &&
+ desc->raid_disk < mddev->raid_disks */) {
+ set_bit(In_sync, &rdev->flags);
+ rdev->raid_disk = desc->raid_disk;
+ rdev->saved_raid_disk = desc->raid_disk;
+ } else if (desc->state & (1<<MD_DISK_ACTIVE)) {
+ /* active but not in sync implies recovery up to
+ * reshape position. We don't know exactly where
+ * that is, so set to zero for now */
+ if (mddev->minor_version >= 91) {
+ rdev->recovery_offset = 0;
+ rdev->raid_disk = desc->raid_disk;
+ }
+ }
+ if (desc->state & (1<<MD_DISK_WRITEMOSTLY))
+ set_bit(WriteMostly, &rdev->flags);
+ } else /* MULTIPATH are always insync */
+ set_bit(In_sync, &rdev->flags);
+ return 0;
+}
+
+/*
+ * sync_super for 0.90.0
+ */
+static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev)
+{
+ mdp_super_t *sb;
+ struct md_rdev *rdev2;
+ int next_spare = mddev->raid_disks;
+
+ /* make rdev->sb match mddev data..
+ *
+ * 1/ zero out disks
+ * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare);
+ * 3/ any empty disks < next_spare become removed
+ *
+ * disks[0] gets initialised to REMOVED because
+ * we cannot be sure from other fields if it has
+ * been initialised or not.
+ */
+ int i;
+ int active=0, working=0,failed=0,spare=0,nr_disks=0;
+
+ rdev->sb_size = MD_SB_BYTES;
+
+ sb = page_address(rdev->sb_page);
+
+ memset(sb, 0, sizeof(*sb));
+
+ sb->md_magic = MD_SB_MAGIC;
+ sb->major_version = mddev->major_version;
+ sb->patch_version = mddev->patch_version;
+ sb->gvalid_words = 0; /* ignored */
+ memcpy(&sb->set_uuid0, mddev->uuid+0, 4);
+ memcpy(&sb->set_uuid1, mddev->uuid+4, 4);
+ memcpy(&sb->set_uuid2, mddev->uuid+8, 4);
+ memcpy(&sb->set_uuid3, mddev->uuid+12,4);
+
+ sb->ctime = mddev->ctime;
+ sb->level = mddev->level;
+ sb->size = mddev->dev_sectors / 2;
+ sb->raid_disks = mddev->raid_disks;
+ sb->md_minor = mddev->md_minor;
+ sb->not_persistent = 0;
+ sb->utime = mddev->utime;
+ sb->state = 0;
+ sb->events_hi = (mddev->events>>32);
+ sb->events_lo = (u32)mddev->events;
+
+ if (mddev->reshape_position == MaxSector)
+ sb->minor_version = 90;
+ else {
+ sb->minor_version = 91;
+ sb->reshape_position = mddev->reshape_position;
+ sb->new_level = mddev->new_level;
+ sb->delta_disks = mddev->delta_disks;
+ sb->new_layout = mddev->new_layout;
+ sb->new_chunk = mddev->new_chunk_sectors << 9;
+ }
+ mddev->minor_version = sb->minor_version;
+ if (mddev->in_sync)
+ {
+ sb->recovery_cp = mddev->recovery_cp;
+ sb->cp_events_hi = (mddev->events>>32);
+ sb->cp_events_lo = (u32)mddev->events;
+ if (mddev->recovery_cp == MaxSector)
+ sb->state = (1<< MD_SB_CLEAN);
+ } else
+ sb->recovery_cp = 0;
+
+ sb->layout = mddev->layout;
+ sb->chunk_size = mddev->chunk_sectors << 9;
+
+ if (mddev->bitmap && mddev->bitmap_info.file == NULL)
+ sb->state |= (1<<MD_SB_BITMAP_PRESENT);
+
+ sb->disks[0].state = (1<<MD_DISK_REMOVED);
+ rdev_for_each(rdev2, mddev) {
+ mdp_disk_t *d;
+ int desc_nr;
+ int is_active = test_bit(In_sync, &rdev2->flags);
+
+ if (rdev2->raid_disk >= 0 &&
+ sb->minor_version >= 91)
+ /* we have nowhere to store the recovery_offset,
+ * but if it is not below the reshape_position,
+ * we can piggy-back on that.
+ */
+ is_active = 1;
+ if (rdev2->raid_disk < 0 ||
+ test_bit(Faulty, &rdev2->flags))
+ is_active = 0;
+ if (is_active)
+ desc_nr = rdev2->raid_disk;
+ else
+ desc_nr = next_spare++;
+ rdev2->desc_nr = desc_nr;
+ d = &sb->disks[rdev2->desc_nr];
+ nr_disks++;
+ d->number = rdev2->desc_nr;
+ d->major = MAJOR(rdev2->bdev->bd_dev);
+ d->minor = MINOR(rdev2->bdev->bd_dev);
+ if (is_active)
+ d->raid_disk = rdev2->raid_disk;
+ else
+ d->raid_disk = rdev2->desc_nr; /* compatibility */
+ if (test_bit(Faulty, &rdev2->flags))
+ d->state = (1<<MD_DISK_FAULTY);
+ else if (is_active) {
+ d->state = (1<<MD_DISK_ACTIVE);
+ if (test_bit(In_sync, &rdev2->flags))
+ d->state |= (1<<MD_DISK_SYNC);
+ active++;
+ working++;
+ } else {
+ d->state = 0;
+ spare++;
+ working++;
+ }
+ if (test_bit(WriteMostly, &rdev2->flags))
+ d->state |= (1<<MD_DISK_WRITEMOSTLY);
+ }
+ /* now set the "removed" and "faulty" bits on any missing devices */
+ for (i=0 ; i < mddev->raid_disks ; i++) {
+ mdp_disk_t *d = &sb->disks[i];
+ if (d->state == 0 && d->number == 0) {
+ d->number = i;
+ d->raid_disk = i;
+ d->state = (1<<MD_DISK_REMOVED);
+ d->state |= (1<<MD_DISK_FAULTY);
+ failed++;
+ }
+ }
+ sb->nr_disks = nr_disks;
+ sb->active_disks = active;
+ sb->working_disks = working;
+ sb->failed_disks = failed;
+ sb->spare_disks = spare;
+
+ sb->this_disk = sb->disks[rdev->desc_nr];
+ sb->sb_csum = calc_sb_csum(sb);
+}
+
+/*
+ * rdev_size_change for 0.90.0
+ */
+static unsigned long long
+super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
+{
+ if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
+ return 0; /* component must fit device */
+ if (rdev->mddev->bitmap_info.offset)
+ return 0; /* can't move bitmap */
+ rdev->sb_start = calc_dev_sboffset(rdev);
+ if (!num_sectors || num_sectors > rdev->sb_start)
+ num_sectors = rdev->sb_start;
+ /* Limit to 4TB as metadata cannot record more than that.
+ * 4TB == 2^32 KB, or 2*2^32 sectors.
+ */
+ if (num_sectors >= (2ULL << 32) && rdev->mddev->level >= 1)
+ num_sectors = (2ULL << 32) - 2;
+ md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
+ rdev->sb_page);
+ md_super_wait(rdev->mddev);
+ return num_sectors;
+}
+
+static int
+super_90_allow_new_offset(struct md_rdev *rdev, unsigned long long new_offset)
+{
+ /* non-zero offset changes not possible with v0.90 */
+ return new_offset == 0;
+}
+
+/*
+ * version 1 superblock
+ */
+
+static __le32 calc_sb_1_csum(struct mdp_superblock_1 *sb)
+{
+ __le32 disk_csum;
+ u32 csum;
+ unsigned long long newcsum;
+ int size = 256 + le32_to_cpu(sb->max_dev)*2;
+ __le32 *isuper = (__le32*)sb;
+
+ disk_csum = sb->sb_csum;
+ sb->sb_csum = 0;
+ newcsum = 0;
+ for (; size >= 4; size -= 4)
+ newcsum += le32_to_cpu(*isuper++);
+
+ if (size == 2)
+ newcsum += le16_to_cpu(*(__le16*) isuper);
+
+ csum = (newcsum & 0xffffffff) + (newcsum >> 32);
+ sb->sb_csum = disk_csum;
+ return cpu_to_le32(csum);
+}
+
+static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors,
+ int acknowledged);
+static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version)
+{
+ struct mdp_superblock_1 *sb;
+ int ret;
+ sector_t sb_start;
+ sector_t sectors;
+ char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
+ int bmask;
+
+ /*
+ * Calculate the position of the superblock in 512byte sectors.
+ * It is always aligned to a 4K boundary and
+ * depeding on minor_version, it can be:
+ * 0: At least 8K, but less than 12K, from end of device
+ * 1: At start of device
+ * 2: 4K from start of device.
+ */
+ switch(minor_version) {
+ case 0:
+ sb_start = i_size_read(rdev->bdev->bd_inode) >> 9;
+ sb_start -= 8*2;
+ sb_start &= ~(sector_t)(4*2-1);
+ break;
+ case 1:
+ sb_start = 0;
+ break;
+ case 2:
+ sb_start = 8;
+ break;
+ default:
+ return -EINVAL;
+ }
+ rdev->sb_start = sb_start;
+
+ /* superblock is rarely larger than 1K, but it can be larger,
+ * and it is safe to read 4k, so we do that
+ */
+ ret = read_disk_sb(rdev, 4096);
+ if (ret) return ret;
+
+ sb = page_address(rdev->sb_page);
+
+ if (sb->magic != cpu_to_le32(MD_SB_MAGIC) ||
+ sb->major_version != cpu_to_le32(1) ||
+ le32_to_cpu(sb->max_dev) > (4096-256)/2 ||
+ le64_to_cpu(sb->super_offset) != rdev->sb_start ||
+ (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0)
+ return -EINVAL;
+
+ if (calc_sb_1_csum(sb) != sb->sb_csum) {
+ printk("md: invalid superblock checksum on %s\n",
+ bdevname(rdev->bdev,b));
+ return -EINVAL;
+ }
+ if (le64_to_cpu(sb->data_size) < 10) {
+ printk("md: data_size too small on %s\n",
+ bdevname(rdev->bdev,b));
+ return -EINVAL;
+ }
+ if (sb->pad0 ||
+ sb->pad3[0] ||
+ memcmp(sb->pad3, sb->pad3+1, sizeof(sb->pad3) - sizeof(sb->pad3[1])))
+ /* Some padding is non-zero, might be a new feature */
+ return -EINVAL;
+
+ rdev->preferred_minor = 0xffff;
+ rdev->data_offset = le64_to_cpu(sb->data_offset);
+ rdev->new_data_offset = rdev->data_offset;
+ if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE) &&
+ (le32_to_cpu(sb->feature_map) & MD_FEATURE_NEW_OFFSET))
+ rdev->new_data_offset += (s32)le32_to_cpu(sb->new_offset);
+ atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read));
+
+ rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256;
+ bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1;
+ if (rdev->sb_size & bmask)
+ rdev->sb_size = (rdev->sb_size | bmask) + 1;
+
+ if (minor_version
+ && rdev->data_offset < sb_start + (rdev->sb_size/512))
+ return -EINVAL;
+ if (minor_version
+ && rdev->new_data_offset < sb_start + (rdev->sb_size/512))
+ return -EINVAL;
+
+ if (sb->level == cpu_to_le32(LEVEL_MULTIPATH))
+ rdev->desc_nr = -1;
+ else
+ rdev->desc_nr = le32_to_cpu(sb->dev_number);
+
+ if (!rdev->bb_page) {
+ rdev->bb_page = alloc_page(GFP_KERNEL);
+ if (!rdev->bb_page)
+ return -ENOMEM;
+ }
+ if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BAD_BLOCKS) &&
+ rdev->badblocks.count == 0) {
+ /* need to load the bad block list.
+ * Currently we limit it to one page.
+ */
+ s32 offset;
+ sector_t bb_sector;
+ u64 *bbp;
+ int i;
+ int sectors = le16_to_cpu(sb->bblog_size);
+ if (sectors > (PAGE_SIZE / 512))
+ return -EINVAL;
+ offset = le32_to_cpu(sb->bblog_offset);
+ if (offset == 0)
+ return -EINVAL;
+ bb_sector = (long long)offset;
+ if (!sync_page_io(rdev, bb_sector, sectors << 9,
+ rdev->bb_page, READ, true))
+ return -EIO;
+ bbp = (u64 *)page_address(rdev->bb_page);
+ rdev->badblocks.shift = sb->bblog_shift;
+ for (i = 0 ; i < (sectors << (9-3)) ; i++, bbp++) {
+ u64 bb = le64_to_cpu(*bbp);
+ int count = bb & (0x3ff);
+ u64 sector = bb >> 10;
+ sector <<= sb->bblog_shift;
+ count <<= sb->bblog_shift;
+ if (bb + 1 == 0)
+ break;
+ if (md_set_badblocks(&rdev->badblocks,
+ sector, count, 1) == 0)
+ return -EINVAL;
+ }
+ } else if (sb->bblog_offset != 0)
+ rdev->badblocks.shift = 0;
+
+ if (!refdev) {
+ ret = 1;
+ } else {
+ __u64 ev1, ev2;
+ struct mdp_superblock_1 *refsb = page_address(refdev->sb_page);
+
+ if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 ||
+ sb->level != refsb->level ||
+ sb->layout != refsb->layout ||
+ sb->chunksize != refsb->chunksize) {
+ printk(KERN_WARNING "md: %s has strangely different"
+ " superblock to %s\n",
+ bdevname(rdev->bdev,b),
+ bdevname(refdev->bdev,b2));
+ return -EINVAL;
+ }
+ ev1 = le64_to_cpu(sb->events);
+ ev2 = le64_to_cpu(refsb->events);
+
+ if (ev1 > ev2)
+ ret = 1;
+ else
+ ret = 0;
+ }
+ if (minor_version) {
+ sectors = (i_size_read(rdev->bdev->bd_inode) >> 9);
+ sectors -= rdev->data_offset;
+ } else
+ sectors = rdev->sb_start;
+ if (sectors < le64_to_cpu(sb->data_size))
+ return -EINVAL;
+ rdev->sectors = le64_to_cpu(sb->data_size);
+ return ret;
+}
+
+static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
+{
+ struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
+ __u64 ev1 = le64_to_cpu(sb->events);
+
+ rdev->raid_disk = -1;
+ clear_bit(Faulty, &rdev->flags);
+ clear_bit(In_sync, &rdev->flags);
+ clear_bit(Bitmap_sync, &rdev->flags);
+ clear_bit(WriteMostly, &rdev->flags);
+
+ if (mddev->raid_disks == 0) {
+ mddev->major_version = 1;
+ mddev->patch_version = 0;
+ mddev->external = 0;
+ mddev->chunk_sectors = le32_to_cpu(sb->chunksize);
+ mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1);
+ mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1);
+ mddev->level = le32_to_cpu(sb->level);
+ mddev->clevel[0] = 0;
+ mddev->layout = le32_to_cpu(sb->layout);
+ mddev->raid_disks = le32_to_cpu(sb->raid_disks);
+ mddev->dev_sectors = le64_to_cpu(sb->size);
+ mddev->events = ev1;
+ mddev->bitmap_info.offset = 0;
+ mddev->bitmap_info.space = 0;
+ /* Default location for bitmap is 1K after superblock
+ * using 3K - total of 4K
+ */
+ mddev->bitmap_info.default_offset = 1024 >> 9;
+ mddev->bitmap_info.default_space = (4096-1024) >> 9;
+ mddev->reshape_backwards = 0;
+
+ mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
+ memcpy(mddev->uuid, sb->set_uuid, 16);
+
+ mddev->max_disks = (4096-256)/2;
+
+ if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) &&
+ mddev->bitmap_info.file == NULL) {
+ mddev->bitmap_info.offset =
+ (__s32)le32_to_cpu(sb->bitmap_offset);
+ /* Metadata doesn't record how much space is available.
+ * For 1.0, we assume we can use up to the superblock
+ * if before, else to 4K beyond superblock.
+ * For others, assume no change is possible.
+ */
+ if (mddev->minor_version > 0)
+ mddev->bitmap_info.space = 0;
+ else if (mddev->bitmap_info.offset > 0)
+ mddev->bitmap_info.space =
+ 8 - mddev->bitmap_info.offset;
+ else
+ mddev->bitmap_info.space =
+ -mddev->bitmap_info.offset;
+ }
+
+ if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
+ mddev->reshape_position = le64_to_cpu(sb->reshape_position);
+ mddev->delta_disks = le32_to_cpu(sb->delta_disks);
+ mddev->new_level = le32_to_cpu(sb->new_level);
+ mddev->new_layout = le32_to_cpu(sb->new_layout);
+ mddev->new_chunk_sectors = le32_to_cpu(sb->new_chunk);
+ if (mddev->delta_disks < 0 ||
+ (mddev->delta_disks == 0 &&
+ (le32_to_cpu(sb->feature_map)
+ & MD_FEATURE_RESHAPE_BACKWARDS)))
+ mddev->reshape_backwards = 1;
+ } else {
+ mddev->reshape_position = MaxSector;
+ mddev->delta_disks = 0;
+ mddev->new_level = mddev->level;
+ mddev->new_layout = mddev->layout;
+ mddev->new_chunk_sectors = mddev->chunk_sectors;
+ }
+
+ } else if (mddev->pers == NULL) {
+ /* Insist of good event counter while assembling, except for
+ * spares (which don't need an event count) */
+ ++ev1;
+ if (rdev->desc_nr >= 0 &&
+ rdev->desc_nr < le32_to_cpu(sb->max_dev) &&
+ le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < 0xfffe)
+ if (ev1 < mddev->events)
+ return -EINVAL;
+ } else if (mddev->bitmap) {
+ /* If adding to array with a bitmap, then we can accept an
+ * older device, but not too old.
+ */
+ if (ev1 < mddev->bitmap->events_cleared)
+ return 0;
+ if (ev1 < mddev->events)
+ set_bit(Bitmap_sync, &rdev->flags);
+ } else {
+ if (ev1 < mddev->events)
+ /* just a hot-add of a new device, leave raid_disk at -1 */
+ return 0;
+ }
+ if (mddev->level != LEVEL_MULTIPATH) {
+ int role;
+ if (rdev->desc_nr < 0 ||
+ rdev->desc_nr >= le32_to_cpu(sb->max_dev)) {
+ role = 0xffff;
+ rdev->desc_nr = -1;
+ } else
+ role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
+ switch(role) {
+ case 0xffff: /* spare */
+ break;
+ case 0xfffe: /* faulty */
+ set_bit(Faulty, &rdev->flags);
+ break;
+ default:
+ rdev->saved_raid_disk = role;
+ if ((le32_to_cpu(sb->feature_map) &
+ MD_FEATURE_RECOVERY_OFFSET)) {
+ rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
+ if (!(le32_to_cpu(sb->feature_map) &
+ MD_FEATURE_RECOVERY_BITMAP))
+ rdev->saved_raid_disk = -1;
+ } else
+ set_bit(In_sync, &rdev->flags);
+ rdev->raid_disk = role;
+ break;
+ }
+ if (sb->devflags & WriteMostly1)
+ set_bit(WriteMostly, &rdev->flags);
+ if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT)
+ set_bit(Replacement, &rdev->flags);
+ } else /* MULTIPATH are always insync */
+ set_bit(In_sync, &rdev->flags);
+
+ return 0;
+}
+
+static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
+{
+ struct mdp_superblock_1 *sb;
+ struct md_rdev *rdev2;
+ int max_dev, i;
+ /* make rdev->sb match mddev and rdev data. */
+
+ sb = page_address(rdev->sb_page);
+
+ sb->feature_map = 0;
+ sb->pad0 = 0;
+ sb->recovery_offset = cpu_to_le64(0);
+ memset(sb->pad3, 0, sizeof(sb->pad3));
+
+ sb->utime = cpu_to_le64((__u64)mddev->utime);
+ sb->events = cpu_to_le64(mddev->events);
+ if (mddev->in_sync)
+ sb->resync_offset = cpu_to_le64(mddev->recovery_cp);
+ else
+ sb->resync_offset = cpu_to_le64(0);
+
+ sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors));
+
+ sb->raid_disks = cpu_to_le32(mddev->raid_disks);
+ sb->size = cpu_to_le64(mddev->dev_sectors);
+ sb->chunksize = cpu_to_le32(mddev->chunk_sectors);
+ sb->level = cpu_to_le32(mddev->level);
+ sb->layout = cpu_to_le32(mddev->layout);
+
+ if (test_bit(WriteMostly, &rdev->flags))
+ sb->devflags |= WriteMostly1;
+ else
+ sb->devflags &= ~WriteMostly1;
+ sb->data_offset = cpu_to_le64(rdev->data_offset);
+ sb->data_size = cpu_to_le64(rdev->sectors);
+
+ if (mddev->bitmap && mddev->bitmap_info.file == NULL) {
+ sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_info.offset);
+ sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET);
+ }
+
+ if (rdev->raid_disk >= 0 &&
+ !test_bit(In_sync, &rdev->flags)) {
+ sb->feature_map |=
+ cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET);
+ sb->recovery_offset =
+ cpu_to_le64(rdev->recovery_offset);
+ if (rdev->saved_raid_disk >= 0 && mddev->bitmap)
+ sb->feature_map |=
+ cpu_to_le32(MD_FEATURE_RECOVERY_BITMAP);
+ }
+ if (test_bit(Replacement, &rdev->flags))
+ sb->feature_map |=
+ cpu_to_le32(MD_FEATURE_REPLACEMENT);
+
+ if (mddev->reshape_position != MaxSector) {
+ sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE);
+ sb->reshape_position = cpu_to_le64(mddev->reshape_position);
+ sb->new_layout = cpu_to_le32(mddev->new_layout);
+ sb->delta_disks = cpu_to_le32(mddev->delta_disks);
+ sb->new_level = cpu_to_le32(mddev->new_level);
+ sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors);
+ if (mddev->delta_disks == 0 &&
+ mddev->reshape_backwards)
+ sb->feature_map
+ |= cpu_to_le32(MD_FEATURE_RESHAPE_BACKWARDS);
+ if (rdev->new_data_offset != rdev->data_offset) {
+ sb->feature_map
+ |= cpu_to_le32(MD_FEATURE_NEW_OFFSET);
+ sb->new_offset = cpu_to_le32((__u32)(rdev->new_data_offset
+ - rdev->data_offset));
+ }
+ }
+
+ if (rdev->badblocks.count == 0)
+ /* Nothing to do for bad blocks*/ ;
+ else if (sb->bblog_offset == 0)
+ /* Cannot record bad blocks on this device */
+ md_error(mddev, rdev);
+ else {
+ struct badblocks *bb = &rdev->badblocks;
+ u64 *bbp = (u64 *)page_address(rdev->bb_page);
+ u64 *p = bb->page;
+ sb->feature_map |= cpu_to_le32(MD_FEATURE_BAD_BLOCKS);
+ if (bb->changed) {
+ unsigned seq;
+
+retry:
+ seq = read_seqbegin(&bb->lock);
+
+ memset(bbp, 0xff, PAGE_SIZE);
+
+ for (i = 0 ; i < bb->count ; i++) {
+ u64 internal_bb = p[i];
+ u64 store_bb = ((BB_OFFSET(internal_bb) << 10)
+ | BB_LEN(internal_bb));
+ bbp[i] = cpu_to_le64(store_bb);
+ }
+ bb->changed = 0;
+ if (read_seqretry(&bb->lock, seq))
+ goto retry;
+
+ bb->sector = (rdev->sb_start +
+ (int)le32_to_cpu(sb->bblog_offset));
+ bb->size = le16_to_cpu(sb->bblog_size);
+ }
+ }
+
+ max_dev = 0;
+ rdev_for_each(rdev2, mddev)
+ if (rdev2->desc_nr+1 > max_dev)
+ max_dev = rdev2->desc_nr+1;
+
+ if (max_dev > le32_to_cpu(sb->max_dev)) {
+ int bmask;
+ sb->max_dev = cpu_to_le32(max_dev);
+ rdev->sb_size = max_dev * 2 + 256;
+ bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1;
+ if (rdev->sb_size & bmask)
+ rdev->sb_size = (rdev->sb_size | bmask) + 1;
+ } else
+ max_dev = le32_to_cpu(sb->max_dev);
+
+ for (i=0; i<max_dev;i++)
+ sb->dev_roles[i] = cpu_to_le16(0xfffe);
+
+ rdev_for_each(rdev2, mddev) {
+ i = rdev2->desc_nr;
+ if (test_bit(Faulty, &rdev2->flags))
+ sb->dev_roles[i] = cpu_to_le16(0xfffe);
+ else if (test_bit(In_sync, &rdev2->flags))
+ sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
+ else if (rdev2->raid_disk >= 0)
+ sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
+ else
+ sb->dev_roles[i] = cpu_to_le16(0xffff);
+ }
+
+ sb->sb_csum = calc_sb_1_csum(sb);
+}
+
+static unsigned long long
+super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
+{
+ struct mdp_superblock_1 *sb;
+ sector_t max_sectors;
+ if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
+ return 0; /* component must fit device */
+ if (rdev->data_offset != rdev->new_data_offset)
+ return 0; /* too confusing */
+ if (rdev->sb_start < rdev->data_offset) {
+ /* minor versions 1 and 2; superblock before data */
+ max_sectors = i_size_read(rdev->bdev->bd_inode) >> 9;
+ max_sectors -= rdev->data_offset;
+ if (!num_sectors || num_sectors > max_sectors)
+ num_sectors = max_sectors;
+ } else if (rdev->mddev->bitmap_info.offset) {
+ /* minor version 0 with bitmap we can't move */
+ return 0;
+ } else {
+ /* minor version 0; superblock after data */
+ sector_t sb_start;
+ sb_start = (i_size_read(rdev->bdev->bd_inode) >> 9) - 8*2;
+ sb_start &= ~(sector_t)(4*2 - 1);
+ max_sectors = rdev->sectors + sb_start - rdev->sb_start;
+ if (!num_sectors || num_sectors > max_sectors)
+ num_sectors = max_sectors;
+ rdev->sb_start = sb_start;
+ }
+ sb = page_address(rdev->sb_page);
+ sb->data_size = cpu_to_le64(num_sectors);
+ sb->super_offset = rdev->sb_start;
+ sb->sb_csum = calc_sb_1_csum(sb);
+ md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
+ rdev->sb_page);
+ md_super_wait(rdev->mddev);
+ return num_sectors;
+
+}
+
+static int
+super_1_allow_new_offset(struct md_rdev *rdev,
+ unsigned long long new_offset)
+{
+ /* All necessary checks on new >= old have been done */
+ struct bitmap *bitmap;
+ if (new_offset >= rdev->data_offset)
+ return 1;
+
+ /* with 1.0 metadata, there is no metadata to tread on
+ * so we can always move back */
+ if (rdev->mddev->minor_version == 0)
+ return 1;
+
+ /* otherwise we must be sure not to step on
+ * any metadata, so stay:
+ * 36K beyond start of superblock
+ * beyond end of badblocks
+ * beyond write-intent bitmap
+ */
+ if (rdev->sb_start + (32+4)*2 > new_offset)
+ return 0;
+ bitmap = rdev->mddev->bitmap;
+ if (bitmap && !rdev->mddev->bitmap_info.file &&
+ rdev->sb_start + rdev->mddev->bitmap_info.offset +
+ bitmap->storage.file_pages * (PAGE_SIZE>>9) > new_offset)
+ return 0;
+ if (rdev->badblocks.sector + rdev->badblocks.size > new_offset)
+ return 0;
+
+ return 1;
+}
+
+static struct super_type super_types[] = {
+ [0] = {
+ .name = "0.90.0",
+ .owner = THIS_MODULE,
+ .load_super = super_90_load,
+ .validate_super = super_90_validate,
+ .sync_super = super_90_sync,
+ .rdev_size_change = super_90_rdev_size_change,
+ .allow_new_offset = super_90_allow_new_offset,
+ },
+ [1] = {
+ .name = "md-1",
+ .owner = THIS_MODULE,
+ .load_super = super_1_load,
+ .validate_super = super_1_validate,
+ .sync_super = super_1_sync,
+ .rdev_size_change = super_1_rdev_size_change,
+ .allow_new_offset = super_1_allow_new_offset,
+ },
+};
+
+static void sync_super(struct mddev *mddev, struct md_rdev *rdev)
+{
+ if (mddev->sync_super) {
+ mddev->sync_super(mddev, rdev);
+ return;
+ }
+
+ BUG_ON(mddev->major_version >= ARRAY_SIZE(super_types));
+
+ super_types[mddev->major_version].sync_super(mddev, rdev);
+}
+
+static int match_mddev_units(struct mddev *mddev1, struct mddev *mddev2)
+{
+ struct md_rdev *rdev, *rdev2;
+
+ rcu_read_lock();
+ rdev_for_each_rcu(rdev, mddev1)
+ rdev_for_each_rcu(rdev2, mddev2)
+ if (rdev->bdev->bd_contains ==
+ rdev2->bdev->bd_contains) {
+ rcu_read_unlock();
+ return 1;
+ }
+ rcu_read_unlock();
+ return 0;
+}
+
+static LIST_HEAD(pending_raid_disks);
+
+/*
+ * Try to register data integrity profile for an mddev
+ *
+ * This is called when an array is started and after a disk has been kicked
+ * from the array. It only succeeds if all working and active component devices
+ * are integrity capable with matching profiles.
+ */
+int md_integrity_register(struct mddev *mddev)
+{
+ struct md_rdev *rdev, *reference = NULL;
+
+ if (list_empty(&mddev->disks))
+ return 0; /* nothing to do */
+ if (!mddev->gendisk || blk_get_integrity(mddev->gendisk))
+ return 0; /* shouldn't register, or already is */
+ rdev_for_each(rdev, mddev) {
+ /* skip spares and non-functional disks */
+ if (test_bit(Faulty, &rdev->flags))
+ continue;
+ if (rdev->raid_disk < 0)
+ continue;
+ if (!reference) {
+ /* Use the first rdev as the reference */
+ reference = rdev;
+ continue;
+ }
+ /* does this rdev's profile match the reference profile? */
+ if (blk_integrity_compare(reference->bdev->bd_disk,
+ rdev->bdev->bd_disk) < 0)
+ return -EINVAL;
+ }
+ if (!reference || !bdev_get_integrity(reference->bdev))
+ return 0;
+ /*
+ * All component devices are integrity capable and have matching
+ * profiles, register the common profile for the md device.
+ */
+ if (blk_integrity_register(mddev->gendisk,
+ bdev_get_integrity(reference->bdev)) != 0) {
+ printk(KERN_ERR "md: failed to register integrity for %s\n",
+ mdname(mddev));
+ return -EINVAL;
+ }
+ printk(KERN_NOTICE "md: data integrity enabled on %s\n", mdname(mddev));
+ if (bioset_integrity_create(mddev->bio_set, BIO_POOL_SIZE)) {
+ printk(KERN_ERR "md: failed to create integrity pool for %s\n",
+ mdname(mddev));
+ return -EINVAL;
+ }
+ return 0;
+}
+EXPORT_SYMBOL(md_integrity_register);
+
+/* Disable data integrity if non-capable/non-matching disk is being added */
+void md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev)
+{
+ struct blk_integrity *bi_rdev;
+ struct blk_integrity *bi_mddev;
+
+ if (!mddev->gendisk)
+ return;
+
+ bi_rdev = bdev_get_integrity(rdev->bdev);
+ bi_mddev = blk_get_integrity(mddev->gendisk);
+
+ if (!bi_mddev) /* nothing to do */
+ return;
+ if (rdev->raid_disk < 0) /* skip spares */
+ return;
+ if (bi_rdev && blk_integrity_compare(mddev->gendisk,
+ rdev->bdev->bd_disk) >= 0)
+ return;
+ printk(KERN_NOTICE "disabling data integrity on %s\n", mdname(mddev));
+ blk_integrity_unregister(mddev->gendisk);
+}
+EXPORT_SYMBOL(md_integrity_add_rdev);
+
+static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
+{
+ char b[BDEVNAME_SIZE];
+ struct kobject *ko;
+ char *s;
+ int err;
+
+ /* prevent duplicates */
+ if (find_rdev(mddev, rdev->bdev->bd_dev))
+ return -EEXIST;
+
+ /* make sure rdev->sectors exceeds mddev->dev_sectors */
+ if (rdev->sectors && (mddev->dev_sectors == 0 ||
+ rdev->sectors < mddev->dev_sectors)) {
+ if (mddev->pers) {
+ /* Cannot change size, so fail
+ * If mddev->level <= 0, then we don't care
+ * about aligning sizes (e.g. linear)
+ */
+ if (mddev->level > 0)
+ return -ENOSPC;
+ } else
+ mddev->dev_sectors = rdev->sectors;
+ }
+
+ /* Verify rdev->desc_nr is unique.
+ * If it is -1, assign a free number, else
+ * check number is not in use
+ */
+ rcu_read_lock();
+ if (rdev->desc_nr < 0) {
+ int choice = 0;
+ if (mddev->pers)
+ choice = mddev->raid_disks;
+ while (md_find_rdev_nr_rcu(mddev, choice))
+ choice++;
+ rdev->desc_nr = choice;
+ } else {
+ if (md_find_rdev_nr_rcu(mddev, rdev->desc_nr)) {
+ rcu_read_unlock();
+ return -EBUSY;
+ }
+ }
+ rcu_read_unlock();
+ if (mddev->max_disks && rdev->desc_nr >= mddev->max_disks) {
+ printk(KERN_WARNING "md: %s: array is limited to %d devices\n",
+ mdname(mddev), mddev->max_disks);
+ return -EBUSY;
+ }
+ bdevname(rdev->bdev,b);
+ while ( (s=strchr(b, '/')) != NULL)
+ *s = '!';
+
+ rdev->mddev = mddev;
+ printk(KERN_INFO "md: bind<%s>\n", b);
+
+ if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b)))
+ goto fail;
+
+ ko = &part_to_dev(rdev->bdev->bd_part)->kobj;
+ if (sysfs_create_link(&rdev->kobj, ko, "block"))
+ /* failure here is OK */;
+ rdev->sysfs_state = sysfs_get_dirent_safe(rdev->kobj.sd, "state");
+
+ list_add_rcu(&rdev->same_set, &mddev->disks);
+ bd_link_disk_holder(rdev->bdev, mddev->gendisk);
+
+ /* May as well allow recovery to be retried once */
+ mddev->recovery_disabled++;
+
+ return 0;
+
+ fail:
+ printk(KERN_WARNING "md: failed to register dev-%s for %s\n",
+ b, mdname(mddev));
+ return err;
+}
+
+static void md_delayed_delete(struct work_struct *ws)
+{
+ struct md_rdev *rdev = container_of(ws, struct md_rdev, del_work);
+ kobject_del(&rdev->kobj);
+ kobject_put(&rdev->kobj);
+}
+
+static void unbind_rdev_from_array(struct md_rdev *rdev)
+{
+ char b[BDEVNAME_SIZE];
+
+ bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk);
+ list_del_rcu(&rdev->same_set);
+ printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b));
+ rdev->mddev = NULL;
+ sysfs_remove_link(&rdev->kobj, "block");
+ sysfs_put(rdev->sysfs_state);
+ rdev->sysfs_state = NULL;
+ rdev->badblocks.count = 0;
+ /* We need to delay this, otherwise we can deadlock when
+ * writing to 'remove' to "dev/state". We also need
+ * to delay it due to rcu usage.
+ */
+ synchronize_rcu();
+ INIT_WORK(&rdev->del_work, md_delayed_delete);
+ kobject_get(&rdev->kobj);
+ queue_work(md_misc_wq, &rdev->del_work);
+}
+
+/*
+ * prevent the device from being mounted, repartitioned or
+ * otherwise reused by a RAID array (or any other kernel
+ * subsystem), by bd_claiming the device.
+ */
+static int lock_rdev(struct md_rdev *rdev, dev_t dev, int shared)
+{
+ int err = 0;
+ struct block_device *bdev;
+ char b[BDEVNAME_SIZE];
+
+ bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL,
+ shared ? (struct md_rdev *)lock_rdev : rdev);
+ if (IS_ERR(bdev)) {
+ printk(KERN_ERR "md: could not open %s.\n",
+ __bdevname(dev, b));
+ return PTR_ERR(bdev);
+ }
+ rdev->bdev = bdev;
+ return err;
+}
+
+static void unlock_rdev(struct md_rdev *rdev)
+{
+ struct block_device *bdev = rdev->bdev;
+ rdev->bdev = NULL;
+ blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
+}
+
+void md_autodetect_dev(dev_t dev);
+
+static void export_rdev(struct md_rdev *rdev)
+{
+ char b[BDEVNAME_SIZE];
+
+ printk(KERN_INFO "md: export_rdev(%s)\n",
+ bdevname(rdev->bdev,b));
+ md_rdev_clear(rdev);
+#ifndef MODULE
+ if (test_bit(AutoDetected, &rdev->flags))
+ md_autodetect_dev(rdev->bdev->bd_dev);
+#endif
+ unlock_rdev(rdev);
+ kobject_put(&rdev->kobj);
+}
+
+void md_kick_rdev_from_array(struct md_rdev *rdev)
+{
+ unbind_rdev_from_array(rdev);
+ export_rdev(rdev);
+}
+EXPORT_SYMBOL_GPL(md_kick_rdev_from_array);
+
+static void export_array(struct mddev *mddev)
+{
+ struct md_rdev *rdev;
+
+ while (!list_empty(&mddev->disks)) {
+ rdev = list_first_entry(&mddev->disks, struct md_rdev,
+ same_set);
+ md_kick_rdev_from_array(rdev);
+ }
+ mddev->raid_disks = 0;
+ mddev->major_version = 0;
+}
+
+static void sync_sbs(struct mddev *mddev, int nospares)
+{
+ /* Update each superblock (in-memory image), but
+ * if we are allowed to, skip spares which already
+ * have the right event counter, or have one earlier
+ * (which would mean they aren't being marked as dirty
+ * with the rest of the array)
+ */
+ struct md_rdev *rdev;
+ rdev_for_each(rdev, mddev) {
+ if (rdev->sb_events == mddev->events ||
+ (nospares &&
+ rdev->raid_disk < 0 &&
+ rdev->sb_events+1 == mddev->events)) {
+ /* Don't update this superblock */
+ rdev->sb_loaded = 2;
+ } else {
+ sync_super(mddev, rdev);
+ rdev->sb_loaded = 1;
+ }
+ }
+}
+
+void md_update_sb(struct mddev *mddev, int force_change)
+{
+ struct md_rdev *rdev;
+ int sync_req;
+ int nospares = 0;
+ int any_badblocks_changed = 0;
+
+ if (mddev->ro) {
+ if (force_change)
+ set_bit(MD_CHANGE_DEVS, &mddev->flags);
+ return;
+ }
+repeat:
+ /* First make sure individual recovery_offsets are correct */
+ rdev_for_each(rdev, mddev) {
+ if (rdev->raid_disk >= 0 &&
+ mddev->delta_disks >= 0 &&
+ !test_bit(In_sync, &rdev->flags) &&
+ mddev->curr_resync_completed > rdev->recovery_offset)
+ rdev->recovery_offset = mddev->curr_resync_completed;
+
+ }
+ if (!mddev->persistent) {
+ clear_bit(MD_CHANGE_CLEAN, &mddev->flags);
+ clear_bit(MD_CHANGE_DEVS, &mddev->flags);
+ if (!mddev->external) {
+ clear_bit(MD_CHANGE_PENDING, &mddev->flags);
+ rdev_for_each(rdev, mddev) {
+ if (rdev->badblocks.changed) {
+ rdev->badblocks.changed = 0;
+ md_ack_all_badblocks(&rdev->badblocks);
+ md_error(mddev, rdev);
+ }
+ clear_bit(Blocked, &rdev->flags);
+ clear_bit(BlockedBadBlocks, &rdev->flags);
+ wake_up(&rdev->blocked_wait);
+ }
+ }
+ wake_up(&mddev->sb_wait);
+ return;
+ }
+
+ spin_lock(&mddev->lock);
+
+ mddev->utime = get_seconds();
+
+ if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags))
+ force_change = 1;
+ if (test_and_clear_bit(MD_CHANGE_CLEAN, &mddev->flags))
+ /* just a clean<-> dirty transition, possibly leave spares alone,
+ * though if events isn't the right even/odd, we will have to do
+ * spares after all
+ */
+ nospares = 1;
+ if (force_change)
+ nospares = 0;
+ if (mddev->degraded)
+ /* If the array is degraded, then skipping spares is both
+ * dangerous and fairly pointless.
+ * Dangerous because a device that was removed from the array
+ * might have a event_count that still looks up-to-date,
+ * so it can be re-added without a resync.
+ * Pointless because if there are any spares to skip,
+ * then a recovery will happen and soon that array won't
+ * be degraded any more and the spare can go back to sleep then.
+ */
+ nospares = 0;
+
+ sync_req = mddev->in_sync;
+
+ /* If this is just a dirty<->clean transition, and the array is clean
+ * and 'events' is odd, we can roll back to the previous clean state */
+ if (nospares
+ && (mddev->in_sync && mddev->recovery_cp == MaxSector)
+ && mddev->can_decrease_events
+ && mddev->events != 1) {
+ mddev->events--;
+ mddev->can_decrease_events = 0;
+ } else {
+ /* otherwise we have to go forward and ... */
+ mddev->events ++;
+ mddev->can_decrease_events = nospares;
+ }
+
+ /*
+ * This 64-bit counter should never wrap.
+ * Either we are in around ~1 trillion A.C., assuming
+ * 1 reboot per second, or we have a bug...
+ */
+ WARN_ON(mddev->events == 0);
+
+ rdev_for_each(rdev, mddev) {
+ if (rdev->badblocks.changed)
+ any_badblocks_changed++;
+ if (test_bit(Faulty, &rdev->flags))
+ set_bit(FaultRecorded, &rdev->flags);
+ }
+
+ sync_sbs(mddev, nospares);
+ spin_unlock(&mddev->lock);
+
+ pr_debug("md: updating %s RAID superblock on device (in sync %d)\n",
+ mdname(mddev), mddev->in_sync);
+
+ bitmap_update_sb(mddev->bitmap);
+ rdev_for_each(rdev, mddev) {
+ char b[BDEVNAME_SIZE];
+
+ if (rdev->sb_loaded != 1)
+ continue; /* no noise on spare devices */
+
+ if (!test_bit(Faulty, &rdev->flags)) {
+ md_super_write(mddev,rdev,
+ rdev->sb_start, rdev->sb_size,
+ rdev->sb_page);
+ pr_debug("md: (write) %s's sb offset: %llu\n",
+ bdevname(rdev->bdev, b),
+ (unsigned long long)rdev->sb_start);
+ rdev->sb_events = mddev->events;
+ if (rdev->badblocks.size) {
+ md_super_write(mddev, rdev,
+ rdev->badblocks.sector,
+ rdev->badblocks.size << 9,
+ rdev->bb_page);
+ rdev->badblocks.size = 0;
+ }
+
+ } else
+ pr_debug("md: %s (skipping faulty)\n",
+ bdevname(rdev->bdev, b));
+
+ if (mddev->level == LEVEL_MULTIPATH)
+ /* only need to write one superblock... */
+ break;
+ }
+ md_super_wait(mddev);
+ /* if there was a failure, MD_CHANGE_DEVS was set, and we re-write super */
+
+ spin_lock(&mddev->lock);
+ if (mddev->in_sync != sync_req ||
+ test_bit(MD_CHANGE_DEVS, &mddev->flags)) {
+ /* have to write it out again */
+ spin_unlock(&mddev->lock);
+ goto repeat;
+ }
+ clear_bit(MD_CHANGE_PENDING, &mddev->flags);
+ spin_unlock(&mddev->lock);
+ wake_up(&mddev->sb_wait);
+ if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
+ sysfs_notify(&mddev->kobj, NULL, "sync_completed");
+
+ rdev_for_each(rdev, mddev) {
+ if (test_and_clear_bit(FaultRecorded, &rdev->flags))
+ clear_bit(Blocked, &rdev->flags);
+
+ if (any_badblocks_changed)
+ md_ack_all_badblocks(&rdev->badblocks);
+ clear_bit(BlockedBadBlocks, &rdev->flags);
+ wake_up(&rdev->blocked_wait);
+ }
+}
+EXPORT_SYMBOL(md_update_sb);
+
+static int add_bound_rdev(struct md_rdev *rdev)
+{
+ struct mddev *mddev = rdev->mddev;
+ int err = 0;
+
+ if (!mddev->pers->hot_remove_disk) {
+ /* If there is hot_add_disk but no hot_remove_disk
+ * then added disks for geometry changes,
+ * and should be added immediately.
+ */
+ super_types[mddev->major_version].
+ validate_super(mddev, rdev);
+ err = mddev->pers->hot_add_disk(mddev, rdev);
+ if (err) {
+ unbind_rdev_from_array(rdev);
+ export_rdev(rdev);
+ return err;
+ }
+ }
+ sysfs_notify_dirent_safe(rdev->sysfs_state);
+
+ set_bit(MD_CHANGE_DEVS, &mddev->flags);
+ if (mddev->degraded)
+ set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
+ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+ md_new_event(mddev);
+ md_wakeup_thread(mddev->thread);
+ return 0;
+}
+
+/* words written to sysfs files may, or may not, be \n terminated.
+ * We want to accept with case. For this we use cmd_match.
+ */
+static int cmd_match(const char *cmd, const char *str)
+{
+ /* See if cmd, written into a sysfs file, matches
+ * str. They must either be the same, or cmd can
+ * have a trailing newline
+ */
+ while (*cmd && *str && *cmd == *str) {
+ cmd++;
+ str++;
+ }
+ if (*cmd == '\n')
+ cmd++;
+ if (*str || *cmd)
+ return 0;
+ return 1;
+}
+
+struct rdev_sysfs_entry {
+ struct attribute attr;
+ ssize_t (*show)(struct md_rdev *, char *);
+ ssize_t (*store)(struct md_rdev *, const char *, size_t);
+};
+
+static ssize_t
+state_show(struct md_rdev *rdev, char *page)
+{
+ char *sep = "";
+ size_t len = 0;
+ unsigned long flags = ACCESS_ONCE(rdev->flags);
+
+ if (test_bit(Faulty, &flags) ||
+ rdev->badblocks.unacked_exist) {
+ len+= sprintf(page+len, "%sfaulty",sep);
+ sep = ",";
+ }
+ if (test_bit(In_sync, &flags)) {
+ len += sprintf(page+len, "%sin_sync",sep);
+ sep = ",";
+ }
+ if (test_bit(WriteMostly, &flags)) {
+ len += sprintf(page+len, "%swrite_mostly",sep);
+ sep = ",";
+ }
+ if (test_bit(Blocked, &flags) ||
+ (rdev->badblocks.unacked_exist
+ && !test_bit(Faulty, &flags))) {
+ len += sprintf(page+len, "%sblocked", sep);
+ sep = ",";
+ }
+ if (!test_bit(Faulty, &flags) &&
+ !test_bit(In_sync, &flags)) {
+ len += sprintf(page+len, "%sspare", sep);
+ sep = ",";
+ }
+ if (test_bit(WriteErrorSeen, &flags)) {
+ len += sprintf(page+len, "%swrite_error", sep);
+ sep = ",";
+ }
+ if (test_bit(WantReplacement, &flags)) {
+ len += sprintf(page+len, "%swant_replacement", sep);
+ sep = ",";
+ }
+ if (test_bit(Replacement, &flags)) {
+ len += sprintf(page+len, "%sreplacement", sep);
+ sep = ",";
+ }
+
+ return len+sprintf(page+len, "\n");
+}
+
+static ssize_t
+state_store(struct md_rdev *rdev, const char *buf, size_t len)
+{
+ /* can write
+ * faulty - simulates an error
+ * remove - disconnects the device
+ * writemostly - sets write_mostly
+ * -writemostly - clears write_mostly
+ * blocked - sets the Blocked flags
+ * -blocked - clears the Blocked and possibly simulates an error
+ * insync - sets Insync providing device isn't active
+ * -insync - clear Insync for a device with a slot assigned,
+ * so that it gets rebuilt based on bitmap
+ * write_error - sets WriteErrorSeen
+ * -write_error - clears WriteErrorSeen
+ */
+ int err = -EINVAL;
+ if (cmd_match(buf, "faulty") && rdev->mddev->pers) {
+ md_error(rdev->mddev, rdev);
+ if (test_bit(Faulty, &rdev->flags))
+ err = 0;
+ else
+ err = -EBUSY;
+ } else if (cmd_match(buf, "remove")) {
+ if (rdev->raid_disk >= 0)
+ err = -EBUSY;
+ else {
+ struct mddev *mddev = rdev->mddev;
+ if (mddev_is_clustered(mddev))
+ md_cluster_ops->remove_disk(mddev, rdev);
+ md_kick_rdev_from_array(rdev);
+ if (mddev_is_clustered(mddev))
+ md_cluster_ops->metadata_update_start(mddev);
+ if (mddev->pers)
+ md_update_sb(mddev, 1);
+ md_new_event(mddev);
+ if (mddev_is_clustered(mddev))
+ md_cluster_ops->metadata_update_finish(mddev);
+ err = 0;
+ }
+ } else if (cmd_match(buf, "writemostly")) {
+ set_bit(WriteMostly, &rdev->flags);
+ err = 0;
+ } else if (cmd_match(buf, "-writemostly")) {
+ clear_bit(WriteMostly, &rdev->flags);
+ err = 0;
+ } else if (cmd_match(buf, "blocked")) {
+ set_bit(Blocked, &rdev->flags);
+ err = 0;
+ } else if (cmd_match(buf, "-blocked")) {
+ if (!test_bit(Faulty, &rdev->flags) &&
+ rdev->badblocks.unacked_exist) {
+ /* metadata handler doesn't understand badblocks,
+ * so we need to fail the device
+ */
+ md_error(rdev->mddev, rdev);
+ }
+ clear_bit(Blocked, &rdev->flags);
+ clear_bit(BlockedBadBlocks, &rdev->flags);
+ wake_up(&rdev->blocked_wait);
+ set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
+ md_wakeup_thread(rdev->mddev->thread);
+
+ err = 0;
+ } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) {
+ set_bit(In_sync, &rdev->flags);
+ err = 0;
+ } else if (cmd_match(buf, "-insync") && rdev->raid_disk >= 0) {
+ if (rdev->mddev->pers == NULL) {
+ clear_bit(In_sync, &rdev->flags);
+ rdev->saved_raid_disk = rdev->raid_disk;
+ rdev->raid_disk = -1;
+ err = 0;
+ }
+ } else if (cmd_match(buf, "write_error")) {
+ set_bit(WriteErrorSeen, &rdev->flags);
+ err = 0;
+ } else if (cmd_match(buf, "-write_error")) {
+ clear_bit(WriteErrorSeen, &rdev->flags);
+ err = 0;
+ } else if (cmd_match(buf, "want_replacement")) {
+ /* Any non-spare device that is not a replacement can
+ * become want_replacement at any time, but we then need to
+ * check if recovery is needed.
+ */
+ if (rdev->raid_disk >= 0 &&
+ !test_bit(Replacement, &rdev->flags))
+ set_bit(WantReplacement, &rdev->flags);
+ set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
+ md_wakeup_thread(rdev->mddev->thread);
+ err = 0;
+ } else if (cmd_match(buf, "-want_replacement")) {
+ /* Clearing 'want_replacement' is always allowed.
+ * Once replacements starts it is too late though.
+ */
+ err = 0;
+ clear_bit(WantReplacement, &rdev->flags);
+ } else if (cmd_match(buf, "replacement")) {
+ /* Can only set a device as a replacement when array has not
+ * yet been started. Once running, replacement is automatic
+ * from spares, or by assigning 'slot'.
+ */
+ if (rdev->mddev->pers)
+ err = -EBUSY;
+ else {
+ set_bit(Replacement, &rdev->flags);
+ err = 0;
+ }
+ } else if (cmd_match(buf, "-replacement")) {
+ /* Similarly, can only clear Replacement before start */
+ if (rdev->mddev->pers)
+ err = -EBUSY;
+ else {
+ clear_bit(Replacement, &rdev->flags);
+ err = 0;
+ }
+ } else if (cmd_match(buf, "re-add")) {
+ if (test_bit(Faulty, &rdev->flags) && (rdev->raid_disk == -1)) {
+ /* clear_bit is performed _after_ all the devices
+ * have their local Faulty bit cleared. If any writes
+ * happen in the meantime in the local node, they
+ * will land in the local bitmap, which will be synced
+ * by this node eventually
+ */
+ if (!mddev_is_clustered(rdev->mddev) ||
+ (err = md_cluster_ops->gather_bitmaps(rdev)) == 0) {
+ clear_bit(Faulty, &rdev->flags);
+ err = add_bound_rdev(rdev);
+ }
+ } else
+ err = -EBUSY;
+ }
+ if (!err)
+ sysfs_notify_dirent_safe(rdev->sysfs_state);
+ return err ? err : len;
+}
+static struct rdev_sysfs_entry rdev_state =
+__ATTR_PREALLOC(state, S_IRUGO|S_IWUSR, state_show, state_store);
+
+static ssize_t
+errors_show(struct md_rdev *rdev, char *page)
+{
+ return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors));
+}
+
+static ssize_t
+errors_store(struct md_rdev *rdev, const char *buf, size_t len)
+{
+ char *e;
+ unsigned long n = simple_strtoul(buf, &e, 10);
+ if (*buf && (*e == 0 || *e == '\n')) {
+ atomic_set(&rdev->corrected_errors, n);
+ return len;
+ }
+ return -EINVAL;
+}
+static struct rdev_sysfs_entry rdev_errors =
+__ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store);
+
+static ssize_t
+slot_show(struct md_rdev *rdev, char *page)
+{
+ if (rdev->raid_disk < 0)
+ return sprintf(page, "none\n");
+ else
+ return sprintf(page, "%d\n", rdev->raid_disk);
+}
+
+static ssize_t
+slot_store(struct md_rdev *rdev, const char *buf, size_t len)
+{
+ char *e;
+ int err;
+ int slot = simple_strtoul(buf, &e, 10);
+ if (strncmp(buf, "none", 4)==0)
+ slot = -1;
+ else if (e==buf || (*e && *e!= '\n'))
+ return -EINVAL;
+ if (rdev->mddev->pers && slot == -1) {
+ /* Setting 'slot' on an active array requires also
+ * updating the 'rd%d' link, and communicating
+ * with the personality with ->hot_*_disk.
+ * For now we only support removing
+ * failed/spare devices. This normally happens automatically,
+ * but not when the metadata is externally managed.
+ */
+ if (rdev->raid_disk == -1)
+ return -EEXIST;
+ /* personality does all needed checks */
+ if (rdev->mddev->pers->hot_remove_disk == NULL)
+ return -EINVAL;
+ clear_bit(Blocked, &rdev->flags);
+ remove_and_add_spares(rdev->mddev, rdev);
+ if (rdev->raid_disk >= 0)
+ return -EBUSY;
+ set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
+ md_wakeup_thread(rdev->mddev->thread);
+ } else if (rdev->mddev->pers) {
+ /* Activating a spare .. or possibly reactivating
+ * if we ever get bitmaps working here.
+ */
+
+ if (rdev->raid_disk != -1)
+ return -EBUSY;
+
+ if (test_bit(MD_RECOVERY_RUNNING, &rdev->mddev->recovery))
+ return -EBUSY;
+
+ if (rdev->mddev->pers->hot_add_disk == NULL)
+ return -EINVAL;
+
+ if (slot >= rdev->mddev->raid_disks &&
+ slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks)
+ return -ENOSPC;
+
+ rdev->raid_disk = slot;
+ if (test_bit(In_sync, &rdev->flags))
+ rdev->saved_raid_disk = slot;
+ else
+ rdev->saved_raid_disk = -1;
+ clear_bit(In_sync, &rdev->flags);
+ clear_bit(Bitmap_sync, &rdev->flags);
+ err = rdev->mddev->pers->
+ hot_add_disk(rdev->mddev, rdev);
+ if (err) {
+ rdev->raid_disk = -1;
+ return err;
+ } else
+ sysfs_notify_dirent_safe(rdev->sysfs_state);
+ if (sysfs_link_rdev(rdev->mddev, rdev))
+ /* failure here is OK */;
+ /* don't wakeup anyone, leave that to userspace. */
+ } else {
+ if (slot >= rdev->mddev->raid_disks &&
+ slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks)
+ return -ENOSPC;
+ rdev->raid_disk = slot;
+ /* assume it is working */
+ clear_bit(Faulty, &rdev->flags);
+ clear_bit(WriteMostly, &rdev->flags);
+ set_bit(In_sync, &rdev->flags);
+ sysfs_notify_dirent_safe(rdev->sysfs_state);
+ }
+ return len;
+}
+
+static struct rdev_sysfs_entry rdev_slot =
+__ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store);
+
+static ssize_t
+offset_show(struct md_rdev *rdev, char *page)
+{
+ return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset);
+}
+
+static ssize_t
+offset_store(struct md_rdev *rdev, const char *buf, size_t len)
+{
+ unsigned long long offset;
+ if (kstrtoull(buf, 10, &offset) < 0)
+ return -EINVAL;
+ if (rdev->mddev->pers && rdev->raid_disk >= 0)
+ return -EBUSY;
+ if (rdev->sectors && rdev->mddev->external)
+ /* Must set offset before size, so overlap checks
+ * can be sane */
+ return -EBUSY;
+ rdev->data_offset = offset;
+ rdev->new_data_offset = offset;
+ return len;
+}
+
+static struct rdev_sysfs_entry rdev_offset =
+__ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store);
+
+static ssize_t new_offset_show(struct md_rdev *rdev, char *page)
+{
+ return sprintf(page, "%llu\n",
+ (unsigned long long)rdev->new_data_offset);
+}
+
+static ssize_t new_offset_store(struct md_rdev *rdev,
+ const char *buf, size_t len)
+{
+ unsigned long long new_offset;
+ struct mddev *mddev = rdev->mddev;
+
+ if (kstrtoull(buf, 10, &new_offset) < 0)
+ return -EINVAL;
+
+ if (mddev->sync_thread ||
+ test_bit(MD_RECOVERY_RUNNING,&mddev->recovery))
+ return -EBUSY;
+ if (new_offset == rdev->data_offset)
+ /* reset is always permitted */
+ ;
+ else if (new_offset > rdev->data_offset) {
+ /* must not push array size beyond rdev_sectors */
+ if (new_offset - rdev->data_offset
+ + mddev->dev_sectors > rdev->sectors)
+ return -E2BIG;
+ }
+ /* Metadata worries about other space details. */
+
+ /* decreasing the offset is inconsistent with a backwards
+ * reshape.
+ */
+ if (new_offset < rdev->data_offset &&
+ mddev->reshape_backwards)
+ return -EINVAL;
+ /* Increasing offset is inconsistent with forwards
+ * reshape. reshape_direction should be set to
+ * 'backwards' first.
+ */
+ if (new_offset > rdev->data_offset &&
+ !mddev->reshape_backwards)
+ return -EINVAL;
+
+ if (mddev->pers && mddev->persistent &&
+ !super_types[mddev->major_version]
+ .allow_new_offset(rdev, new_offset))
+ return -E2BIG;
+ rdev->new_data_offset = new_offset;
+ if (new_offset > rdev->data_offset)
+ mddev->reshape_backwards = 1;
+ else if (new_offset < rdev->data_offset)
+ mddev->reshape_backwards = 0;
+
+ return len;
+}
+static struct rdev_sysfs_entry rdev_new_offset =
+__ATTR(new_offset, S_IRUGO|S_IWUSR, new_offset_show, new_offset_store);
+
+static ssize_t
+rdev_size_show(struct md_rdev *rdev, char *page)
+{
+ return sprintf(page, "%llu\n", (unsigned long long)rdev->sectors / 2);
+}
+
+static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2)
+{
+ /* check if two start/length pairs overlap */
+ if (s1+l1 <= s2)
+ return 0;
+ if (s2+l2 <= s1)
+ return 0;
+ return 1;
+}
+
+static int strict_blocks_to_sectors(const char *buf, sector_t *sectors)
+{
+ unsigned long long blocks;
+ sector_t new;
+
+ if (kstrtoull(buf, 10, &blocks) < 0)
+ return -EINVAL;
+
+ if (blocks & 1ULL << (8 * sizeof(blocks) - 1))
+ return -EINVAL; /* sector conversion overflow */
+
+ new = blocks * 2;
+ if (new != blocks * 2)
+ return -EINVAL; /* unsigned long long to sector_t overflow */
+
+ *sectors = new;
+ return 0;
+}
+
+static ssize_t
+rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len)
+{
+ struct mddev *my_mddev = rdev->mddev;
+ sector_t oldsectors = rdev->sectors;
+ sector_t sectors;
+
+ if (strict_blocks_to_sectors(buf, &sectors) < 0)
+ return -EINVAL;
+ if (rdev->data_offset != rdev->new_data_offset)
+ return -EINVAL; /* too confusing */
+ if (my_mddev->pers && rdev->raid_disk >= 0) {
+ if (my_mddev->persistent) {
+ sectors = super_types[my_mddev->major_version].
+ rdev_size_change(rdev, sectors);
+ if (!sectors)
+ return -EBUSY;
+ } else if (!sectors)
+ sectors = (i_size_read(rdev->bdev->bd_inode) >> 9) -
+ rdev->data_offset;
+ if (!my_mddev->pers->resize)
+ /* Cannot change size for RAID0 or Linear etc */
+ return -EINVAL;
+ }
+ if (sectors < my_mddev->dev_sectors)
+ return -EINVAL; /* component must fit device */
+
+ rdev->sectors = sectors;
+ if (sectors > oldsectors && my_mddev->external) {
+ /* Need to check that all other rdevs with the same
+ * ->bdev do not overlap. 'rcu' is sufficient to walk
+ * the rdev lists safely.
+ * This check does not provide a hard guarantee, it
+ * just helps avoid dangerous mistakes.
+ */
+ struct mddev *mddev;
+ int overlap = 0;
+ struct list_head *tmp;
+
+ rcu_read_lock();
+ for_each_mddev(mddev, tmp) {
+ struct md_rdev *rdev2;
+
+ rdev_for_each(rdev2, mddev)
+ if (rdev->bdev == rdev2->bdev &&
+ rdev != rdev2 &&
+ overlaps(rdev->data_offset, rdev->sectors,
+ rdev2->data_offset,
+ rdev2->sectors)) {
+ overlap = 1;
+ break;
+ }
+ if (overlap) {
+ mddev_put(mddev);
+ break;
+ }
+ }
+ rcu_read_unlock();
+ if (overlap) {
+ /* Someone else could have slipped in a size
+ * change here, but doing so is just silly.
+ * We put oldsectors back because we *know* it is
+ * safe, and trust userspace not to race with
+ * itself
+ */
+ rdev->sectors = oldsectors;
+ return -EBUSY;
+ }
+ }
+ return len;
+}
+
+static struct rdev_sysfs_entry rdev_size =
+__ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store);
+
+static ssize_t recovery_start_show(struct md_rdev *rdev, char *page)
+{
+ unsigned long long recovery_start = rdev->recovery_offset;
+
+ if (test_bit(In_sync, &rdev->flags) ||
+ recovery_start == MaxSector)
+ return sprintf(page, "none\n");
+
+ return sprintf(page, "%llu\n", recovery_start);
+}
+
+static ssize_t recovery_start_store(struct md_rdev *rdev, const char *buf, size_t len)
+{
+ unsigned long long recovery_start;
+
+ if (cmd_match(buf, "none"))
+ recovery_start = MaxSector;
+ else if (kstrtoull(buf, 10, &recovery_start))
+ return -EINVAL;
+
+ if (rdev->mddev->pers &&
+ rdev->raid_disk >= 0)
+ return -EBUSY;
+
+ rdev->recovery_offset = recovery_start;
+ if (recovery_start == MaxSector)
+ set_bit(In_sync, &rdev->flags);
+ else
+ clear_bit(In_sync, &rdev->flags);
+ return len;
+}
+
+static struct rdev_sysfs_entry rdev_recovery_start =
+__ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store);
+
+static ssize_t
+badblocks_show(struct badblocks *bb, char *page, int unack);
+static ssize_t
+badblocks_store(struct badblocks *bb, const char *page, size_t len, int unack);
+
+static ssize_t bb_show(struct md_rdev *rdev, char *page)
+{
+ return badblocks_show(&rdev->badblocks, page, 0);
+}
+static ssize_t bb_store(struct md_rdev *rdev, const char *page, size_t len)
+{
+ int rv = badblocks_store(&rdev->badblocks, page, len, 0);
+ /* Maybe that ack was all we needed */
+ if (test_and_clear_bit(BlockedBadBlocks, &rdev->flags))
+ wake_up(&rdev->blocked_wait);
+ return rv;
+}
+static struct rdev_sysfs_entry rdev_bad_blocks =
+__ATTR(bad_blocks, S_IRUGO|S_IWUSR, bb_show, bb_store);
+
+static ssize_t ubb_show(struct md_rdev *rdev, char *page)
+{
+ return badblocks_show(&rdev->badblocks, page, 1);
+}
+static ssize_t ubb_store(struct md_rdev *rdev, const char *page, size_t len)
+{
+ return badblocks_store(&rdev->badblocks, page, len, 1);
+}
+static struct rdev_sysfs_entry rdev_unack_bad_blocks =
+__ATTR(unacknowledged_bad_blocks, S_IRUGO|S_IWUSR, ubb_show, ubb_store);
+
+static struct attribute *rdev_default_attrs[] = {
+ &rdev_state.attr,
+ &rdev_errors.attr,
+ &rdev_slot.attr,
+ &rdev_offset.attr,
+ &rdev_new_offset.attr,
+ &rdev_size.attr,
+ &rdev_recovery_start.attr,
+ &rdev_bad_blocks.attr,
+ &rdev_unack_bad_blocks.attr,
+ NULL,
+};
+static ssize_t
+rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
+{
+ struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
+ struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj);
+
+ if (!entry->show)
+ return -EIO;
+ if (!rdev->mddev)
+ return -EBUSY;
+ return entry->show(rdev, page);
+}
+
+static ssize_t
+rdev_attr_store(struct kobject *kobj, struct attribute *attr,
+ const char *page, size_t length)
+{
+ struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
+ struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj);
+ ssize_t rv;
+ struct mddev *mddev = rdev->mddev;
+
+ if (!entry->store)
+ return -EIO;
+ if (!capable(CAP_SYS_ADMIN))
+ return -EACCES;
+ rv = mddev ? mddev_lock(mddev): -EBUSY;
+ if (!rv) {
+ if (rdev->mddev == NULL)
+ rv = -EBUSY;
+ else
+ rv = entry->store(rdev, page, length);
+ mddev_unlock(mddev);
+ }
+ return rv;
+}
+
+static void rdev_free(struct kobject *ko)
+{
+ struct md_rdev *rdev = container_of(ko, struct md_rdev, kobj);
+ kfree(rdev);
+}
+static const struct sysfs_ops rdev_sysfs_ops = {
+ .show = rdev_attr_show,
+ .store = rdev_attr_store,
+};
+static struct kobj_type rdev_ktype = {
+ .release = rdev_free,
+ .sysfs_ops = &rdev_sysfs_ops,
+ .default_attrs = rdev_default_attrs,
+};
+
+int md_rdev_init(struct md_rdev *rdev)
+{
+ rdev->desc_nr = -1;
+ rdev->saved_raid_disk = -1;
+ rdev->raid_disk = -1;
+ rdev->flags = 0;
+ rdev->data_offset = 0;
+ rdev->new_data_offset = 0;
+ rdev->sb_events = 0;
+ rdev->last_read_error.tv_sec = 0;
+ rdev->last_read_error.tv_nsec = 0;
+ rdev->sb_loaded = 0;
+ rdev->bb_page = NULL;
+ atomic_set(&rdev->nr_pending, 0);
+ atomic_set(&rdev->read_errors, 0);
+ atomic_set(&rdev->corrected_errors, 0);
+
+ INIT_LIST_HEAD(&rdev->same_set);
+ init_waitqueue_head(&rdev->blocked_wait);
+
+ /* Add space to store bad block list.
+ * This reserves the space even on arrays where it cannot
+ * be used - I wonder if that matters
+ */
+ rdev->badblocks.count = 0;
+ rdev->badblocks.shift = -1; /* disabled until explicitly enabled */
+ rdev->badblocks.page = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ seqlock_init(&rdev->badblocks.lock);
+ if (rdev->badblocks.page == NULL)
+ return -ENOMEM;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(md_rdev_init);
+/*
+ * Import a device. If 'super_format' >= 0, then sanity check the superblock
+ *
+ * mark the device faulty if:
+ *
+ * - the device is nonexistent (zero size)
+ * - the device has no valid superblock
+ *
+ * a faulty rdev _never_ has rdev->sb set.
+ */
+static struct md_rdev *md_import_device(dev_t newdev, int super_format, int super_minor)
+{
+ char b[BDEVNAME_SIZE];
+ int err;
+ struct md_rdev *rdev;
+ sector_t size;
+
+ rdev = kzalloc(sizeof(*rdev), GFP_KERNEL);
+ if (!rdev) {
+ printk(KERN_ERR "md: could not alloc mem for new device!\n");
+ return ERR_PTR(-ENOMEM);
+ }
+
+ err = md_rdev_init(rdev);
+ if (err)
+ goto abort_free;
+ err = alloc_disk_sb(rdev);
+ if (err)
+ goto abort_free;
+
+ err = lock_rdev(rdev, newdev, super_format == -2);
+ if (err)
+ goto abort_free;
+
+ kobject_init(&rdev->kobj, &rdev_ktype);
+
+ size = i_size_read(rdev->bdev->bd_inode) >> BLOCK_SIZE_BITS;
+ if (!size) {
+ printk(KERN_WARNING
+ "md: %s has zero or unknown size, marking faulty!\n",
+ bdevname(rdev->bdev,b));
+ err = -EINVAL;
+ goto abort_free;
+ }
+
+ if (super_format >= 0) {
+ err = super_types[super_format].
+ load_super(rdev, NULL, super_minor);
+ if (err == -EINVAL) {
+ printk(KERN_WARNING
+ "md: %s does not have a valid v%d.%d "
+ "superblock, not importing!\n",
+ bdevname(rdev->bdev,b),
+ super_format, super_minor);
+ goto abort_free;
+ }
+ if (err < 0) {
+ printk(KERN_WARNING
+ "md: could not read %s's sb, not importing!\n",
+ bdevname(rdev->bdev,b));
+ goto abort_free;
+ }
+ }
+
+ return rdev;
+
+abort_free:
+ if (rdev->bdev)
+ unlock_rdev(rdev);
+ md_rdev_clear(rdev);
+ kfree(rdev);
+ return ERR_PTR(err);
+}
+
+/*
+ * Check a full RAID array for plausibility
+ */
+
+static void analyze_sbs(struct mddev *mddev)
+{
+ int i;
+ struct md_rdev *rdev, *freshest, *tmp;
+ char b[BDEVNAME_SIZE];
+
+ freshest = NULL;
+ rdev_for_each_safe(rdev, tmp, mddev)
+ switch (super_types[mddev->major_version].
+ load_super(rdev, freshest, mddev->minor_version)) {
+ case 1:
+ freshest = rdev;
+ break;
+ case 0:
+ break;
+ default:
+ printk( KERN_ERR \
+ "md: fatal superblock inconsistency in %s"
+ " -- removing from array\n",
+ bdevname(rdev->bdev,b));
+ md_kick_rdev_from_array(rdev);
+ }
+
+ super_types[mddev->major_version].
+ validate_super(mddev, freshest);
+
+ i = 0;
+ rdev_for_each_safe(rdev, tmp, mddev) {
+ if (mddev->max_disks &&
+ (rdev->desc_nr >= mddev->max_disks ||
+ i > mddev->max_disks)) {
+ printk(KERN_WARNING
+ "md: %s: %s: only %d devices permitted\n",
+ mdname(mddev), bdevname(rdev->bdev, b),
+ mddev->max_disks);
+ md_kick_rdev_from_array(rdev);
+ continue;
+ }
+ if (rdev != freshest) {
+ if (super_types[mddev->major_version].
+ validate_super(mddev, rdev)) {
+ printk(KERN_WARNING "md: kicking non-fresh %s"
+ " from array!\n",
+ bdevname(rdev->bdev,b));
+ md_kick_rdev_from_array(rdev);
+ continue;
+ }
+ /* No device should have a Candidate flag
+ * when reading devices
+ */
+ if (test_bit(Candidate, &rdev->flags)) {
+ pr_info("md: kicking Cluster Candidate %s from array!\n",
+ bdevname(rdev->bdev, b));
+ md_kick_rdev_from_array(rdev);
+ }
+ }
+ if (mddev->level == LEVEL_MULTIPATH) {
+ rdev->desc_nr = i++;
+ rdev->raid_disk = rdev->desc_nr;
+ set_bit(In_sync, &rdev->flags);
+ } else if (rdev->raid_disk >= (mddev->raid_disks - min(0, mddev->delta_disks))) {
+ rdev->raid_disk = -1;
+ clear_bit(In_sync, &rdev->flags);
+ }
+ }
+}
+
+/* Read a fixed-point number.
+ * Numbers in sysfs attributes should be in "standard" units where
+ * possible, so time should be in seconds.
+ * However we internally use a a much smaller unit such as
+ * milliseconds or jiffies.
+ * This function takes a decimal number with a possible fractional
+ * component, and produces an integer which is the result of
+ * multiplying that number by 10^'scale'.
+ * all without any floating-point arithmetic.
+ */
+int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale)
+{
+ unsigned long result = 0;
+ long decimals = -1;
+ while (isdigit(*cp) || (*cp == '.' && decimals < 0)) {
+ if (*cp == '.')
+ decimals = 0;
+ else if (decimals < scale) {
+ unsigned int value;
+ value = *cp - '0';
+ result = result * 10 + value;
+ if (decimals >= 0)
+ decimals++;
+ }
+ cp++;
+ }
+ if (*cp == '\n')
+ cp++;
+ if (*cp)
+ return -EINVAL;
+ if (decimals < 0)
+ decimals = 0;
+ while (decimals < scale) {
+ result *= 10;
+ decimals ++;
+ }
+ *res = result;
+ return 0;
+}
+
+static void md_safemode_timeout(unsigned long data);
+
+static ssize_t
+safe_delay_show(struct mddev *mddev, char *page)
+{
+ int msec = (mddev->safemode_delay*1000)/HZ;
+ return sprintf(page, "%d.%03d\n", msec/1000, msec%1000);
+}
+static ssize_t
+safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len)
+{
+ unsigned long msec;
+
+ if (strict_strtoul_scaled(cbuf, &msec, 3) < 0)
+ return -EINVAL;
+ if (msec == 0)
+ mddev->safemode_delay = 0;
+ else {
+ unsigned long old_delay = mddev->safemode_delay;
+ unsigned long new_delay = (msec*HZ)/1000;
+
+ if (new_delay == 0)
+ new_delay = 1;
+ mddev->safemode_delay = new_delay;
+ if (new_delay < old_delay || old_delay == 0)
+ mod_timer(&mddev->safemode_timer, jiffies+1);
+ }
+ return len;
+}
+static struct md_sysfs_entry md_safe_delay =
+__ATTR(safe_mode_delay, S_IRUGO|S_IWUSR,safe_delay_show, safe_delay_store);
+
+static ssize_t
+level_show(struct mddev *mddev, char *page)
+{
+ struct md_personality *p;
+ int ret;
+ spin_lock(&mddev->lock);
+ p = mddev->pers;
+ if (p)
+ ret = sprintf(page, "%s\n", p->name);
+ else if (mddev->clevel[0])
+ ret = sprintf(page, "%s\n", mddev->clevel);
+ else if (mddev->level != LEVEL_NONE)
+ ret = sprintf(page, "%d\n", mddev->level);
+ else
+ ret = 0;
+ spin_unlock(&mddev->lock);
+ return ret;
+}
+
+static ssize_t
+level_store(struct mddev *mddev, const char *buf, size_t len)
+{
+ char clevel[16];
+ ssize_t rv;
+ size_t slen = len;
+ struct md_personality *pers, *oldpers;
+ long level;
+ void *priv, *oldpriv;
+ struct md_rdev *rdev;
+
+ if (slen == 0 || slen >= sizeof(clevel))
+ return -EINVAL;
+
+ rv = mddev_lock(mddev);
+ if (rv)
+ return rv;
+
+ if (mddev->pers == NULL) {
+ strncpy(mddev->clevel, buf, slen);
+ if (mddev->clevel[slen-1] == '\n')
+ slen--;
+ mddev->clevel[slen] = 0;
+ mddev->level = LEVEL_NONE;
+ rv = len;
+ goto out_unlock;
+ }
+ rv = -EROFS;
+ if (mddev->ro)
+ goto out_unlock;
+
+ /* request to change the personality. Need to ensure:
+ * - array is not engaged in resync/recovery/reshape
+ * - old personality can be suspended
+ * - new personality will access other array.
+ */
+
+ rv = -EBUSY;
+ if (mddev->sync_thread ||
+ test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
+ mddev->reshape_position != MaxSector ||
+ mddev->sysfs_active)
+ goto out_unlock;
+
+ rv = -EINVAL;
+ if (!mddev->pers->quiesce) {
+ printk(KERN_WARNING "md: %s: %s does not support online personality change\n",
+ mdname(mddev), mddev->pers->name);
+ goto out_unlock;
+ }
+
+ /* Now find the new personality */
+ strncpy(clevel, buf, slen);
+ if (clevel[slen-1] == '\n')
+ slen--;
+ clevel[slen] = 0;
+ if (kstrtol(clevel, 10, &level))
+ level = LEVEL_NONE;
+
+ if (request_module("md-%s", clevel) != 0)
+ request_module("md-level-%s", clevel);
+ spin_lock(&pers_lock);
+ pers = find_pers(level, clevel);
+ if (!pers || !try_module_get(pers->owner)) {
+ spin_unlock(&pers_lock);
+ printk(KERN_WARNING "md: personality %s not loaded\n", clevel);
+ rv = -EINVAL;
+ goto out_unlock;
+ }
+ spin_unlock(&pers_lock);
+
+ if (pers == mddev->pers) {
+ /* Nothing to do! */
+ module_put(pers->owner);
+ rv = len;
+ goto out_unlock;
+ }
+ if (!pers->takeover) {
+ module_put(pers->owner);
+ printk(KERN_WARNING "md: %s: %s does not support personality takeover\n",
+ mdname(mddev), clevel);
+ rv = -EINVAL;
+ goto out_unlock;
+ }
+
+ rdev_for_each(rdev, mddev)
+ rdev->new_raid_disk = rdev->raid_disk;
+
+ /* ->takeover must set new_* and/or delta_disks
+ * if it succeeds, and may set them when it fails.
+ */
+ priv = pers->takeover(mddev);
+ if (IS_ERR(priv)) {
+ mddev->new_level = mddev->level;
+ mddev->new_layout = mddev->layout;
+ mddev->new_chunk_sectors = mddev->chunk_sectors;
+ mddev->raid_disks -= mddev->delta_disks;
+ mddev->delta_disks = 0;
+ mddev->reshape_backwards = 0;
+ module_put(pers->owner);
+ printk(KERN_WARNING "md: %s: %s would not accept array\n",
+ mdname(mddev), clevel);
+ rv = PTR_ERR(priv);
+ goto out_unlock;
+ }
+
+ /* Looks like we have a winner */
+ mddev_suspend(mddev);
+ mddev_detach(mddev);
+
+ spin_lock(&mddev->lock);
+ oldpers = mddev->pers;
+ oldpriv = mddev->private;
+ mddev->pers = pers;
+ mddev->private = priv;
+ strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
+ mddev->level = mddev->new_level;
+ mddev->layout = mddev->new_layout;
+ mddev->chunk_sectors = mddev->new_chunk_sectors;
+ mddev->delta_disks = 0;
+ mddev->reshape_backwards = 0;
+ mddev->degraded = 0;
+ spin_unlock(&mddev->lock);
+
+ if (oldpers->sync_request == NULL &&
+ mddev->external) {
+ /* We are converting from a no-redundancy array
+ * to a redundancy array and metadata is managed
+ * externally so we need to be sure that writes
+ * won't block due to a need to transition
+ * clean->dirty
+ * until external management is started.
+ */
+ mddev->in_sync = 0;
+ mddev->safemode_delay = 0;
+ mddev->safemode = 0;
+ }
+
+ oldpers->free(mddev, oldpriv);
+
+ if (oldpers->sync_request == NULL &&
+ pers->sync_request != NULL) {
+ /* need to add the md_redundancy_group */
+ if (sysfs_create_group(&mddev->kobj, &md_redundancy_group))
+ printk(KERN_WARNING
+ "md: cannot register extra attributes for %s\n",
+ mdname(mddev));
+ mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action");
+ }
+ if (oldpers->sync_request != NULL &&
+ pers->sync_request == NULL) {
+ /* need to remove the md_redundancy_group */
+ if (mddev->to_remove == NULL)
+ mddev->to_remove = &md_redundancy_group;
+ }
+
+ rdev_for_each(rdev, mddev) {
+ if (rdev->raid_disk < 0)
+ continue;
+ if (rdev->new_raid_disk >= mddev->raid_disks)
+ rdev->new_raid_disk = -1;
+ if (rdev->new_raid_disk == rdev->raid_disk)
+ continue;
+ sysfs_unlink_rdev(mddev, rdev);
+ }
+ rdev_for_each(rdev, mddev) {
+ if (rdev->raid_disk < 0)
+ continue;
+ if (rdev->new_raid_disk == rdev->raid_disk)
+ continue;
+ rdev->raid_disk = rdev->new_raid_disk;
+ if (rdev->raid_disk < 0)
+ clear_bit(In_sync, &rdev->flags);
+ else {
+ if (sysfs_link_rdev(mddev, rdev))
+ printk(KERN_WARNING "md: cannot register rd%d"
+ " for %s after level change\n",
+ rdev->raid_disk, mdname(mddev));
+ }
+ }
+
+ if (pers->sync_request == NULL) {
+ /* this is now an array without redundancy, so
+ * it must always be in_sync
+ */
+ mddev->in_sync = 1;
+ del_timer_sync(&mddev->safemode_timer);
+ }
+ blk_set_stacking_limits(&mddev->queue->limits);
+ pers->run(mddev);
+ set_bit(MD_CHANGE_DEVS, &mddev->flags);
+ mddev_resume(mddev);
+ if (!mddev->thread)
+ md_update_sb(mddev, 1);
+ sysfs_notify(&mddev->kobj, NULL, "level");
+ md_new_event(mddev);
+ rv = len;
+out_unlock:
+ mddev_unlock(mddev);
+ return rv;
+}
+
+static struct md_sysfs_entry md_level =
+__ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store);
+
+static ssize_t
+layout_show(struct mddev *mddev, char *page)
+{
+ /* just a number, not meaningful for all levels */
+ if (mddev->reshape_position != MaxSector &&
+ mddev->layout != mddev->new_layout)
+ return sprintf(page, "%d (%d)\n",
+ mddev->new_layout, mddev->layout);
+ return sprintf(page, "%d\n", mddev->layout);
+}
+
+static ssize_t
+layout_store(struct mddev *mddev, const char *buf, size_t len)
+{
+ char *e;
+ unsigned long n = simple_strtoul(buf, &e, 10);
+ int err;
+
+ if (!*buf || (*e && *e != '\n'))
+ return -EINVAL;
+ err = mddev_lock(mddev);
+ if (err)
+ return err;
+
+ if (mddev->pers) {
+ if (mddev->pers->check_reshape == NULL)
+ err = -EBUSY;
+ else if (mddev->ro)
+ err = -EROFS;
+ else {
+ mddev->new_layout = n;
+ err = mddev->pers->check_reshape(mddev);
+ if (err)
+ mddev->new_layout = mddev->layout;
+ }
+ } else {
+ mddev->new_layout = n;
+ if (mddev->reshape_position == MaxSector)
+ mddev->layout = n;
+ }
+ mddev_unlock(mddev);
+ return err ?: len;
+}
+static struct md_sysfs_entry md_layout =
+__ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store);
+
+static ssize_t
+raid_disks_show(struct mddev *mddev, char *page)
+{
+ if (mddev->raid_disks == 0)
+ return 0;
+ if (mddev->reshape_position != MaxSector &&
+ mddev->delta_disks != 0)
+ return sprintf(page, "%d (%d)\n", mddev->raid_disks,
+ mddev->raid_disks - mddev->delta_disks);
+ return sprintf(page, "%d\n", mddev->raid_disks);
+}
+
+static int update_raid_disks(struct mddev *mddev, int raid_disks);
+
+static ssize_t
+raid_disks_store(struct mddev *mddev, const char *buf, size_t len)
+{
+ char *e;
+ int err;
+ unsigned long n = simple_strtoul(buf, &e, 10);
+
+ if (!*buf || (*e && *e != '\n'))
+ return -EINVAL;
+
+ err = mddev_lock(mddev);
+ if (err)
+ return err;
+ if (mddev->pers)
+ err = update_raid_disks(mddev, n);
+ else if (mddev->reshape_position != MaxSector) {
+ struct md_rdev *rdev;
+ int olddisks = mddev->raid_disks - mddev->delta_disks;
+
+ err = -EINVAL;
+ rdev_for_each(rdev, mddev) {
+ if (olddisks < n &&
+ rdev->data_offset < rdev->new_data_offset)
+ goto out_unlock;
+ if (olddisks > n &&
+ rdev->data_offset > rdev->new_data_offset)
+ goto out_unlock;
+ }
+ err = 0;
+ mddev->delta_disks = n - olddisks;
+ mddev->raid_disks = n;
+ mddev->reshape_backwards = (mddev->delta_disks < 0);
+ } else
+ mddev->raid_disks = n;
+out_unlock:
+ mddev_unlock(mddev);
+ return err ? err : len;
+}
+static struct md_sysfs_entry md_raid_disks =
+__ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store);
+
+static ssize_t
+chunk_size_show(struct mddev *mddev, char *page)
+{
+ if (mddev->reshape_position != MaxSector &&
+ mddev->chunk_sectors != mddev->new_chunk_sectors)
+ return sprintf(page, "%d (%d)\n",
+ mddev->new_chunk_sectors << 9,
+ mddev->chunk_sectors << 9);
+ return sprintf(page, "%d\n", mddev->chunk_sectors << 9);
+}
+
+static ssize_t
+chunk_size_store(struct mddev *mddev, const char *buf, size_t len)
+{
+ int err;
+ char *e;
+ unsigned long n = simple_strtoul(buf, &e, 10);
+
+ if (!*buf || (*e && *e != '\n'))
+ return -EINVAL;
+
+ err = mddev_lock(mddev);
+ if (err)
+ return err;
+ if (mddev->pers) {
+ if (mddev->pers->check_reshape == NULL)
+ err = -EBUSY;
+ else if (mddev->ro)
+ err = -EROFS;
+ else {
+ mddev->new_chunk_sectors = n >> 9;
+ err = mddev->pers->check_reshape(mddev);
+ if (err)
+ mddev->new_chunk_sectors = mddev->chunk_sectors;
+ }
+ } else {
+ mddev->new_chunk_sectors = n >> 9;
+ if (mddev->reshape_position == MaxSector)
+ mddev->chunk_sectors = n >> 9;
+ }
+ mddev_unlock(mddev);
+ return err ?: len;
+}
+static struct md_sysfs_entry md_chunk_size =
+__ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store);
+
+static ssize_t
+resync_start_show(struct mddev *mddev, char *page)
+{
+ if (mddev->recovery_cp == MaxSector)
+ return sprintf(page, "none\n");
+ return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp);
+}
+
+static ssize_t
+resync_start_store(struct mddev *mddev, const char *buf, size_t len)
+{
+ int err;
+ char *e;
+ unsigned long long n = simple_strtoull(buf, &e, 10);
+
+ err = mddev_lock(mddev);
+ if (err)
+ return err;
+ if (mddev->pers && !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
+ err = -EBUSY;
+ else if (cmd_match(buf, "none"))
+ n = MaxSector;
+ else if (!*buf || (*e && *e != '\n'))
+ err = -EINVAL;
+
+ if (!err) {
+ mddev->recovery_cp = n;
+ if (mddev->pers)
+ set_bit(MD_CHANGE_CLEAN, &mddev->flags);
+ }
+ mddev_unlock(mddev);
+ return err ?: len;
+}
+static struct md_sysfs_entry md_resync_start =
+__ATTR_PREALLOC(resync_start, S_IRUGO|S_IWUSR,
+ resync_start_show, resync_start_store);
+
+/*
+ * The array state can be:
+ *
+ * clear
+ * No devices, no size, no level
+ * Equivalent to STOP_ARRAY ioctl
+ * inactive
+ * May have some settings, but array is not active
+ * all IO results in error
+ * When written, doesn't tear down array, but just stops it
+ * suspended (not supported yet)
+ * All IO requests will block. The array can be reconfigured.
+ * Writing this, if accepted, will block until array is quiescent
+ * readonly
+ * no resync can happen. no superblocks get written.
+ * write requests fail
+ * read-auto
+ * like readonly, but behaves like 'clean' on a write request.
+ *
+ * clean - no pending writes, but otherwise active.
+ * When written to inactive array, starts without resync
+ * If a write request arrives then
+ * if metadata is known, mark 'dirty' and switch to 'active'.
+ * if not known, block and switch to write-pending
+ * If written to an active array that has pending writes, then fails.
+ * active
+ * fully active: IO and resync can be happening.
+ * When written to inactive array, starts with resync
+ *
+ * write-pending
+ * clean, but writes are blocked waiting for 'active' to be written.
+ *
+ * active-idle
+ * like active, but no writes have been seen for a while (100msec).
+ *
+ */
+enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active,
+ write_pending, active_idle, bad_word};
+static char *array_states[] = {
+ "clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active",
+ "write-pending", "active-idle", NULL };
+
+static int match_word(const char *word, char **list)
+{
+ int n;
+ for (n=0; list[n]; n++)
+ if (cmd_match(word, list[n]))
+ break;
+ return n;
+}
+
+static ssize_t
+array_state_show(struct mddev *mddev, char *page)
+{
+ enum array_state st = inactive;
+
+ if (mddev->pers)
+ switch(mddev->ro) {
+ case 1:
+ st = readonly;
+ break;
+ case 2:
+ st = read_auto;
+ break;
+ case 0:
+ if (mddev->in_sync)
+ st = clean;
+ else if (test_bit(MD_CHANGE_PENDING, &mddev->flags))
+ st = write_pending;
+ else if (mddev->safemode)
+ st = active_idle;
+ else
+ st = active;
+ }
+ else {
+ if (list_empty(&mddev->disks) &&
+ mddev->raid_disks == 0 &&
+ mddev->dev_sectors == 0)
+ st = clear;
+ else
+ st = inactive;
+ }
+ return sprintf(page, "%s\n", array_states[st]);
+}
+
+static int do_md_stop(struct mddev *mddev, int ro, struct block_device *bdev);
+static int md_set_readonly(struct mddev *mddev, struct block_device *bdev);
+static int do_md_run(struct mddev *mddev);
+static int restart_array(struct mddev *mddev);
+
+static ssize_t
+array_state_store(struct mddev *mddev, const char *buf, size_t len)
+{
+ int err;
+ enum array_state st = match_word(buf, array_states);
+
+ if (mddev->pers && (st == active || st == clean) && mddev->ro != 1) {
+ /* don't take reconfig_mutex when toggling between
+ * clean and active
+ */
+ spin_lock(&mddev->lock);
+ if (st == active) {
+ restart_array(mddev);
+ clear_bit(MD_CHANGE_PENDING, &mddev->flags);
+ wake_up(&mddev->sb_wait);
+ err = 0;
+ } else /* st == clean */ {
+ restart_array(mddev);
+ if (atomic_read(&mddev->writes_pending) == 0) {
+ if (mddev->in_sync == 0) {
+ mddev->in_sync = 1;
+ if (mddev->safemode == 1)
+ mddev->safemode = 0;
+ set_bit(MD_CHANGE_CLEAN, &mddev->flags);
+ }
+ err = 0;
+ } else
+ err = -EBUSY;
+ }
+ spin_unlock(&mddev->lock);
+ return err ?: len;
+ }
+ err = mddev_lock(mddev);
+ if (err)
+ return err;
+ err = -EINVAL;
+ switch(st) {
+ case bad_word:
+ break;
+ case clear:
+ /* stopping an active array */
+ err = do_md_stop(mddev, 0, NULL);
+ break;
+ case inactive:
+ /* stopping an active array */
+ if (mddev->pers)
+ err = do_md_stop(mddev, 2, NULL);
+ else
+ err = 0; /* already inactive */
+ break;
+ case suspended:
+ break; /* not supported yet */
+ case readonly:
+ if (mddev->pers)
+ err = md_set_readonly(mddev, NULL);
+ else {
+ mddev->ro = 1;
+ set_disk_ro(mddev->gendisk, 1);
+ err = do_md_run(mddev);
+ }
+ break;
+ case read_auto:
+ if (mddev->pers) {
+ if (mddev->ro == 0)
+ err = md_set_readonly(mddev, NULL);
+ else if (mddev->ro == 1)
+ err = restart_array(mddev);
+ if (err == 0) {
+ mddev->ro = 2;
+ set_disk_ro(mddev->gendisk, 0);
+ }
+ } else {
+ mddev->ro = 2;
+ err = do_md_run(mddev);
+ }
+ break;
+ case clean:
+ if (mddev->pers) {
+ restart_array(mddev);
+ spin_lock(&mddev->lock);
+ if (atomic_read(&mddev->writes_pending) == 0) {
+ if (mddev->in_sync == 0) {
+ mddev->in_sync = 1;
+ if (mddev->safemode == 1)
+ mddev->safemode = 0;
+ set_bit(MD_CHANGE_CLEAN, &mddev->flags);
+ }
+ err = 0;
+ } else
+ err = -EBUSY;
+ spin_unlock(&mddev->lock);
+ } else
+ err = -EINVAL;
+ break;
+ case active:
+ if (mddev->pers) {
+ restart_array(mddev);
+ clear_bit(MD_CHANGE_PENDING, &mddev->flags);
+ wake_up(&mddev->sb_wait);
+ err = 0;
+ } else {
+ mddev->ro = 0;
+ set_disk_ro(mddev->gendisk, 0);
+ err = do_md_run(mddev);
+ }
+ break;
+ case write_pending:
+ case active_idle:
+ /* these cannot be set */
+ break;
+ }
+
+ if (!err) {
+ if (mddev->hold_active == UNTIL_IOCTL)
+ mddev->hold_active = 0;
+ sysfs_notify_dirent_safe(mddev->sysfs_state);
+ }
+ mddev_unlock(mddev);
+ return err ?: len;
+}
+static struct md_sysfs_entry md_array_state =
+__ATTR_PREALLOC(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store);
+
+static ssize_t
+max_corrected_read_errors_show(struct mddev *mddev, char *page) {
+ return sprintf(page, "%d\n",
+ atomic_read(&mddev->max_corr_read_errors));
+}
+
+static ssize_t
+max_corrected_read_errors_store(struct mddev *mddev, const char *buf, size_t len)
+{
+ char *e;
+ unsigned long n = simple_strtoul(buf, &e, 10);
+
+ if (*buf && (*e == 0 || *e == '\n')) {
+ atomic_set(&mddev->max_corr_read_errors, n);
+ return len;
+ }
+ return -EINVAL;
+}
+
+static struct md_sysfs_entry max_corr_read_errors =
+__ATTR(max_read_errors, S_IRUGO|S_IWUSR, max_corrected_read_errors_show,
+ max_corrected_read_errors_store);
+
+static ssize_t
+null_show(struct mddev *mddev, char *page)
+{
+ return -EINVAL;
+}
+
+static ssize_t
+new_dev_store(struct mddev *mddev, const char *buf, size_t len)
+{
+ /* buf must be %d:%d\n? giving major and minor numbers */
+ /* The new device is added to the array.
+ * If the array has a persistent superblock, we read the
+ * superblock to initialise info and check validity.
+ * Otherwise, only checking done is that in bind_rdev_to_array,
+ * which mainly checks size.
+ */
+ char *e;
+ int major = simple_strtoul(buf, &e, 10);
+ int minor;
+ dev_t dev;
+ struct md_rdev *rdev;
+ int err;
+
+ if (!*buf || *e != ':' || !e[1] || e[1] == '\n')
+ return -EINVAL;
+ minor = simple_strtoul(e+1, &e, 10);
+ if (*e && *e != '\n')
+ return -EINVAL;
+ dev = MKDEV(major, minor);
+ if (major != MAJOR(dev) ||
+ minor != MINOR(dev))
+ return -EOVERFLOW;
+
+ flush_workqueue(md_misc_wq);
+
+ err = mddev_lock(mddev);
+ if (err)
+ return err;
+ if (mddev->persistent) {
+ rdev = md_import_device(dev, mddev->major_version,
+ mddev->minor_version);
+ if (!IS_ERR(rdev) && !list_empty(&mddev->disks)) {
+ struct md_rdev *rdev0
+ = list_entry(mddev->disks.next,
+ struct md_rdev, same_set);
+ err = super_types[mddev->major_version]
+ .load_super(rdev, rdev0, mddev->minor_version);
+ if (err < 0)
+ goto out;
+ }
+ } else if (mddev->external)
+ rdev = md_import_device(dev, -2, -1);
+ else
+ rdev = md_import_device(dev, -1, -1);
+
+ if (IS_ERR(rdev)) {
+ mddev_unlock(mddev);
+ return PTR_ERR(rdev);
+ }
+ err = bind_rdev_to_array(rdev, mddev);
+ out:
+ if (err)
+ export_rdev(rdev);
+ mddev_unlock(mddev);
+ return err ? err : len;
+}
+
+static struct md_sysfs_entry md_new_device =
+__ATTR(new_dev, S_IWUSR, null_show, new_dev_store);
+
+static ssize_t
+bitmap_store(struct mddev *mddev, const char *buf, size_t len)
+{
+ char *end;
+ unsigned long chunk, end_chunk;
+ int err;
+
+ err = mddev_lock(mddev);
+ if (err)
+ return err;
+ if (!mddev->bitmap)
+ goto out;
+ /* buf should be <chunk> <chunk> ... or <chunk>-<chunk> ... (range) */
+ while (*buf) {
+ chunk = end_chunk = simple_strtoul(buf, &end, 0);
+ if (buf == end) break;
+ if (*end == '-') { /* range */
+ buf = end + 1;
+ end_chunk = simple_strtoul(buf, &end, 0);
+ if (buf == end) break;
+ }
+ if (*end && !isspace(*end)) break;
+ bitmap_dirty_bits(mddev->bitmap, chunk, end_chunk);
+ buf = skip_spaces(end);
+ }
+ bitmap_unplug(mddev->bitmap); /* flush the bits to disk */
+out:
+ mddev_unlock(mddev);
+ return len;
+}
+
+static struct md_sysfs_entry md_bitmap =
+__ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store);
+
+static ssize_t
+size_show(struct mddev *mddev, char *page)
+{
+ return sprintf(page, "%llu\n",
+ (unsigned long long)mddev->dev_sectors / 2);
+}
+
+static int update_size(struct mddev *mddev, sector_t num_sectors);
+
+static ssize_t
+size_store(struct mddev *mddev, const char *buf, size_t len)
+{
+ /* If array is inactive, we can reduce the component size, but
+ * not increase it (except from 0).
+ * If array is active, we can try an on-line resize
+ */
+ sector_t sectors;
+ int err = strict_blocks_to_sectors(buf, &sectors);
+
+ if (err < 0)
+ return err;
+ err = mddev_lock(mddev);
+ if (err)
+ return err;
+ if (mddev->pers) {
+ if (mddev_is_clustered(mddev))
+ md_cluster_ops->metadata_update_start(mddev);
+ err = update_size(mddev, sectors);
+ md_update_sb(mddev, 1);
+ if (mddev_is_clustered(mddev))
+ md_cluster_ops->metadata_update_finish(mddev);
+ } else {
+ if (mddev->dev_sectors == 0 ||
+ mddev->dev_sectors > sectors)
+ mddev->dev_sectors = sectors;
+ else
+ err = -ENOSPC;
+ }
+ mddev_unlock(mddev);
+ return err ? err : len;
+}
+
+static struct md_sysfs_entry md_size =
+__ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store);
+
+/* Metadata version.
+ * This is one of
+ * 'none' for arrays with no metadata (good luck...)
+ * 'external' for arrays with externally managed metadata,
+ * or N.M for internally known formats
+ */
+static ssize_t
+metadata_show(struct mddev *mddev, char *page)
+{
+ if (mddev->persistent)
+ return sprintf(page, "%d.%d\n",
+ mddev->major_version, mddev->minor_version);
+ else if (mddev->external)
+ return sprintf(page, "external:%s\n", mddev->metadata_type);
+ else
+ return sprintf(page, "none\n");
+}
+
+static ssize_t
+metadata_store(struct mddev *mddev, const char *buf, size_t len)
+{
+ int major, minor;
+ char *e;
+ int err;
+ /* Changing the details of 'external' metadata is
+ * always permitted. Otherwise there must be
+ * no devices attached to the array.
+ */
+
+ err = mddev_lock(mddev);
+ if (err)
+ return err;
+ err = -EBUSY;
+ if (mddev->external && strncmp(buf, "external:", 9) == 0)
+ ;
+ else if (!list_empty(&mddev->disks))
+ goto out_unlock;
+
+ err = 0;
+ if (cmd_match(buf, "none")) {
+ mddev->persistent = 0;
+ mddev->external = 0;
+ mddev->major_version = 0;
+ mddev->minor_version = 90;
+ goto out_unlock;
+ }
+ if (strncmp(buf, "external:", 9) == 0) {
+ size_t namelen = len-9;
+ if (namelen >= sizeof(mddev->metadata_type))
+ namelen = sizeof(mddev->metadata_type)-1;
+ strncpy(mddev->metadata_type, buf+9, namelen);
+ mddev->metadata_type[namelen] = 0;
+ if (namelen && mddev->metadata_type[namelen-1] == '\n')
+ mddev->metadata_type[--namelen] = 0;
+ mddev->persistent = 0;
+ mddev->external = 1;
+ mddev->major_version = 0;
+ mddev->minor_version = 90;
+ goto out_unlock;
+ }
+ major = simple_strtoul(buf, &e, 10);
+ err = -EINVAL;
+ if (e==buf || *e != '.')
+ goto out_unlock;
+ buf = e+1;
+ minor = simple_strtoul(buf, &e, 10);
+ if (e==buf || (*e && *e != '\n') )
+ goto out_unlock;
+ err = -ENOENT;
+ if (major >= ARRAY_SIZE(super_types) || super_types[major].name == NULL)
+ goto out_unlock;
+ mddev->major_version = major;
+ mddev->minor_version = minor;
+ mddev->persistent = 1;
+ mddev->external = 0;
+ err = 0;
+out_unlock:
+ mddev_unlock(mddev);
+ return err ?: len;
+}
+
+static struct md_sysfs_entry md_metadata =
+__ATTR_PREALLOC(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store);
+
+static ssize_t
+action_show(struct mddev *mddev, char *page)
+{
+ char *type = "idle";
+ unsigned long recovery = mddev->recovery;
+ if (test_bit(MD_RECOVERY_FROZEN, &recovery))
+ type = "frozen";
+ else if (test_bit(MD_RECOVERY_RUNNING, &recovery) ||
+ (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &recovery))) {
+ if (test_bit(MD_RECOVERY_RESHAPE, &recovery))
+ type = "reshape";
+ else if (test_bit(MD_RECOVERY_SYNC, &recovery)) {
+ if (!test_bit(MD_RECOVERY_REQUESTED, &recovery))
+ type = "resync";
+ else if (test_bit(MD_RECOVERY_CHECK, &recovery))
+ type = "check";
+ else
+ type = "repair";
+ } else if (test_bit(MD_RECOVERY_RECOVER, &recovery))
+ type = "recover";
+ }
+ return sprintf(page, "%s\n", type);
+}
+
+static ssize_t
+action_store(struct mddev *mddev, const char *page, size_t len)
+{
+ if (!mddev->pers || !mddev->pers->sync_request)
+ return -EINVAL;
+
+
+ if (cmd_match(page, "idle") || cmd_match(page, "frozen")) {
+ if (cmd_match(page, "frozen"))
+ set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+ else
+ clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+ if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
+ mddev_lock(mddev) == 0) {
+ flush_workqueue(md_misc_wq);
+ if (mddev->sync_thread) {
+ set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+ md_reap_sync_thread(mddev);
+ }
+ mddev_unlock(mddev);
+ }
+ } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
+ test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
+ return -EBUSY;
+ else if (cmd_match(page, "resync"))
+ clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+ else if (cmd_match(page, "recover")) {
+ clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+ set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
+ } else if (cmd_match(page, "reshape")) {
+ int err;
+ if (mddev->pers->start_reshape == NULL)
+ return -EINVAL;
+ err = mddev_lock(mddev);
+ if (!err) {
+ clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+ err = mddev->pers->start_reshape(mddev);
+ mddev_unlock(mddev);
+ }
+ if (err)
+ return err;
+ sysfs_notify(&mddev->kobj, NULL, "degraded");
+ } else {
+ if (cmd_match(page, "check"))
+ set_bit(MD_RECOVERY_CHECK, &mddev->recovery);
+ else if (!cmd_match(page, "repair"))
+ return -EINVAL;
+ clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+ set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
+ set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
+ }
+ if (mddev->ro == 2) {
+ /* A write to sync_action is enough to justify
+ * canceling read-auto mode
+ */
+ mddev->ro = 0;
+ md_wakeup_thread(mddev->sync_thread);
+ }
+ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+ md_wakeup_thread(mddev->thread);
+ sysfs_notify_dirent_safe(mddev->sysfs_action);
+ return len;
+}
+
+static struct md_sysfs_entry md_scan_mode =
+__ATTR_PREALLOC(sync_action, S_IRUGO|S_IWUSR, action_show, action_store);
+
+static ssize_t
+last_sync_action_show(struct mddev *mddev, char *page)
+{
+ return sprintf(page, "%s\n", mddev->last_sync_action);
+}
+
+static struct md_sysfs_entry md_last_scan_mode = __ATTR_RO(last_sync_action);
+
+static ssize_t
+mismatch_cnt_show(struct mddev *mddev, char *page)
+{
+ return sprintf(page, "%llu\n",
+ (unsigned long long)
+ atomic64_read(&mddev->resync_mismatches));
+}
+
+static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt);
+
+static ssize_t
+sync_min_show(struct mddev *mddev, char *page)
+{
+ return sprintf(page, "%d (%s)\n", speed_min(mddev),
+ mddev->sync_speed_min ? "local": "system");
+}
+
+static ssize_t
+sync_min_store(struct mddev *mddev, const char *buf, size_t len)
+{
+ int min;
+ char *e;
+ if (strncmp(buf, "system", 6)==0) {
+ mddev->sync_speed_min = 0;
+ return len;
+ }
+ min = simple_strtoul(buf, &e, 10);
+ if (buf == e || (*e && *e != '\n') || min <= 0)
+ return -EINVAL;
+ mddev->sync_speed_min = min;
+ return len;
+}
+
+static struct md_sysfs_entry md_sync_min =
+__ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store);
+
+static ssize_t
+sync_max_show(struct mddev *mddev, char *page)
+{
+ return sprintf(page, "%d (%s)\n", speed_max(mddev),
+ mddev->sync_speed_max ? "local": "system");
+}
+
+static ssize_t
+sync_max_store(struct mddev *mddev, const char *buf, size_t len)
+{
+ int max;
+ char *e;
+ if (strncmp(buf, "system", 6)==0) {
+ mddev->sync_speed_max = 0;
+ return len;
+ }
+ max = simple_strtoul(buf, &e, 10);
+ if (buf == e || (*e && *e != '\n') || max <= 0)
+ return -EINVAL;
+ mddev->sync_speed_max = max;
+ return len;
+}
+
+static struct md_sysfs_entry md_sync_max =
+__ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store);
+
+static ssize_t
+degraded_show(struct mddev *mddev, char *page)
+{
+ return sprintf(page, "%d\n", mddev->degraded);
+}
+static struct md_sysfs_entry md_degraded = __ATTR_RO(degraded);
+
+static ssize_t
+sync_force_parallel_show(struct mddev *mddev, char *page)
+{
+ return sprintf(page, "%d\n", mddev->parallel_resync);
+}
+
+static ssize_t
+sync_force_parallel_store(struct mddev *mddev, const char *buf, size_t len)
+{
+ long n;
+
+ if (kstrtol(buf, 10, &n))
+ return -EINVAL;
+
+ if (n != 0 && n != 1)
+ return -EINVAL;
+
+ mddev->parallel_resync = n;
+
+ if (mddev->sync_thread)
+ wake_up(&resync_wait);
+
+ return len;
+}
+
+/* force parallel resync, even with shared block devices */
+static struct md_sysfs_entry md_sync_force_parallel =
+__ATTR(sync_force_parallel, S_IRUGO|S_IWUSR,
+ sync_force_parallel_show, sync_force_parallel_store);
+
+static ssize_t
+sync_speed_show(struct mddev *mddev, char *page)
+{
+ unsigned long resync, dt, db;
+ if (mddev->curr_resync == 0)
+ return sprintf(page, "none\n");
+ resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active);
+ dt = (jiffies - mddev->resync_mark) / HZ;
+ if (!dt) dt++;
+ db = resync - mddev->resync_mark_cnt;
+ return sprintf(page, "%lu\n", db/dt/2); /* K/sec */
+}
+
+static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed);
+
+static ssize_t
+sync_completed_show(struct mddev *mddev, char *page)
+{
+ unsigned long long max_sectors, resync;
+
+ if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
+ return sprintf(page, "none\n");
+
+ if (mddev->curr_resync == 1 ||
+ mddev->curr_resync == 2)
+ return sprintf(page, "delayed\n");
+
+ if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
+ test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
+ max_sectors = mddev->resync_max_sectors;
+ else
+ max_sectors = mddev->dev_sectors;
+
+ resync = mddev->curr_resync_completed;
+ return sprintf(page, "%llu / %llu\n", resync, max_sectors);
+}
+
+static struct md_sysfs_entry md_sync_completed =
+ __ATTR_PREALLOC(sync_completed, S_IRUGO, sync_completed_show, NULL);
+
+static ssize_t
+min_sync_show(struct mddev *mddev, char *page)
+{
+ return sprintf(page, "%llu\n",
+ (unsigned long long)mddev->resync_min);
+}
+static ssize_t
+min_sync_store(struct mddev *mddev, const char *buf, size_t len)
+{
+ unsigned long long min;
+ int err;
+
+ if (kstrtoull(buf, 10, &min))
+ return -EINVAL;
+
+ spin_lock(&mddev->lock);
+ err = -EINVAL;
+ if (min > mddev->resync_max)
+ goto out_unlock;
+
+ err = -EBUSY;
+ if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
+ goto out_unlock;
+
+ /* Round down to multiple of 4K for safety */
+ mddev->resync_min = round_down(min, 8);
+ err = 0;
+
+out_unlock:
+ spin_unlock(&mddev->lock);
+ return err ?: len;
+}
+
+static struct md_sysfs_entry md_min_sync =
+__ATTR(sync_min, S_IRUGO|S_IWUSR, min_sync_show, min_sync_store);
+
+static ssize_t
+max_sync_show(struct mddev *mddev, char *page)
+{
+ if (mddev->resync_max == MaxSector)
+ return sprintf(page, "max\n");
+ else
+ return sprintf(page, "%llu\n",
+ (unsigned long long)mddev->resync_max);
+}
+static ssize_t
+max_sync_store(struct mddev *mddev, const char *buf, size_t len)
+{
+ int err;
+ spin_lock(&mddev->lock);
+ if (strncmp(buf, "max", 3) == 0)
+ mddev->resync_max = MaxSector;
+ else {
+ unsigned long long max;
+ int chunk;
+
+ err = -EINVAL;
+ if (kstrtoull(buf, 10, &max))
+ goto out_unlock;
+ if (max < mddev->resync_min)
+ goto out_unlock;
+
+ err = -EBUSY;
+ if (max < mddev->resync_max &&
+ mddev->ro == 0 &&
+ test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
+ goto out_unlock;
+
+ /* Must be a multiple of chunk_size */
+ chunk = mddev->chunk_sectors;
+ if (chunk) {
+ sector_t temp = max;
+
+ err = -EINVAL;
+ if (sector_div(temp, chunk))
+ goto out_unlock;
+ }
+ mddev->resync_max = max;
+ }
+ wake_up(&mddev->recovery_wait);
+ err = 0;
+out_unlock:
+ spin_unlock(&mddev->lock);
+ return err ?: len;
+}
+
+static struct md_sysfs_entry md_max_sync =
+__ATTR(sync_max, S_IRUGO|S_IWUSR, max_sync_show, max_sync_store);
+
+static ssize_t
+suspend_lo_show(struct mddev *mddev, char *page)
+{
+ return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_lo);
+}
+
+static ssize_t
+suspend_lo_store(struct mddev *mddev, const char *buf, size_t len)
+{
+ char *e;
+ unsigned long long new = simple_strtoull(buf, &e, 10);
+ unsigned long long old;
+ int err;
+
+ if (buf == e || (*e && *e != '\n'))
+ return -EINVAL;
+
+ err = mddev_lock(mddev);
+ if (err)
+ return err;
+ err = -EINVAL;
+ if (mddev->pers == NULL ||
+ mddev->pers->quiesce == NULL)
+ goto unlock;
+ old = mddev->suspend_lo;
+ mddev->suspend_lo = new;
+ if (new >= old)
+ /* Shrinking suspended region */
+ mddev->pers->quiesce(mddev, 2);
+ else {
+ /* Expanding suspended region - need to wait */
+ mddev->pers->quiesce(mddev, 1);
+ mddev->pers->quiesce(mddev, 0);
+ }
+ err = 0;
+unlock:
+ mddev_unlock(mddev);
+ return err ?: len;
+}
+static struct md_sysfs_entry md_suspend_lo =
+__ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store);
+
+static ssize_t
+suspend_hi_show(struct mddev *mddev, char *page)
+{
+ return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_hi);
+}
+
+static ssize_t
+suspend_hi_store(struct mddev *mddev, const char *buf, size_t len)
+{
+ char *e;
+ unsigned long long new = simple_strtoull(buf, &e, 10);
+ unsigned long long old;
+ int err;
+
+ if (buf == e || (*e && *e != '\n'))
+ return -EINVAL;
+
+ err = mddev_lock(mddev);
+ if (err)
+ return err;
+ err = -EINVAL;
+ if (mddev->pers == NULL ||
+ mddev->pers->quiesce == NULL)
+ goto unlock;
+ old = mddev->suspend_hi;
+ mddev->suspend_hi = new;
+ if (new <= old)
+ /* Shrinking suspended region */
+ mddev->pers->quiesce(mddev, 2);
+ else {
+ /* Expanding suspended region - need to wait */
+ mddev->pers->quiesce(mddev, 1);
+ mddev->pers->quiesce(mddev, 0);
+ }
+ err = 0;
+unlock:
+ mddev_unlock(mddev);
+ return err ?: len;
+}
+static struct md_sysfs_entry md_suspend_hi =
+__ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store);
+
+static ssize_t
+reshape_position_show(struct mddev *mddev, char *page)
+{
+ if (mddev->reshape_position != MaxSector)
+ return sprintf(page, "%llu\n",
+ (unsigned long long)mddev->reshape_position);
+ strcpy(page, "none\n");
+ return 5;
+}
+
+static ssize_t
+reshape_position_store(struct mddev *mddev, const char *buf, size_t len)
+{
+ struct md_rdev *rdev;
+ char *e;
+ int err;
+ unsigned long long new = simple_strtoull(buf, &e, 10);
+
+ if (buf == e || (*e && *e != '\n'))
+ return -EINVAL;
+ err = mddev_lock(mddev);
+ if (err)
+ return err;
+ err = -EBUSY;
+ if (mddev->pers)
+ goto unlock;
+ mddev->reshape_position = new;
+ mddev->delta_disks = 0;
+ mddev->reshape_backwards = 0;
+ mddev->new_level = mddev->level;
+ mddev->new_layout = mddev->layout;
+ mddev->new_chunk_sectors = mddev->chunk_sectors;
+ rdev_for_each(rdev, mddev)
+ rdev->new_data_offset = rdev->data_offset;
+ err = 0;
+unlock:
+ mddev_unlock(mddev);
+ return err ?: len;
+}
+
+static struct md_sysfs_entry md_reshape_position =
+__ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show,
+ reshape_position_store);
+
+static ssize_t
+reshape_direction_show(struct mddev *mddev, char *page)
+{
+ return sprintf(page, "%s\n",
+ mddev->reshape_backwards ? "backwards" : "forwards");
+}
+
+static ssize_t
+reshape_direction_store(struct mddev *mddev, const char *buf, size_t len)
+{
+ int backwards = 0;
+ int err;
+
+ if (cmd_match(buf, "forwards"))
+ backwards = 0;
+ else if (cmd_match(buf, "backwards"))
+ backwards = 1;
+ else
+ return -EINVAL;
+ if (mddev->reshape_backwards == backwards)
+ return len;
+
+ err = mddev_lock(mddev);
+ if (err)
+ return err;
+ /* check if we are allowed to change */
+ if (mddev->delta_disks)
+ err = -EBUSY;
+ else if (mddev->persistent &&
+ mddev->major_version == 0)
+ err = -EINVAL;
+ else
+ mddev->reshape_backwards = backwards;
+ mddev_unlock(mddev);
+ return err ?: len;
+}
+
+static struct md_sysfs_entry md_reshape_direction =
+__ATTR(reshape_direction, S_IRUGO|S_IWUSR, reshape_direction_show,
+ reshape_direction_store);
+
+static ssize_t
+array_size_show(struct mddev *mddev, char *page)
+{
+ if (mddev->external_size)
+ return sprintf(page, "%llu\n",
+ (unsigned long long)mddev->array_sectors/2);
+ else
+ return sprintf(page, "default\n");
+}
+
+static ssize_t
+array_size_store(struct mddev *mddev, const char *buf, size_t len)
+{
+ sector_t sectors;
+ int err;
+
+ err = mddev_lock(mddev);
+ if (err)
+ return err;
+
+ if (strncmp(buf, "default", 7) == 0) {
+ if (mddev->pers)
+ sectors = mddev->pers->size(mddev, 0, 0);
+ else
+ sectors = mddev->array_sectors;
+
+ mddev->external_size = 0;
+ } else {
+ if (strict_blocks_to_sectors(buf, &sectors) < 0)
+ err = -EINVAL;
+ else if (mddev->pers && mddev->pers->size(mddev, 0, 0) < sectors)
+ err = -E2BIG;
+ else
+ mddev->external_size = 1;
+ }
+
+ if (!err) {
+ mddev->array_sectors = sectors;
+ if (mddev->pers) {
+ set_capacity(mddev->gendisk, mddev->array_sectors);
+ revalidate_disk(mddev->gendisk);
+ }
+ }
+ mddev_unlock(mddev);
+ return err ?: len;
+}
+
+static struct md_sysfs_entry md_array_size =
+__ATTR(array_size, S_IRUGO|S_IWUSR, array_size_show,
+ array_size_store);
+
+static struct attribute *md_default_attrs[] = {
+ &md_level.attr,
+ &md_layout.attr,
+ &md_raid_disks.attr,
+ &md_chunk_size.attr,
+ &md_size.attr,
+ &md_resync_start.attr,
+ &md_metadata.attr,
+ &md_new_device.attr,
+ &md_safe_delay.attr,
+ &md_array_state.attr,
+ &md_reshape_position.attr,
+ &md_reshape_direction.attr,
+ &md_array_size.attr,
+ &max_corr_read_errors.attr,
+ NULL,
+};
+
+static struct attribute *md_redundancy_attrs[] = {
+ &md_scan_mode.attr,
+ &md_last_scan_mode.attr,
+ &md_mismatches.attr,
+ &md_sync_min.attr,
+ &md_sync_max.attr,
+ &md_sync_speed.attr,
+ &md_sync_force_parallel.attr,
+ &md_sync_completed.attr,
+ &md_min_sync.attr,
+ &md_max_sync.attr,
+ &md_suspend_lo.attr,
+ &md_suspend_hi.attr,
+ &md_bitmap.attr,
+ &md_degraded.attr,
+ NULL,
+};
+static struct attribute_group md_redundancy_group = {
+ .name = NULL,
+ .attrs = md_redundancy_attrs,
+};
+
+static ssize_t
+md_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
+{
+ struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
+ struct mddev *mddev = container_of(kobj, struct mddev, kobj);
+ ssize_t rv;
+
+ if (!entry->show)
+ return -EIO;
+ spin_lock(&all_mddevs_lock);
+ if (list_empty(&mddev->all_mddevs)) {
+ spin_unlock(&all_mddevs_lock);
+ return -EBUSY;
+ }
+ mddev_get(mddev);
+ spin_unlock(&all_mddevs_lock);
+
+ rv = entry->show(mddev, page);
+ mddev_put(mddev);
+ return rv;
+}
+
+static ssize_t
+md_attr_store(struct kobject *kobj, struct attribute *attr,
+ const char *page, size_t length)
+{
+ struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
+ struct mddev *mddev = container_of(kobj, struct mddev, kobj);
+ ssize_t rv;
+
+ if (!entry->store)
+ return -EIO;
+ if (!capable(CAP_SYS_ADMIN))
+ return -EACCES;
+ spin_lock(&all_mddevs_lock);
+ if (list_empty(&mddev->all_mddevs)) {
+ spin_unlock(&all_mddevs_lock);
+ return -EBUSY;
+ }
+ mddev_get(mddev);
+ spin_unlock(&all_mddevs_lock);
+ rv = entry->store(mddev, page, length);
+ mddev_put(mddev);
+ return rv;
+}
+
+static void md_free(struct kobject *ko)
+{
+ struct mddev *mddev = container_of(ko, struct mddev, kobj);
+
+ if (mddev->sysfs_state)
+ sysfs_put(mddev->sysfs_state);
+
+ if (mddev->queue)
+ blk_cleanup_queue(mddev->queue);
+ if (mddev->gendisk) {
+ del_gendisk(mddev->gendisk);
+ put_disk(mddev->gendisk);
+ }
+
+ kfree(mddev);
+}
+
+static const struct sysfs_ops md_sysfs_ops = {
+ .show = md_attr_show,
+ .store = md_attr_store,
+};
+static struct kobj_type md_ktype = {
+ .release = md_free,
+ .sysfs_ops = &md_sysfs_ops,
+ .default_attrs = md_default_attrs,
+};
+
+int mdp_major = 0;
+
+static void mddev_delayed_delete(struct work_struct *ws)
+{
+ struct mddev *mddev = container_of(ws, struct mddev, del_work);
+
+ sysfs_remove_group(&mddev->kobj, &md_bitmap_group);
+ kobject_del(&mddev->kobj);
+ kobject_put(&mddev->kobj);
+}
+
+static int md_alloc(dev_t dev, char *name)
+{
+ static DEFINE_MUTEX(disks_mutex);
+ struct mddev *mddev = mddev_find(dev);
+ struct gendisk *disk;
+ int partitioned;
+ int shift;
+ int unit;
+ int error;
+
+ if (!mddev)
+ return -ENODEV;
+
+ partitioned = (MAJOR(mddev->unit) != MD_MAJOR);
+ shift = partitioned ? MdpMinorShift : 0;
+ unit = MINOR(mddev->unit) >> shift;
+
+ /* wait for any previous instance of this device to be
+ * completely removed (mddev_delayed_delete).
+ */
+ flush_workqueue(md_misc_wq);
+
+ mutex_lock(&disks_mutex);
+ error = -EEXIST;
+ if (mddev->gendisk)
+ goto abort;
+
+ if (name) {
+ /* Need to ensure that 'name' is not a duplicate.
+ */
+ struct mddev *mddev2;
+ spin_lock(&all_mddevs_lock);
+
+ list_for_each_entry(mddev2, &all_mddevs, all_mddevs)
+ if (mddev2->gendisk &&
+ strcmp(mddev2->gendisk->disk_name, name) == 0) {
+ spin_unlock(&all_mddevs_lock);
+ goto abort;
+ }
+ spin_unlock(&all_mddevs_lock);
+ }
+
+ error = -ENOMEM;
+ mddev->queue = blk_alloc_queue(GFP_KERNEL);
+ if (!mddev->queue)
+ goto abort;
+ mddev->queue->queuedata = mddev;
+
+ blk_queue_make_request(mddev->queue, md_make_request);
+ blk_set_stacking_limits(&mddev->queue->limits);
+
+ disk = alloc_disk(1 << shift);
+ if (!disk) {
+ blk_cleanup_queue(mddev->queue);
+ mddev->queue = NULL;
+ goto abort;
+ }
+ disk->major = MAJOR(mddev->unit);
+ disk->first_minor = unit << shift;
+ if (name)
+ strcpy(disk->disk_name, name);
+ else if (partitioned)
+ sprintf(disk->disk_name, "md_d%d", unit);
+ else
+ sprintf(disk->disk_name, "md%d", unit);
+ disk->fops = &md_fops;
+ disk->private_data = mddev;
+ disk->queue = mddev->queue;
+ blk_queue_flush(mddev->queue, REQ_FLUSH | REQ_FUA);
+ /* Allow extended partitions. This makes the
+ * 'mdp' device redundant, but we can't really
+ * remove it now.
+ */
+ disk->flags |= GENHD_FL_EXT_DEVT;
+ mddev->gendisk = disk;
+ /* As soon as we call add_disk(), another thread could get
+ * through to md_open, so make sure it doesn't get too far
+ */
+ mutex_lock(&mddev->open_mutex);
+ add_disk(disk);
+
+ error = kobject_init_and_add(&mddev->kobj, &md_ktype,
+ &disk_to_dev(disk)->kobj, "%s", "md");
+ if (error) {
+ /* This isn't possible, but as kobject_init_and_add is marked
+ * __must_check, we must do something with the result
+ */
+ printk(KERN_WARNING "md: cannot register %s/md - name in use\n",
+ disk->disk_name);
+ error = 0;
+ }
+ if (mddev->kobj.sd &&
+ sysfs_create_group(&mddev->kobj, &md_bitmap_group))
+ printk(KERN_DEBUG "pointless warning\n");
+ mutex_unlock(&mddev->open_mutex);
+ abort:
+ mutex_unlock(&disks_mutex);
+ if (!error && mddev->kobj.sd) {
+ kobject_uevent(&mddev->kobj, KOBJ_ADD);
+ mddev->sysfs_state = sysfs_get_dirent_safe(mddev->kobj.sd, "array_state");
+ }
+ mddev_put(mddev);
+ return error;
+}
+
+static struct kobject *md_probe(dev_t dev, int *part, void *data)
+{
+ md_alloc(dev, NULL);
+ return NULL;
+}
+
+static int add_named_array(const char *val, struct kernel_param *kp)
+{
+ /* val must be "md_*" where * is not all digits.
+ * We allocate an array with a large free minor number, and
+ * set the name to val. val must not already be an active name.
+ */
+ int len = strlen(val);
+ char buf[DISK_NAME_LEN];
+
+ while (len && val[len-1] == '\n')
+ len--;
+ if (len >= DISK_NAME_LEN)
+ return -E2BIG;
+ strlcpy(buf, val, len+1);
+ if (strncmp(buf, "md_", 3) != 0)
+ return -EINVAL;
+ return md_alloc(0, buf);
+}
+
+static void md_safemode_timeout(unsigned long data)
+{
+ struct mddev *mddev = (struct mddev *) data;
+
+ if (!atomic_read(&mddev->writes_pending)) {
+ mddev->safemode = 1;
+ if (mddev->external)
+ sysfs_notify_dirent_safe(mddev->sysfs_state);
+ }
+ md_wakeup_thread(mddev->thread);
+}
+
+static int start_dirty_degraded;
+
+int md_run(struct mddev *mddev)
+{
+ int err;
+ struct md_rdev *rdev;
+ struct md_personality *pers;
+
+ if (list_empty(&mddev->disks))
+ /* cannot run an array with no devices.. */
+ return -EINVAL;
+
+ if (mddev->pers)
+ return -EBUSY;
+ /* Cannot run until previous stop completes properly */
+ if (mddev->sysfs_active)
+ return -EBUSY;
+
+ /*
+ * Analyze all RAID superblock(s)
+ */
+ if (!mddev->raid_disks) {
+ if (!mddev->persistent)
+ return -EINVAL;
+ analyze_sbs(mddev);
+ }
+
+ if (mddev->level != LEVEL_NONE)
+ request_module("md-level-%d", mddev->level);
+ else if (mddev->clevel[0])
+ request_module("md-%s", mddev->clevel);
+
+ /*
+ * Drop all container device buffers, from now on
+ * the only valid external interface is through the md
+ * device.
+ */
+ rdev_for_each(rdev, mddev) {
+ if (test_bit(Faulty, &rdev->flags))
+ continue;
+ sync_blockdev(rdev->bdev);
+ invalidate_bdev(rdev->bdev);
+
+ /* perform some consistency tests on the device.
+ * We don't want the data to overlap the metadata,
+ * Internal Bitmap issues have been handled elsewhere.
+ */
+ if (rdev->meta_bdev) {
+ /* Nothing to check */;
+ } else if (rdev->data_offset < rdev->sb_start) {
+ if (mddev->dev_sectors &&
+ rdev->data_offset + mddev->dev_sectors
+ > rdev->sb_start) {
+ printk("md: %s: data overlaps metadata\n",
+ mdname(mddev));
+ return -EINVAL;
+ }
+ } else {
+ if (rdev->sb_start + rdev->sb_size/512
+ > rdev->data_offset) {
+ printk("md: %s: metadata overlaps data\n",
+ mdname(mddev));
+ return -EINVAL;
+ }
+ }
+ sysfs_notify_dirent_safe(rdev->sysfs_state);
+ }
+
+ if (mddev->bio_set == NULL)
+ mddev->bio_set = bioset_create(BIO_POOL_SIZE, 0);
+
+ spin_lock(&pers_lock);
+ pers = find_pers(mddev->level, mddev->clevel);
+ if (!pers || !try_module_get(pers->owner)) {
+ spin_unlock(&pers_lock);
+ if (mddev->level != LEVEL_NONE)
+ printk(KERN_WARNING "md: personality for level %d is not loaded!\n",
+ mddev->level);
+ else
+ printk(KERN_WARNING "md: personality for level %s is not loaded!\n",
+ mddev->clevel);
+ return -EINVAL;
+ }
+ spin_unlock(&pers_lock);
+ if (mddev->level != pers->level) {
+ mddev->level = pers->level;
+ mddev->new_level = pers->level;
+ }
+ strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
+
+ if (mddev->reshape_position != MaxSector &&
+ pers->start_reshape == NULL) {
+ /* This personality cannot handle reshaping... */
+ module_put(pers->owner);
+ return -EINVAL;
+ }
+
+ if (pers->sync_request) {
+ /* Warn if this is a potentially silly
+ * configuration.
+ */
+ char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
+ struct md_rdev *rdev2;
+ int warned = 0;
+
+ rdev_for_each(rdev, mddev)
+ rdev_for_each(rdev2, mddev) {
+ if (rdev < rdev2 &&
+ rdev->bdev->bd_contains ==
+ rdev2->bdev->bd_contains) {
+ printk(KERN_WARNING
+ "%s: WARNING: %s appears to be"
+ " on the same physical disk as"
+ " %s.\n",
+ mdname(mddev),
+ bdevname(rdev->bdev,b),
+ bdevname(rdev2->bdev,b2));
+ warned = 1;
+ }
+ }
+
+ if (warned)
+ printk(KERN_WARNING
+ "True protection against single-disk"
+ " failure might be compromised.\n");
+ }
+
+ mddev->recovery = 0;
+ /* may be over-ridden by personality */
+ mddev->resync_max_sectors = mddev->dev_sectors;
+
+ mddev->ok_start_degraded = start_dirty_degraded;
+
+ if (start_readonly && mddev->ro == 0)
+ mddev->ro = 2; /* read-only, but switch on first write */
+
+ err = pers->run(mddev);
+ if (err)
+ printk(KERN_ERR "md: pers->run() failed ...\n");
+ else if (pers->size(mddev, 0, 0) < mddev->array_sectors) {
+ WARN_ONCE(!mddev->external_size, "%s: default size too small,"
+ " but 'external_size' not in effect?\n", __func__);
+ printk(KERN_ERR
+ "md: invalid array_size %llu > default size %llu\n",
+ (unsigned long long)mddev->array_sectors / 2,
+ (unsigned long long)pers->size(mddev, 0, 0) / 2);
+ err = -EINVAL;
+ }
+ if (err == 0 && pers->sync_request &&
+ (mddev->bitmap_info.file || mddev->bitmap_info.offset)) {
+ struct bitmap *bitmap;
+
+ bitmap = bitmap_create(mddev, -1);
+ if (IS_ERR(bitmap)) {
+ err = PTR_ERR(bitmap);
+ printk(KERN_ERR "%s: failed to create bitmap (%d)\n",
+ mdname(mddev), err);
+ } else
+ mddev->bitmap = bitmap;
+
+ }
+ if (err) {
+ mddev_detach(mddev);
+ if (mddev->private)
+ pers->free(mddev, mddev->private);
+ mddev->private = NULL;
+ module_put(pers->owner);
+ bitmap_destroy(mddev);
+ return err;
+ }
+ if (mddev->queue) {
+ mddev->queue->backing_dev_info.congested_data = mddev;
+ mddev->queue->backing_dev_info.congested_fn = md_congested;
+ blk_queue_merge_bvec(mddev->queue, md_mergeable_bvec);
+ }
+ if (pers->sync_request) {
+ if (mddev->kobj.sd &&
+ sysfs_create_group(&mddev->kobj, &md_redundancy_group))
+ printk(KERN_WARNING
+ "md: cannot register extra attributes for %s\n",
+ mdname(mddev));
+ mddev->sysfs_action = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_action");
+ } else if (mddev->ro == 2) /* auto-readonly not meaningful */
+ mddev->ro = 0;
+
+ atomic_set(&mddev->writes_pending,0);
+ atomic_set(&mddev->max_corr_read_errors,
+ MD_DEFAULT_MAX_CORRECTED_READ_ERRORS);
+ mddev->safemode = 0;
+ mddev->safemode_timer.function = md_safemode_timeout;
+ mddev->safemode_timer.data = (unsigned long) mddev;
+ mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */
+ mddev->in_sync = 1;
+ smp_wmb();
+ spin_lock(&mddev->lock);
+ mddev->pers = pers;
+ mddev->ready = 1;
+ spin_unlock(&mddev->lock);
+ rdev_for_each(rdev, mddev)
+ if (rdev->raid_disk >= 0)
+ if (sysfs_link_rdev(mddev, rdev))
+ /* failure here is OK */;
+
+ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+
+ if (mddev->flags & MD_UPDATE_SB_FLAGS)
+ md_update_sb(mddev, 0);
+
+ md_new_event(mddev);
+ sysfs_notify_dirent_safe(mddev->sysfs_state);
+ sysfs_notify_dirent_safe(mddev->sysfs_action);
+ sysfs_notify(&mddev->kobj, NULL, "degraded");
+ return 0;
+}
+EXPORT_SYMBOL_GPL(md_run);
+
+static int do_md_run(struct mddev *mddev)
+{
+ int err;
+
+ err = md_run(mddev);
+ if (err)
+ goto out;
+ err = bitmap_load(mddev);
+ if (err) {
+ bitmap_destroy(mddev);
+ goto out;
+ }
+
+ md_wakeup_thread(mddev->thread);
+ md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
+
+ set_capacity(mddev->gendisk, mddev->array_sectors);
+ revalidate_disk(mddev->gendisk);
+ mddev->changed = 1;
+ kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
+out:
+ return err;
+}
+
+static int restart_array(struct mddev *mddev)
+{
+ struct gendisk *disk = mddev->gendisk;
+
+ /* Complain if it has no devices */
+ if (list_empty(&mddev->disks))
+ return -ENXIO;
+ if (!mddev->pers)
+ return -EINVAL;
+ if (!mddev->ro)
+ return -EBUSY;
+ mddev->safemode = 0;
+ mddev->ro = 0;
+ set_disk_ro(disk, 0);
+ printk(KERN_INFO "md: %s switched to read-write mode.\n",
+ mdname(mddev));
+ /* Kick recovery or resync if necessary */
+ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+ md_wakeup_thread(mddev->thread);
+ md_wakeup_thread(mddev->sync_thread);
+ sysfs_notify_dirent_safe(mddev->sysfs_state);
+ return 0;
+}
+
+static void md_clean(struct mddev *mddev)
+{
+ mddev->array_sectors = 0;
+ mddev->external_size = 0;
+ mddev->dev_sectors = 0;
+ mddev->raid_disks = 0;
+ mddev->recovery_cp = 0;
+ mddev->resync_min = 0;
+ mddev->resync_max = MaxSector;
+ mddev->reshape_position = MaxSector;
+ mddev->external = 0;
+ mddev->persistent = 0;
+ mddev->level = LEVEL_NONE;
+ mddev->clevel[0] = 0;
+ mddev->flags = 0;
+ mddev->ro = 0;
+ mddev->metadata_type[0] = 0;
+ mddev->chunk_sectors = 0;
+ mddev->ctime = mddev->utime = 0;
+ mddev->layout = 0;
+ mddev->max_disks = 0;
+ mddev->events = 0;
+ mddev->can_decrease_events = 0;
+ mddev->delta_disks = 0;
+ mddev->reshape_backwards = 0;
+ mddev->new_level = LEVEL_NONE;
+ mddev->new_layout = 0;
+ mddev->new_chunk_sectors = 0;
+ mddev->curr_resync = 0;
+ atomic64_set(&mddev->resync_mismatches, 0);
+ mddev->suspend_lo = mddev->suspend_hi = 0;
+ mddev->sync_speed_min = mddev->sync_speed_max = 0;
+ mddev->recovery = 0;
+ mddev->in_sync = 0;
+ mddev->changed = 0;
+ mddev->degraded = 0;
+ mddev->safemode = 0;
+ mddev->private = NULL;
+ mddev->merge_check_needed = 0;
+ mddev->bitmap_info.offset = 0;
+ mddev->bitmap_info.default_offset = 0;
+ mddev->bitmap_info.default_space = 0;
+ mddev->bitmap_info.chunksize = 0;
+ mddev->bitmap_info.daemon_sleep = 0;
+ mddev->bitmap_info.max_write_behind = 0;
+}
+
+static void __md_stop_writes(struct mddev *mddev)
+{
+ if (mddev_is_clustered(mddev))
+ md_cluster_ops->metadata_update_start(mddev);
+ set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+ flush_workqueue(md_misc_wq);
+ if (mddev->sync_thread) {
+ set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+ md_reap_sync_thread(mddev);
+ }
+
+ del_timer_sync(&mddev->safemode_timer);
+
+ bitmap_flush(mddev);
+ md_super_wait(mddev);
+
+ if (mddev->ro == 0 &&
+ (!mddev->in_sync || (mddev->flags & MD_UPDATE_SB_FLAGS))) {
+ /* mark array as shutdown cleanly */
+ mddev->in_sync = 1;
+ md_update_sb(mddev, 1);
+ }
+ if (mddev_is_clustered(mddev))
+ md_cluster_ops->metadata_update_finish(mddev);
+}
+
+void md_stop_writes(struct mddev *mddev)
+{
+ mddev_lock_nointr(mddev);
+ __md_stop_writes(mddev);
+ mddev_unlock(mddev);
+}
+EXPORT_SYMBOL_GPL(md_stop_writes);
+
+static void mddev_detach(struct mddev *mddev)
+{
+ struct bitmap *bitmap = mddev->bitmap;
+ /* wait for behind writes to complete */
+ if (bitmap && atomic_read(&bitmap->behind_writes) > 0) {
+ printk(KERN_INFO "md:%s: behind writes in progress - waiting to stop.\n",
+ mdname(mddev));
+ /* need to kick something here to make sure I/O goes? */
+ wait_event(bitmap->behind_wait,
+ atomic_read(&bitmap->behind_writes) == 0);
+ }
+ if (mddev->pers && mddev->pers->quiesce) {
+ mddev->pers->quiesce(mddev, 1);
+ mddev->pers->quiesce(mddev, 0);
+ }
+ md_unregister_thread(&mddev->thread);
+ if (mddev->queue)
+ blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
+}
+
+static void __md_stop(struct mddev *mddev)
+{
+ struct md_personality *pers = mddev->pers;
+ mddev_detach(mddev);
+ spin_lock(&mddev->lock);
+ mddev->ready = 0;
+ mddev->pers = NULL;
+ spin_unlock(&mddev->lock);
+ pers->free(mddev, mddev->private);
+ mddev->private = NULL;
+ if (pers->sync_request && mddev->to_remove == NULL)
+ mddev->to_remove = &md_redundancy_group;
+ module_put(pers->owner);
+ clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+}
+
+void md_stop(struct mddev *mddev)
+{
+ /* stop the array and free an attached data structures.
+ * This is called from dm-raid
+ */
+ __md_stop(mddev);
+ bitmap_destroy(mddev);
+ if (mddev->bio_set)
+ bioset_free(mddev->bio_set);
+}
+
+EXPORT_SYMBOL_GPL(md_stop);
+
+static int md_set_readonly(struct mddev *mddev, struct block_device *bdev)
+{
+ int err = 0;
+ int did_freeze = 0;
+
+ if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) {
+ did_freeze = 1;
+ set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+ md_wakeup_thread(mddev->thread);
+ }
+ if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
+ set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+ if (mddev->sync_thread)
+ /* Thread might be blocked waiting for metadata update
+ * which will now never happen */
+ wake_up_process(mddev->sync_thread->tsk);
+
+ mddev_unlock(mddev);
+ wait_event(resync_wait, !test_bit(MD_RECOVERY_RUNNING,
+ &mddev->recovery));
+ mddev_lock_nointr(mddev);
+
+ mutex_lock(&mddev->open_mutex);
+ if ((mddev->pers && atomic_read(&mddev->openers) > !!bdev) ||
+ mddev->sync_thread ||
+ test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
+ (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags))) {
+ printk("md: %s still in use.\n",mdname(mddev));
+ if (did_freeze) {
+ clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+ md_wakeup_thread(mddev->thread);
+ }
+ err = -EBUSY;
+ goto out;
+ }
+ if (mddev->pers) {
+ __md_stop_writes(mddev);
+
+ err = -ENXIO;
+ if (mddev->ro==1)
+ goto out;
+ mddev->ro = 1;
+ set_disk_ro(mddev->gendisk, 1);
+ clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+ md_wakeup_thread(mddev->thread);
+ sysfs_notify_dirent_safe(mddev->sysfs_state);
+ err = 0;
+ }
+out:
+ mutex_unlock(&mddev->open_mutex);
+ return err;
+}
+
+/* mode:
+ * 0 - completely stop and dis-assemble array
+ * 2 - stop but do not disassemble array
+ */
+static int do_md_stop(struct mddev *mddev, int mode,
+ struct block_device *bdev)
+{
+ struct gendisk *disk = mddev->gendisk;
+ struct md_rdev *rdev;
+ int did_freeze = 0;
+
+ if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) {
+ did_freeze = 1;
+ set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+ md_wakeup_thread(mddev->thread);
+ }
+ if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
+ set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+ if (mddev->sync_thread)
+ /* Thread might be blocked waiting for metadata update
+ * which will now never happen */
+ wake_up_process(mddev->sync_thread->tsk);
+
+ mddev_unlock(mddev);
+ wait_event(resync_wait, (mddev->sync_thread == NULL &&
+ !test_bit(MD_RECOVERY_RUNNING,
+ &mddev->recovery)));
+ mddev_lock_nointr(mddev);
+
+ mutex_lock(&mddev->open_mutex);
+ if ((mddev->pers && atomic_read(&mddev->openers) > !!bdev) ||
+ mddev->sysfs_active ||
+ mddev->sync_thread ||
+ test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
+ (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags))) {
+ printk("md: %s still in use.\n",mdname(mddev));
+ mutex_unlock(&mddev->open_mutex);
+ if (did_freeze) {
+ clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+ md_wakeup_thread(mddev->thread);
+ }
+ return -EBUSY;
+ }
+ if (mddev->pers) {
+ if (mddev->ro)
+ set_disk_ro(disk, 0);
+
+ __md_stop_writes(mddev);
+ __md_stop(mddev);
+ mddev->queue->merge_bvec_fn = NULL;
+ mddev->queue->backing_dev_info.congested_fn = NULL;
+
+ /* tell userspace to handle 'inactive' */
+ sysfs_notify_dirent_safe(mddev->sysfs_state);
+
+ rdev_for_each(rdev, mddev)
+ if (rdev->raid_disk >= 0)
+ sysfs_unlink_rdev(mddev, rdev);
+
+ set_capacity(disk, 0);
+ mutex_unlock(&mddev->open_mutex);
+ mddev->changed = 1;
+ revalidate_disk(disk);
+
+ if (mddev->ro)
+ mddev->ro = 0;
+ } else
+ mutex_unlock(&mddev->open_mutex);
+ /*
+ * Free resources if final stop
+ */
+ if (mode == 0) {
+ printk(KERN_INFO "md: %s stopped.\n", mdname(mddev));
+
+ bitmap_destroy(mddev);
+ if (mddev->bitmap_info.file) {
+ struct file *f = mddev->bitmap_info.file;
+ spin_lock(&mddev->lock);
+ mddev->bitmap_info.file = NULL;
+ spin_unlock(&mddev->lock);
+ fput(f);
+ }
+ mddev->bitmap_info.offset = 0;
+
+ export_array(mddev);
+
+ md_clean(mddev);
+ kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
+ if (mddev->hold_active == UNTIL_STOP)
+ mddev->hold_active = 0;
+ }
+ blk_integrity_unregister(disk);
+ md_new_event(mddev);
+ sysfs_notify_dirent_safe(mddev->sysfs_state);
+ return 0;
+}
+
+#ifndef MODULE
+static void autorun_array(struct mddev *mddev)
+{
+ struct md_rdev *rdev;
+ int err;
+
+ if (list_empty(&mddev->disks))
+ return;
+
+ printk(KERN_INFO "md: running: ");
+
+ rdev_for_each(rdev, mddev) {
+ char b[BDEVNAME_SIZE];
+ printk("<%s>", bdevname(rdev->bdev,b));
+ }
+ printk("\n");
+
+ err = do_md_run(mddev);
+ if (err) {
+ printk(KERN_WARNING "md: do_md_run() returned %d\n", err);
+ do_md_stop(mddev, 0, NULL);
+ }
+}
+
+/*
+ * lets try to run arrays based on all disks that have arrived
+ * until now. (those are in pending_raid_disks)
+ *
+ * the method: pick the first pending disk, collect all disks with
+ * the same UUID, remove all from the pending list and put them into
+ * the 'same_array' list. Then order this list based on superblock
+ * update time (freshest comes first), kick out 'old' disks and
+ * compare superblocks. If everything's fine then run it.
+ *
+ * If "unit" is allocated, then bump its reference count
+ */
+static void autorun_devices(int part)
+{
+ struct md_rdev *rdev0, *rdev, *tmp;
+ struct mddev *mddev;
+ char b[BDEVNAME_SIZE];
+
+ printk(KERN_INFO "md: autorun ...\n");
+ while (!list_empty(&pending_raid_disks)) {
+ int unit;
+ dev_t dev;
+ LIST_HEAD(candidates);
+ rdev0 = list_entry(pending_raid_disks.next,
+ struct md_rdev, same_set);
+
+ printk(KERN_INFO "md: considering %s ...\n",
+ bdevname(rdev0->bdev,b));
+ INIT_LIST_HEAD(&candidates);
+ rdev_for_each_list(rdev, tmp, &pending_raid_disks)
+ if (super_90_load(rdev, rdev0, 0) >= 0) {
+ printk(KERN_INFO "md: adding %s ...\n",
+ bdevname(rdev->bdev,b));
+ list_move(&rdev->same_set, &candidates);
+ }
+ /*
+ * now we have a set of devices, with all of them having
+ * mostly sane superblocks. It's time to allocate the
+ * mddev.
+ */
+ if (part) {
+ dev = MKDEV(mdp_major,
+ rdev0->preferred_minor << MdpMinorShift);
+ unit = MINOR(dev) >> MdpMinorShift;
+ } else {
+ dev = MKDEV(MD_MAJOR, rdev0->preferred_minor);
+ unit = MINOR(dev);
+ }
+ if (rdev0->preferred_minor != unit) {
+ printk(KERN_INFO "md: unit number in %s is bad: %d\n",
+ bdevname(rdev0->bdev, b), rdev0->preferred_minor);
+ break;
+ }
+
+ md_probe(dev, NULL, NULL);
+ mddev = mddev_find(dev);
+ if (!mddev || !mddev->gendisk) {
+ if (mddev)
+ mddev_put(mddev);
+ printk(KERN_ERR
+ "md: cannot allocate memory for md drive.\n");
+ break;
+ }
+ if (mddev_lock(mddev))
+ printk(KERN_WARNING "md: %s locked, cannot run\n",
+ mdname(mddev));
+ else if (mddev->raid_disks || mddev->major_version
+ || !list_empty(&mddev->disks)) {
+ printk(KERN_WARNING
+ "md: %s already running, cannot run %s\n",
+ mdname(mddev), bdevname(rdev0->bdev,b));
+ mddev_unlock(mddev);
+ } else {
+ printk(KERN_INFO "md: created %s\n", mdname(mddev));
+ mddev->persistent = 1;
+ rdev_for_each_list(rdev, tmp, &candidates) {
+ list_del_init(&rdev->same_set);
+ if (bind_rdev_to_array(rdev, mddev))
+ export_rdev(rdev);
+ }
+ autorun_array(mddev);
+ mddev_unlock(mddev);
+ }
+ /* on success, candidates will be empty, on error
+ * it won't...
+ */
+ rdev_for_each_list(rdev, tmp, &candidates) {
+ list_del_init(&rdev->same_set);
+ export_rdev(rdev);
+ }
+ mddev_put(mddev);
+ }
+ printk(KERN_INFO "md: ... autorun DONE.\n");
+}
+#endif /* !MODULE */
+
+static int get_version(void __user *arg)
+{
+ mdu_version_t ver;
+
+ ver.major = MD_MAJOR_VERSION;
+ ver.minor = MD_MINOR_VERSION;
+ ver.patchlevel = MD_PATCHLEVEL_VERSION;
+
+ if (copy_to_user(arg, &ver, sizeof(ver)))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int get_array_info(struct mddev *mddev, void __user *arg)
+{
+ mdu_array_info_t info;
+ int nr,working,insync,failed,spare;
+ struct md_rdev *rdev;
+
+ nr = working = insync = failed = spare = 0;
+ rcu_read_lock();
+ rdev_for_each_rcu(rdev, mddev) {
+ nr++;
+ if (test_bit(Faulty, &rdev->flags))
+ failed++;
+ else {
+ working++;
+ if (test_bit(In_sync, &rdev->flags))
+ insync++;
+ else
+ spare++;
+ }
+ }
+ rcu_read_unlock();
+
+ info.major_version = mddev->major_version;
+ info.minor_version = mddev->minor_version;
+ info.patch_version = MD_PATCHLEVEL_VERSION;
+ info.ctime = mddev->ctime;
+ info.level = mddev->level;
+ info.size = mddev->dev_sectors / 2;
+ if (info.size != mddev->dev_sectors / 2) /* overflow */
+ info.size = -1;
+ info.nr_disks = nr;
+ info.raid_disks = mddev->raid_disks;
+ info.md_minor = mddev->md_minor;
+ info.not_persistent= !mddev->persistent;
+
+ info.utime = mddev->utime;
+ info.state = 0;
+ if (mddev->in_sync)
+ info.state = (1<<MD_SB_CLEAN);
+ if (mddev->bitmap && mddev->bitmap_info.offset)
+ info.state |= (1<<MD_SB_BITMAP_PRESENT);
+ if (mddev_is_clustered(mddev))
+ info.state |= (1<<MD_SB_CLUSTERED);
+ info.active_disks = insync;
+ info.working_disks = working;
+ info.failed_disks = failed;
+ info.spare_disks = spare;
+
+ info.layout = mddev->layout;
+ info.chunk_size = mddev->chunk_sectors << 9;
+
+ if (copy_to_user(arg, &info, sizeof(info)))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int get_bitmap_file(struct mddev *mddev, void __user * arg)
+{
+ mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */
+ char *ptr;
+ int err;
+
+ file = kmalloc(sizeof(*file), GFP_NOIO);
+ if (!file)
+ return -ENOMEM;
+
+ err = 0;
+ spin_lock(&mddev->lock);
+ /* bitmap disabled, zero the first byte and copy out */
+ if (!mddev->bitmap_info.file)
+ file->pathname[0] = '\0';
+ else if ((ptr = d_path(&mddev->bitmap_info.file->f_path,
+ file->pathname, sizeof(file->pathname))),
+ IS_ERR(ptr))
+ err = PTR_ERR(ptr);
+ else
+ memmove(file->pathname, ptr,
+ sizeof(file->pathname)-(ptr-file->pathname));
+ spin_unlock(&mddev->lock);
+
+ if (err == 0 &&
+ copy_to_user(arg, file, sizeof(*file)))
+ err = -EFAULT;
+
+ kfree(file);
+ return err;
+}
+
+static int get_disk_info(struct mddev *mddev, void __user * arg)
+{
+ mdu_disk_info_t info;
+ struct md_rdev *rdev;
+
+ if (copy_from_user(&info, arg, sizeof(info)))
+ return -EFAULT;
+
+ rcu_read_lock();
+ rdev = md_find_rdev_nr_rcu(mddev, info.number);
+ if (rdev) {
+ info.major = MAJOR(rdev->bdev->bd_dev);
+ info.minor = MINOR(rdev->bdev->bd_dev);
+ info.raid_disk = rdev->raid_disk;
+ info.state = 0;
+ if (test_bit(Faulty, &rdev->flags))
+ info.state |= (1<<MD_DISK_FAULTY);
+ else if (test_bit(In_sync, &rdev->flags)) {
+ info.state |= (1<<MD_DISK_ACTIVE);
+ info.state |= (1<<MD_DISK_SYNC);
+ }
+ if (test_bit(WriteMostly, &rdev->flags))
+ info.state |= (1<<MD_DISK_WRITEMOSTLY);
+ } else {
+ info.major = info.minor = 0;
+ info.raid_disk = -1;
+ info.state = (1<<MD_DISK_REMOVED);
+ }
+ rcu_read_unlock();
+
+ if (copy_to_user(arg, &info, sizeof(info)))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info)
+{
+ char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
+ struct md_rdev *rdev;
+ dev_t dev = MKDEV(info->major,info->minor);
+
+ if (mddev_is_clustered(mddev) &&
+ !(info->state & ((1 << MD_DISK_CLUSTER_ADD) | (1 << MD_DISK_CANDIDATE)))) {
+ pr_err("%s: Cannot add to clustered mddev.\n",
+ mdname(mddev));
+ return -EINVAL;
+ }
+
+ if (info->major != MAJOR(dev) || info->minor != MINOR(dev))
+ return -EOVERFLOW;
+
+ if (!mddev->raid_disks) {
+ int err;
+ /* expecting a device which has a superblock */
+ rdev = md_import_device(dev, mddev->major_version, mddev->minor_version);
+ if (IS_ERR(rdev)) {
+ printk(KERN_WARNING
+ "md: md_import_device returned %ld\n",
+ PTR_ERR(rdev));
+ return PTR_ERR(rdev);
+ }
+ if (!list_empty(&mddev->disks)) {
+ struct md_rdev *rdev0
+ = list_entry(mddev->disks.next,
+ struct md_rdev, same_set);
+ err = super_types[mddev->major_version]
+ .load_super(rdev, rdev0, mddev->minor_version);
+ if (err < 0) {
+ printk(KERN_WARNING
+ "md: %s has different UUID to %s\n",
+ bdevname(rdev->bdev,b),
+ bdevname(rdev0->bdev,b2));
+ export_rdev(rdev);
+ return -EINVAL;
+ }
+ }
+ err = bind_rdev_to_array(rdev, mddev);
+ if (err)
+ export_rdev(rdev);
+ return err;
+ }
+
+ /*
+ * add_new_disk can be used once the array is assembled
+ * to add "hot spares". They must already have a superblock
+ * written
+ */
+ if (mddev->pers) {
+ int err;
+ if (!mddev->pers->hot_add_disk) {
+ printk(KERN_WARNING
+ "%s: personality does not support diskops!\n",
+ mdname(mddev));
+ return -EINVAL;
+ }
+ if (mddev->persistent)
+ rdev = md_import_device(dev, mddev->major_version,
+ mddev->minor_version);
+ else
+ rdev = md_import_device(dev, -1, -1);
+ if (IS_ERR(rdev)) {
+ printk(KERN_WARNING
+ "md: md_import_device returned %ld\n",
+ PTR_ERR(rdev));
+ return PTR_ERR(rdev);
+ }
+ /* set saved_raid_disk if appropriate */
+ if (!mddev->persistent) {
+ if (info->state & (1<<MD_DISK_SYNC) &&
+ info->raid_disk < mddev->raid_disks) {
+ rdev->raid_disk = info->raid_disk;
+ set_bit(In_sync, &rdev->flags);
+ clear_bit(Bitmap_sync, &rdev->flags);
+ } else
+ rdev->raid_disk = -1;
+ rdev->saved_raid_disk = rdev->raid_disk;
+ } else
+ super_types[mddev->major_version].
+ validate_super(mddev, rdev);
+ if ((info->state & (1<<MD_DISK_SYNC)) &&
+ rdev->raid_disk != info->raid_disk) {
+ /* This was a hot-add request, but events doesn't
+ * match, so reject it.
+ */
+ export_rdev(rdev);
+ return -EINVAL;
+ }
+
+ clear_bit(In_sync, &rdev->flags); /* just to be sure */
+ if (info->state & (1<<MD_DISK_WRITEMOSTLY))
+ set_bit(WriteMostly, &rdev->flags);
+ else
+ clear_bit(WriteMostly, &rdev->flags);
+
+ /*
+ * check whether the device shows up in other nodes
+ */
+ if (mddev_is_clustered(mddev)) {
+ if (info->state & (1 << MD_DISK_CANDIDATE)) {
+ /* Through --cluster-confirm */
+ set_bit(Candidate, &rdev->flags);
+ err = md_cluster_ops->new_disk_ack(mddev, true);
+ if (err) {
+ export_rdev(rdev);
+ return err;
+ }
+ } else if (info->state & (1 << MD_DISK_CLUSTER_ADD)) {
+ /* --add initiated by this node */
+ err = md_cluster_ops->add_new_disk_start(mddev, rdev);
+ if (err) {
+ md_cluster_ops->add_new_disk_finish(mddev);
+ export_rdev(rdev);
+ return err;
+ }
+ }
+ }
+
+ rdev->raid_disk = -1;
+ err = bind_rdev_to_array(rdev, mddev);
+ if (err)
+ export_rdev(rdev);
+ else
+ err = add_bound_rdev(rdev);
+ if (mddev_is_clustered(mddev) &&
+ (info->state & (1 << MD_DISK_CLUSTER_ADD)))
+ md_cluster_ops->add_new_disk_finish(mddev);
+ return err;
+ }
+
+ /* otherwise, add_new_disk is only allowed
+ * for major_version==0 superblocks
+ */
+ if (mddev->major_version != 0) {
+ printk(KERN_WARNING "%s: ADD_NEW_DISK not supported\n",
+ mdname(mddev));
+ return -EINVAL;
+ }
+
+ if (!(info->state & (1<<MD_DISK_FAULTY))) {
+ int err;
+ rdev = md_import_device(dev, -1, 0);
+ if (IS_ERR(rdev)) {
+ printk(KERN_WARNING
+ "md: error, md_import_device() returned %ld\n",
+ PTR_ERR(rdev));
+ return PTR_ERR(rdev);
+ }
+ rdev->desc_nr = info->number;
+ if (info->raid_disk < mddev->raid_disks)
+ rdev->raid_disk = info->raid_disk;
+ else
+ rdev->raid_disk = -1;
+
+ if (rdev->raid_disk < mddev->raid_disks)
+ if (info->state & (1<<MD_DISK_SYNC))
+ set_bit(In_sync, &rdev->flags);
+
+ if (info->state & (1<<MD_DISK_WRITEMOSTLY))
+ set_bit(WriteMostly, &rdev->flags);
+
+ if (!mddev->persistent) {
+ printk(KERN_INFO "md: nonpersistent superblock ...\n");
+ rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512;
+ } else
+ rdev->sb_start = calc_dev_sboffset(rdev);
+ rdev->sectors = rdev->sb_start;
+
+ err = bind_rdev_to_array(rdev, mddev);
+ if (err) {
+ export_rdev(rdev);
+ return err;
+ }
+ }
+
+ return 0;
+}
+
+static int hot_remove_disk(struct mddev *mddev, dev_t dev)
+{
+ char b[BDEVNAME_SIZE];
+ struct md_rdev *rdev;
+
+ rdev = find_rdev(mddev, dev);
+ if (!rdev)
+ return -ENXIO;
+
+ if (mddev_is_clustered(mddev))
+ md_cluster_ops->metadata_update_start(mddev);
+
+ clear_bit(Blocked, &rdev->flags);
+ remove_and_add_spares(mddev, rdev);
+
+ if (rdev->raid_disk >= 0)
+ goto busy;
+
+ if (mddev_is_clustered(mddev))
+ md_cluster_ops->remove_disk(mddev, rdev);
+
+ md_kick_rdev_from_array(rdev);
+ md_update_sb(mddev, 1);
+ md_new_event(mddev);
+
+ if (mddev_is_clustered(mddev))
+ md_cluster_ops->metadata_update_finish(mddev);
+
+ return 0;
+busy:
+ if (mddev_is_clustered(mddev))
+ md_cluster_ops->metadata_update_cancel(mddev);
+ printk(KERN_WARNING "md: cannot remove active disk %s from %s ...\n",
+ bdevname(rdev->bdev,b), mdname(mddev));
+ return -EBUSY;
+}
+
+static int hot_add_disk(struct mddev *mddev, dev_t dev)
+{
+ char b[BDEVNAME_SIZE];
+ int err;
+ struct md_rdev *rdev;
+
+ if (!mddev->pers)
+ return -ENODEV;
+
+ if (mddev->major_version != 0) {
+ printk(KERN_WARNING "%s: HOT_ADD may only be used with"
+ " version-0 superblocks.\n",
+ mdname(mddev));
+ return -EINVAL;
+ }
+ if (!mddev->pers->hot_add_disk) {
+ printk(KERN_WARNING
+ "%s: personality does not support diskops!\n",
+ mdname(mddev));
+ return -EINVAL;
+ }
+
+ rdev = md_import_device(dev, -1, 0);
+ if (IS_ERR(rdev)) {
+ printk(KERN_WARNING
+ "md: error, md_import_device() returned %ld\n",
+ PTR_ERR(rdev));
+ return -EINVAL;
+ }
+
+ if (mddev->persistent)
+ rdev->sb_start = calc_dev_sboffset(rdev);
+ else
+ rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512;
+
+ rdev->sectors = rdev->sb_start;
+
+ if (test_bit(Faulty, &rdev->flags)) {
+ printk(KERN_WARNING
+ "md: can not hot-add faulty %s disk to %s!\n",
+ bdevname(rdev->bdev,b), mdname(mddev));
+ err = -EINVAL;
+ goto abort_export;
+ }
+
+ if (mddev_is_clustered(mddev))
+ md_cluster_ops->metadata_update_start(mddev);
+ clear_bit(In_sync, &rdev->flags);
+ rdev->desc_nr = -1;
+ rdev->saved_raid_disk = -1;
+ err = bind_rdev_to_array(rdev, mddev);
+ if (err)
+ goto abort_clustered;
+
+ /*
+ * The rest should better be atomic, we can have disk failures
+ * noticed in interrupt contexts ...
+ */
+
+ rdev->raid_disk = -1;
+
+ md_update_sb(mddev, 1);
+
+ if (mddev_is_clustered(mddev))
+ md_cluster_ops->metadata_update_finish(mddev);
+ /*
+ * Kick recovery, maybe this spare has to be added to the
+ * array immediately.
+ */
+ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+ md_wakeup_thread(mddev->thread);
+ md_new_event(mddev);
+ return 0;
+
+abort_clustered:
+ if (mddev_is_clustered(mddev))
+ md_cluster_ops->metadata_update_cancel(mddev);
+abort_export:
+ export_rdev(rdev);
+ return err;
+}
+
+static int set_bitmap_file(struct mddev *mddev, int fd)
+{
+ int err = 0;
+
+ if (mddev->pers) {
+ if (!mddev->pers->quiesce || !mddev->thread)
+ return -EBUSY;
+ if (mddev->recovery || mddev->sync_thread)
+ return -EBUSY;
+ /* we should be able to change the bitmap.. */
+ }
+
+ if (fd >= 0) {
+ struct inode *inode;
+ struct file *f;
+
+ if (mddev->bitmap || mddev->bitmap_info.file)
+ return -EEXIST; /* cannot add when bitmap is present */
+ f = fget(fd);
+
+ if (f == NULL) {
+ printk(KERN_ERR "%s: error: failed to get bitmap file\n",
+ mdname(mddev));
+ return -EBADF;
+ }
+
+ inode = f->f_mapping->host;
+ if (!S_ISREG(inode->i_mode)) {
+ printk(KERN_ERR "%s: error: bitmap file must be a regular file\n",
+ mdname(mddev));
+ err = -EBADF;
+ } else if (!(f->f_mode & FMODE_WRITE)) {
+ printk(KERN_ERR "%s: error: bitmap file must open for write\n",
+ mdname(mddev));
+ err = -EBADF;
+ } else if (atomic_read(&inode->i_writecount) != 1) {
+ printk(KERN_ERR "%s: error: bitmap file is already in use\n",
+ mdname(mddev));
+ err = -EBUSY;
+ }
+ if (err) {
+ fput(f);
+ return err;
+ }
+ mddev->bitmap_info.file = f;
+ mddev->bitmap_info.offset = 0; /* file overrides offset */
+ } else if (mddev->bitmap == NULL)
+ return -ENOENT; /* cannot remove what isn't there */
+ err = 0;
+ if (mddev->pers) {
+ mddev->pers->quiesce(mddev, 1);
+ if (fd >= 0) {
+ struct bitmap *bitmap;
+
+ bitmap = bitmap_create(mddev, -1);
+ if (!IS_ERR(bitmap)) {
+ mddev->bitmap = bitmap;
+ err = bitmap_load(mddev);
+ } else
+ err = PTR_ERR(bitmap);
+ }
+ if (fd < 0 || err) {
+ bitmap_destroy(mddev);
+ fd = -1; /* make sure to put the file */
+ }
+ mddev->pers->quiesce(mddev, 0);
+ }
+ if (fd < 0) {
+ struct file *f = mddev->bitmap_info.file;
+ if (f) {
+ spin_lock(&mddev->lock);
+ mddev->bitmap_info.file = NULL;
+ spin_unlock(&mddev->lock);
+ fput(f);
+ }
+ }
+
+ return err;
+}
+
+/*
+ * set_array_info is used two different ways
+ * The original usage is when creating a new array.
+ * In this usage, raid_disks is > 0 and it together with
+ * level, size, not_persistent,layout,chunksize determine the
+ * shape of the array.
+ * This will always create an array with a type-0.90.0 superblock.
+ * The newer usage is when assembling an array.
+ * In this case raid_disks will be 0, and the major_version field is
+ * use to determine which style super-blocks are to be found on the devices.
+ * The minor and patch _version numbers are also kept incase the
+ * super_block handler wishes to interpret them.
+ */
+static int set_array_info(struct mddev *mddev, mdu_array_info_t *info)
+{
+
+ if (info->raid_disks == 0) {
+ /* just setting version number for superblock loading */
+ if (info->major_version < 0 ||
+ info->major_version >= ARRAY_SIZE(super_types) ||
+ super_types[info->major_version].name == NULL) {
+ /* maybe try to auto-load a module? */
+ printk(KERN_INFO
+ "md: superblock version %d not known\n",
+ info->major_version);
+ return -EINVAL;
+ }
+ mddev->major_version = info->major_version;
+ mddev->minor_version = info->minor_version;
+ mddev->patch_version = info->patch_version;
+ mddev->persistent = !info->not_persistent;
+ /* ensure mddev_put doesn't delete this now that there
+ * is some minimal configuration.
+ */
+ mddev->ctime = get_seconds();
+ return 0;
+ }
+ mddev->major_version = MD_MAJOR_VERSION;
+ mddev->minor_version = MD_MINOR_VERSION;
+ mddev->patch_version = MD_PATCHLEVEL_VERSION;
+ mddev->ctime = get_seconds();
+
+ mddev->level = info->level;
+ mddev->clevel[0] = 0;
+ mddev->dev_sectors = 2 * (sector_t)info->size;
+ mddev->raid_disks = info->raid_disks;
+ /* don't set md_minor, it is determined by which /dev/md* was
+ * openned
+ */
+ if (info->state & (1<<MD_SB_CLEAN))
+ mddev->recovery_cp = MaxSector;
+ else
+ mddev->recovery_cp = 0;
+ mddev->persistent = ! info->not_persistent;
+ mddev->external = 0;
+
+ mddev->layout = info->layout;
+ mddev->chunk_sectors = info->chunk_size >> 9;
+
+ mddev->max_disks = MD_SB_DISKS;
+
+ if (mddev->persistent)
+ mddev->flags = 0;
+ set_bit(MD_CHANGE_DEVS, &mddev->flags);
+
+ mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9;
+ mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9);
+ mddev->bitmap_info.offset = 0;
+
+ mddev->reshape_position = MaxSector;
+
+ /*
+ * Generate a 128 bit UUID
+ */
+ get_random_bytes(mddev->uuid, 16);
+
+ mddev->new_level = mddev->level;
+ mddev->new_chunk_sectors = mddev->chunk_sectors;
+ mddev->new_layout = mddev->layout;
+ mddev->delta_disks = 0;
+ mddev->reshape_backwards = 0;
+
+ return 0;
+}
+
+void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors)
+{
+ WARN(!mddev_is_locked(mddev), "%s: unlocked mddev!\n", __func__);
+
+ if (mddev->external_size)
+ return;
+
+ mddev->array_sectors = array_sectors;
+}
+EXPORT_SYMBOL(md_set_array_sectors);
+
+static int update_size(struct mddev *mddev, sector_t num_sectors)
+{
+ struct md_rdev *rdev;
+ int rv;
+ int fit = (num_sectors == 0);
+
+ if (mddev->pers->resize == NULL)
+ return -EINVAL;
+ /* The "num_sectors" is the number of sectors of each device that
+ * is used. This can only make sense for arrays with redundancy.
+ * linear and raid0 always use whatever space is available. We can only
+ * consider changing this number if no resync or reconstruction is
+ * happening, and if the new size is acceptable. It must fit before the
+ * sb_start or, if that is <data_offset, it must fit before the size
+ * of each device. If num_sectors is zero, we find the largest size
+ * that fits.
+ */
+ if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
+ mddev->sync_thread)
+ return -EBUSY;
+ if (mddev->ro)
+ return -EROFS;
+
+ rdev_for_each(rdev, mddev) {
+ sector_t avail = rdev->sectors;
+
+ if (fit && (num_sectors == 0 || num_sectors > avail))
+ num_sectors = avail;
+ if (avail < num_sectors)
+ return -ENOSPC;
+ }
+ rv = mddev->pers->resize(mddev, num_sectors);
+ if (!rv)
+ revalidate_disk(mddev->gendisk);
+ return rv;
+}
+
+static int update_raid_disks(struct mddev *mddev, int raid_disks)
+{
+ int rv;
+ struct md_rdev *rdev;
+ /* change the number of raid disks */
+ if (mddev->pers->check_reshape == NULL)
+ return -EINVAL;
+ if (mddev->ro)
+ return -EROFS;
+ if (raid_disks <= 0 ||
+ (mddev->max_disks && raid_disks >= mddev->max_disks))
+ return -EINVAL;
+ if (mddev->sync_thread ||
+ test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
+ mddev->reshape_position != MaxSector)
+ return -EBUSY;
+
+ rdev_for_each(rdev, mddev) {
+ if (mddev->raid_disks < raid_disks &&
+ rdev->data_offset < rdev->new_data_offset)
+ return -EINVAL;
+ if (mddev->raid_disks > raid_disks &&
+ rdev->data_offset > rdev->new_data_offset)
+ return -EINVAL;
+ }
+
+ mddev->delta_disks = raid_disks - mddev->raid_disks;
+ if (mddev->delta_disks < 0)
+ mddev->reshape_backwards = 1;
+ else if (mddev->delta_disks > 0)
+ mddev->reshape_backwards = 0;
+
+ rv = mddev->pers->check_reshape(mddev);
+ if (rv < 0) {
+ mddev->delta_disks = 0;
+ mddev->reshape_backwards = 0;
+ }
+ return rv;
+}
+
+/*
+ * update_array_info is used to change the configuration of an
+ * on-line array.
+ * The version, ctime,level,size,raid_disks,not_persistent, layout,chunk_size
+ * fields in the info are checked against the array.
+ * Any differences that cannot be handled will cause an error.
+ * Normally, only one change can be managed at a time.
+ */
+static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
+{
+ int rv = 0;
+ int cnt = 0;
+ int state = 0;
+
+ /* calculate expected state,ignoring low bits */
+ if (mddev->bitmap && mddev->bitmap_info.offset)
+ state |= (1 << MD_SB_BITMAP_PRESENT);
+
+ if (mddev->major_version != info->major_version ||
+ mddev->minor_version != info->minor_version ||
+/* mddev->patch_version != info->patch_version || */
+ mddev->ctime != info->ctime ||
+ mddev->level != info->level ||
+/* mddev->layout != info->layout || */
+ mddev->persistent != !info->not_persistent ||
+ mddev->chunk_sectors != info->chunk_size >> 9 ||
+ /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */
+ ((state^info->state) & 0xfffffe00)
+ )
+ return -EINVAL;
+ /* Check there is only one change */
+ if (info->size >= 0 && mddev->dev_sectors / 2 != info->size)
+ cnt++;
+ if (mddev->raid_disks != info->raid_disks)
+ cnt++;
+ if (mddev->layout != info->layout)
+ cnt++;
+ if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT))
+ cnt++;
+ if (cnt == 0)
+ return 0;
+ if (cnt > 1)
+ return -EINVAL;
+
+ if (mddev->layout != info->layout) {
+ /* Change layout
+ * we don't need to do anything at the md level, the
+ * personality will take care of it all.
+ */
+ if (mddev->pers->check_reshape == NULL)
+ return -EINVAL;
+ else {
+ mddev->new_layout = info->layout;
+ rv = mddev->pers->check_reshape(mddev);
+ if (rv)
+ mddev->new_layout = mddev->layout;
+ return rv;
+ }
+ }
+ if (mddev_is_clustered(mddev))
+ md_cluster_ops->metadata_update_start(mddev);
+ if (info->size >= 0 && mddev->dev_sectors / 2 != info->size)
+ rv = update_size(mddev, (sector_t)info->size * 2);
+
+ if (mddev->raid_disks != info->raid_disks)
+ rv = update_raid_disks(mddev, info->raid_disks);
+
+ if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) {
+ if (mddev->pers->quiesce == NULL || mddev->thread == NULL) {
+ rv = -EINVAL;
+ goto err;
+ }
+ if (mddev->recovery || mddev->sync_thread) {
+ rv = -EBUSY;
+ goto err;
+ }
+ if (info->state & (1<<MD_SB_BITMAP_PRESENT)) {
+ struct bitmap *bitmap;
+ /* add the bitmap */
+ if (mddev->bitmap) {
+ rv = -EEXIST;
+ goto err;
+ }
+ if (mddev->bitmap_info.default_offset == 0) {
+ rv = -EINVAL;
+ goto err;
+ }
+ mddev->bitmap_info.offset =
+ mddev->bitmap_info.default_offset;
+ mddev->bitmap_info.space =
+ mddev->bitmap_info.default_space;
+ mddev->pers->quiesce(mddev, 1);
+ bitmap = bitmap_create(mddev, -1);
+ if (!IS_ERR(bitmap)) {
+ mddev->bitmap = bitmap;
+ rv = bitmap_load(mddev);
+ } else
+ rv = PTR_ERR(bitmap);
+ if (rv)
+ bitmap_destroy(mddev);
+ mddev->pers->quiesce(mddev, 0);
+ } else {
+ /* remove the bitmap */
+ if (!mddev->bitmap) {
+ rv = -ENOENT;
+ goto err;
+ }
+ if (mddev->bitmap->storage.file) {
+ rv = -EINVAL;
+ goto err;
+ }
+ mddev->pers->quiesce(mddev, 1);
+ bitmap_destroy(mddev);
+ mddev->pers->quiesce(mddev, 0);
+ mddev->bitmap_info.offset = 0;
+ }
+ }
+ md_update_sb(mddev, 1);
+ if (mddev_is_clustered(mddev))
+ md_cluster_ops->metadata_update_finish(mddev);
+ return rv;
+err:
+ if (mddev_is_clustered(mddev))
+ md_cluster_ops->metadata_update_cancel(mddev);
+ return rv;
+}
+
+static int set_disk_faulty(struct mddev *mddev, dev_t dev)
+{
+ struct md_rdev *rdev;
+ int err = 0;
+
+ if (mddev->pers == NULL)
+ return -ENODEV;
+
+ rcu_read_lock();
+ rdev = find_rdev_rcu(mddev, dev);
+ if (!rdev)
+ err = -ENODEV;
+ else {
+ md_error(mddev, rdev);
+ if (!test_bit(Faulty, &rdev->flags))
+ err = -EBUSY;
+ }
+ rcu_read_unlock();
+ return err;
+}
+
+/*
+ * We have a problem here : there is no easy way to give a CHS
+ * virtual geometry. We currently pretend that we have a 2 heads
+ * 4 sectors (with a BIG number of cylinders...). This drives
+ * dosfs just mad... ;-)
+ */
+static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+{
+ struct mddev *mddev = bdev->bd_disk->private_data;
+
+ geo->heads = 2;
+ geo->sectors = 4;
+ geo->cylinders = mddev->array_sectors / 8;
+ return 0;
+}
+
+static inline bool md_ioctl_valid(unsigned int cmd)
+{
+ switch (cmd) {
+ case ADD_NEW_DISK:
+ case BLKROSET:
+ case GET_ARRAY_INFO:
+ case GET_BITMAP_FILE:
+ case GET_DISK_INFO:
+ case HOT_ADD_DISK:
+ case HOT_REMOVE_DISK:
+ case RAID_AUTORUN:
+ case RAID_VERSION:
+ case RESTART_ARRAY_RW:
+ case RUN_ARRAY:
+ case SET_ARRAY_INFO:
+ case SET_BITMAP_FILE:
+ case SET_DISK_FAULTY:
+ case STOP_ARRAY:
+ case STOP_ARRAY_RO:
+ case CLUSTERED_DISK_NACK:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static int md_ioctl(struct block_device *bdev, fmode_t mode,
+ unsigned int cmd, unsigned long arg)
+{
+ int err = 0;
+ void __user *argp = (void __user *)arg;
+ struct mddev *mddev = NULL;
+ int ro;
+
+ if (!md_ioctl_valid(cmd))
+ return -ENOTTY;
+
+ switch (cmd) {
+ case RAID_VERSION:
+ case GET_ARRAY_INFO:
+ case GET_DISK_INFO:
+ break;
+ default:
+ if (!capable(CAP_SYS_ADMIN))
+ return -EACCES;
+ }
+
+ /*
+ * Commands dealing with the RAID driver but not any
+ * particular array:
+ */
+ switch (cmd) {
+ case RAID_VERSION:
+ err = get_version(argp);
+ goto out;
+
+#ifndef MODULE
+ case RAID_AUTORUN:
+ err = 0;
+ autostart_arrays(arg);
+ goto out;
+#endif
+ default:;
+ }
+
+ /*
+ * Commands creating/starting a new array:
+ */
+
+ mddev = bdev->bd_disk->private_data;
+
+ if (!mddev) {
+ BUG();
+ goto out;
+ }
+
+ /* Some actions do not requires the mutex */
+ switch (cmd) {
+ case GET_ARRAY_INFO:
+ if (!mddev->raid_disks && !mddev->external)
+ err = -ENODEV;
+ else
+ err = get_array_info(mddev, argp);
+ goto out;
+
+ case GET_DISK_INFO:
+ if (!mddev->raid_disks && !mddev->external)
+ err = -ENODEV;
+ else
+ err = get_disk_info(mddev, argp);
+ goto out;
+
+ case SET_DISK_FAULTY:
+ err = set_disk_faulty(mddev, new_decode_dev(arg));
+ goto out;
+
+ case GET_BITMAP_FILE:
+ err = get_bitmap_file(mddev, argp);
+ goto out;
+
+ }
+
+ if (cmd == ADD_NEW_DISK)
+ /* need to ensure md_delayed_delete() has completed */
+ flush_workqueue(md_misc_wq);
+
+ if (cmd == HOT_REMOVE_DISK)
+ /* need to ensure recovery thread has run */
+ wait_event_interruptible_timeout(mddev->sb_wait,
+ !test_bit(MD_RECOVERY_NEEDED,
+ &mddev->flags),
+ msecs_to_jiffies(5000));
+ if (cmd == STOP_ARRAY || cmd == STOP_ARRAY_RO) {
+ /* Need to flush page cache, and ensure no-one else opens
+ * and writes
+ */
+ mutex_lock(&mddev->open_mutex);
+ if (mddev->pers && atomic_read(&mddev->openers) > 1) {
+ mutex_unlock(&mddev->open_mutex);
+ err = -EBUSY;
+ goto out;
+ }
+ set_bit(MD_STILL_CLOSED, &mddev->flags);
+ mutex_unlock(&mddev->open_mutex);
+ sync_blockdev(bdev);
+ }
+ err = mddev_lock(mddev);
+ if (err) {
+ printk(KERN_INFO
+ "md: ioctl lock interrupted, reason %d, cmd %d\n",
+ err, cmd);
+ goto out;
+ }
+
+ if (cmd == SET_ARRAY_INFO) {
+ mdu_array_info_t info;
+ if (!arg)
+ memset(&info, 0, sizeof(info));
+ else if (copy_from_user(&info, argp, sizeof(info))) {
+ err = -EFAULT;
+ goto unlock;
+ }
+ if (mddev->pers) {
+ err = update_array_info(mddev, &info);
+ if (err) {
+ printk(KERN_WARNING "md: couldn't update"
+ " array info. %d\n", err);
+ goto unlock;
+ }
+ goto unlock;
+ }
+ if (!list_empty(&mddev->disks)) {
+ printk(KERN_WARNING
+ "md: array %s already has disks!\n",
+ mdname(mddev));
+ err = -EBUSY;
+ goto unlock;
+ }
+ if (mddev->raid_disks) {
+ printk(KERN_WARNING
+ "md: array %s already initialised!\n",
+ mdname(mddev));
+ err = -EBUSY;
+ goto unlock;
+ }
+ err = set_array_info(mddev, &info);
+ if (err) {
+ printk(KERN_WARNING "md: couldn't set"
+ " array info. %d\n", err);
+ goto unlock;
+ }
+ goto unlock;
+ }
+
+ /*
+ * Commands querying/configuring an existing array:
+ */
+ /* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY,
+ * RUN_ARRAY, and GET_ and SET_BITMAP_FILE are allowed */
+ if ((!mddev->raid_disks && !mddev->external)
+ && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY
+ && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE
+ && cmd != GET_BITMAP_FILE) {
+ err = -ENODEV;
+ goto unlock;
+ }
+
+ /*
+ * Commands even a read-only array can execute:
+ */
+ switch (cmd) {
+ case RESTART_ARRAY_RW:
+ err = restart_array(mddev);
+ goto unlock;
+
+ case STOP_ARRAY:
+ err = do_md_stop(mddev, 0, bdev);
+ goto unlock;
+
+ case STOP_ARRAY_RO:
+ err = md_set_readonly(mddev, bdev);
+ goto unlock;
+
+ case HOT_REMOVE_DISK:
+ err = hot_remove_disk(mddev, new_decode_dev(arg));
+ goto unlock;
+
+ case ADD_NEW_DISK:
+ /* We can support ADD_NEW_DISK on read-only arrays
+ * on if we are re-adding a preexisting device.
+ * So require mddev->pers and MD_DISK_SYNC.
+ */
+ if (mddev->pers) {
+ mdu_disk_info_t info;
+ if (copy_from_user(&info, argp, sizeof(info)))
+ err = -EFAULT;
+ else if (!(info.state & (1<<MD_DISK_SYNC)))
+ /* Need to clear read-only for this */
+ break;
+ else
+ err = add_new_disk(mddev, &info);
+ goto unlock;
+ }
+ break;
+
+ case BLKROSET:
+ if (get_user(ro, (int __user *)(arg))) {
+ err = -EFAULT;
+ goto unlock;
+ }
+ err = -EINVAL;
+
+ /* if the bdev is going readonly the value of mddev->ro
+ * does not matter, no writes are coming
+ */
+ if (ro)
+ goto unlock;
+
+ /* are we are already prepared for writes? */
+ if (mddev->ro != 1)
+ goto unlock;
+
+ /* transitioning to readauto need only happen for
+ * arrays that call md_write_start
+ */
+ if (mddev->pers) {
+ err = restart_array(mddev);
+ if (err == 0) {
+ mddev->ro = 2;
+ set_disk_ro(mddev->gendisk, 0);
+ }
+ }
+ goto unlock;
+ }
+
+ /*
+ * The remaining ioctls are changing the state of the
+ * superblock, so we do not allow them on read-only arrays.
+ */
+ if (mddev->ro && mddev->pers) {
+ if (mddev->ro == 2) {
+ mddev->ro = 0;
+ sysfs_notify_dirent_safe(mddev->sysfs_state);
+ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+ /* mddev_unlock will wake thread */
+ /* If a device failed while we were read-only, we
+ * need to make sure the metadata is updated now.
+ */
+ if (test_bit(MD_CHANGE_DEVS, &mddev->flags)) {
+ mddev_unlock(mddev);
+ wait_event(mddev->sb_wait,
+ !test_bit(MD_CHANGE_DEVS, &mddev->flags) &&
+ !test_bit(MD_CHANGE_PENDING, &mddev->flags));
+ mddev_lock_nointr(mddev);
+ }
+ } else {
+ err = -EROFS;
+ goto unlock;
+ }
+ }
+
+ switch (cmd) {
+ case ADD_NEW_DISK:
+ {
+ mdu_disk_info_t info;
+ if (copy_from_user(&info, argp, sizeof(info)))
+ err = -EFAULT;
+ else
+ err = add_new_disk(mddev, &info);
+ goto unlock;
+ }
+
+ case CLUSTERED_DISK_NACK:
+ if (mddev_is_clustered(mddev))
+ md_cluster_ops->new_disk_ack(mddev, false);
+ else
+ err = -EINVAL;
+ goto unlock;
+
+ case HOT_ADD_DISK:
+ err = hot_add_disk(mddev, new_decode_dev(arg));
+ goto unlock;
+
+ case RUN_ARRAY:
+ err = do_md_run(mddev);
+ goto unlock;
+
+ case SET_BITMAP_FILE:
+ err = set_bitmap_file(mddev, (int)arg);
+ goto unlock;
+
+ default:
+ err = -EINVAL;
+ goto unlock;
+ }
+
+unlock:
+ if (mddev->hold_active == UNTIL_IOCTL &&
+ err != -EINVAL)
+ mddev->hold_active = 0;
+ mddev_unlock(mddev);
+out:
+ return err;
+}
+#ifdef CONFIG_COMPAT
+static int md_compat_ioctl(struct block_device *bdev, fmode_t mode,
+ unsigned int cmd, unsigned long arg)
+{
+ switch (cmd) {
+ case HOT_REMOVE_DISK:
+ case HOT_ADD_DISK:
+ case SET_DISK_FAULTY:
+ case SET_BITMAP_FILE:
+ /* These take in integer arg, do not convert */
+ break;
+ default:
+ arg = (unsigned long)compat_ptr(arg);
+ break;
+ }
+
+ return md_ioctl(bdev, mode, cmd, arg);
+}
+#endif /* CONFIG_COMPAT */
+
+static int md_open(struct block_device *bdev, fmode_t mode)
+{
+ /*
+ * Succeed if we can lock the mddev, which confirms that
+ * it isn't being stopped right now.
+ */
+ struct mddev *mddev = mddev_find(bdev->bd_dev);
+ int err;
+
+ if (!mddev)
+ return -ENODEV;
+
+ if (mddev->gendisk != bdev->bd_disk) {
+ /* we are racing with mddev_put which is discarding this
+ * bd_disk.
+ */
+ mddev_put(mddev);
+ /* Wait until bdev->bd_disk is definitely gone */
+ flush_workqueue(md_misc_wq);
+ /* Then retry the open from the top */
+ return -ERESTARTSYS;
+ }
+ BUG_ON(mddev != bdev->bd_disk->private_data);
+
+ if ((err = mutex_lock_interruptible(&mddev->open_mutex)))
+ goto out;
+
+ err = 0;
+ atomic_inc(&mddev->openers);
+ clear_bit(MD_STILL_CLOSED, &mddev->flags);
+ mutex_unlock(&mddev->open_mutex);
+
+ check_disk_change(bdev);
+ out:
+ return err;
+}
+
+static void md_release(struct gendisk *disk, fmode_t mode)
+{
+ struct mddev *mddev = disk->private_data;
+
+ BUG_ON(!mddev);
+ atomic_dec(&mddev->openers);
+ mddev_put(mddev);
+}
+
+static int md_media_changed(struct gendisk *disk)
+{
+ struct mddev *mddev = disk->private_data;
+
+ return mddev->changed;
+}
+
+static int md_revalidate(struct gendisk *disk)
+{
+ struct mddev *mddev = disk->private_data;
+
+ mddev->changed = 0;
+ return 0;
+}
+static const struct block_device_operations md_fops =
+{
+ .owner = THIS_MODULE,
+ .open = md_open,
+ .release = md_release,
+ .ioctl = md_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = md_compat_ioctl,
+#endif
+ .getgeo = md_getgeo,
+ .media_changed = md_media_changed,
+ .revalidate_disk= md_revalidate,
+};
+
+static int md_thread(void *arg)
+{
+ struct md_thread *thread = arg;
+
+ /*
+ * md_thread is a 'system-thread', it's priority should be very
+ * high. We avoid resource deadlocks individually in each
+ * raid personality. (RAID5 does preallocation) We also use RR and
+ * the very same RT priority as kswapd, thus we will never get
+ * into a priority inversion deadlock.
+ *
+ * we definitely have to have equal or higher priority than
+ * bdflush, otherwise bdflush will deadlock if there are too
+ * many dirty RAID5 blocks.
+ */
+
+ allow_signal(SIGKILL);
+ while (!kthread_should_stop()) {
+
+ /* We need to wait INTERRUPTIBLE so that
+ * we don't add to the load-average.
+ * That means we need to be sure no signals are
+ * pending
+ */
+ if (signal_pending(current))
+ flush_signals(current);
+
+ wait_event_interruptible_timeout
+ (thread->wqueue,
+ test_bit(THREAD_WAKEUP, &thread->flags)
+ || kthread_should_stop(),
+ thread->timeout);
+
+ clear_bit(THREAD_WAKEUP, &thread->flags);
+ if (!kthread_should_stop())
+ thread->run(thread);
+ }
+
+ return 0;
+}
+
+void md_wakeup_thread(struct md_thread *thread)
+{
+ if (thread) {
+ pr_debug("md: waking up MD thread %s.\n", thread->tsk->comm);
+ set_bit(THREAD_WAKEUP, &thread->flags);
+ wake_up(&thread->wqueue);
+ }
+}
+EXPORT_SYMBOL(md_wakeup_thread);
+
+struct md_thread *md_register_thread(void (*run) (struct md_thread *),
+ struct mddev *mddev, const char *name)
+{
+ struct md_thread *thread;
+
+ thread = kzalloc(sizeof(struct md_thread), GFP_KERNEL);
+ if (!thread)
+ return NULL;
+
+ init_waitqueue_head(&thread->wqueue);
+
+ thread->run = run;
+ thread->mddev = mddev;
+ thread->timeout = MAX_SCHEDULE_TIMEOUT;
+ thread->tsk = kthread_run(md_thread, thread,
+ "%s_%s",
+ mdname(thread->mddev),
+ name);
+ if (IS_ERR(thread->tsk)) {
+ kfree(thread);
+ return NULL;
+ }
+ return thread;
+}
+EXPORT_SYMBOL(md_register_thread);
+
+void md_unregister_thread(struct md_thread **threadp)
+{
+ struct md_thread *thread = *threadp;
+ if (!thread)
+ return;
+ pr_debug("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk));
+ /* Locking ensures that mddev_unlock does not wake_up a
+ * non-existent thread
+ */
+ spin_lock(&pers_lock);
+ *threadp = NULL;
+ spin_unlock(&pers_lock);
+
+ kthread_stop(thread->tsk);
+ kfree(thread);
+}
+EXPORT_SYMBOL(md_unregister_thread);
+
+void md_error(struct mddev *mddev, struct md_rdev *rdev)
+{
+ if (!rdev || test_bit(Faulty, &rdev->flags))
+ return;
+
+ if (!mddev->pers || !mddev->pers->error_handler)
+ return;
+ mddev->pers->error_handler(mddev,rdev);
+ if (mddev->degraded)
+ set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
+ sysfs_notify_dirent_safe(rdev->sysfs_state);
+ set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+ md_wakeup_thread(mddev->thread);
+ if (mddev->event_work.func)
+ queue_work(md_misc_wq, &mddev->event_work);
+ md_new_event_inintr(mddev);
+}
+EXPORT_SYMBOL(md_error);
+
+/* seq_file implementation /proc/mdstat */
+
+static void status_unused(struct seq_file *seq)
+{
+ int i = 0;
+ struct md_rdev *rdev;
+
+ seq_printf(seq, "unused devices: ");
+
+ list_for_each_entry(rdev, &pending_raid_disks, same_set) {
+ char b[BDEVNAME_SIZE];
+ i++;
+ seq_printf(seq, "%s ",
+ bdevname(rdev->bdev,b));
+ }
+ if (!i)
+ seq_printf(seq, "<none>");
+
+ seq_printf(seq, "\n");
+}
+
+static void status_resync(struct seq_file *seq, struct mddev *mddev)
+{
+ sector_t max_sectors, resync, res;
+ unsigned long dt, db;
+ sector_t rt;
+ int scale;
+ unsigned int per_milli;
+
+ if (mddev->curr_resync <= 3)
+ resync = 0;
+ else
+ resync = mddev->curr_resync
+ - atomic_read(&mddev->recovery_active);
+
+ if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
+ test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
+ max_sectors = mddev->resync_max_sectors;
+ else
+ max_sectors = mddev->dev_sectors;
+
+ WARN_ON(max_sectors == 0);
+ /* Pick 'scale' such that (resync>>scale)*1000 will fit
+ * in a sector_t, and (max_sectors>>scale) will fit in a
+ * u32, as those are the requirements for sector_div.
+ * Thus 'scale' must be at least 10
+ */
+ scale = 10;
+ if (sizeof(sector_t) > sizeof(unsigned long)) {
+ while ( max_sectors/2 > (1ULL<<(scale+32)))
+ scale++;
+ }
+ res = (resync>>scale)*1000;
+ sector_div(res, (u32)((max_sectors>>scale)+1));
+
+ per_milli = res;
+ {
+ int i, x = per_milli/50, y = 20-x;
+ seq_printf(seq, "[");
+ for (i = 0; i < x; i++)
+ seq_printf(seq, "=");
+ seq_printf(seq, ">");
+ for (i = 0; i < y; i++)
+ seq_printf(seq, ".");
+ seq_printf(seq, "] ");
+ }
+ seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)",
+ (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)?
+ "reshape" :
+ (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)?
+ "check" :
+ (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ?
+ "resync" : "recovery"))),
+ per_milli/10, per_milli % 10,
+ (unsigned long long) resync/2,
+ (unsigned long long) max_sectors/2);
+
+ /*
+ * dt: time from mark until now
+ * db: blocks written from mark until now
+ * rt: remaining time
+ *
+ * rt is a sector_t, so could be 32bit or 64bit.
+ * So we divide before multiply in case it is 32bit and close
+ * to the limit.
+ * We scale the divisor (db) by 32 to avoid losing precision
+ * near the end of resync when the number of remaining sectors
+ * is close to 'db'.
+ * We then divide rt by 32 after multiplying by db to compensate.
+ * The '+1' avoids division by zero if db is very small.
+ */
+ dt = ((jiffies - mddev->resync_mark) / HZ);
+ if (!dt) dt++;
+ db = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active))
+ - mddev->resync_mark_cnt;
+
+ rt = max_sectors - resync; /* number of remaining sectors */
+ sector_div(rt, db/32+1);
+ rt *= dt;
+ rt >>= 5;
+
+ seq_printf(seq, " finish=%lu.%lumin", (unsigned long)rt / 60,
+ ((unsigned long)rt % 60)/6);
+
+ seq_printf(seq, " speed=%ldK/sec", db/2/dt);
+}
+
+static void *md_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ struct list_head *tmp;
+ loff_t l = *pos;
+ struct mddev *mddev;
+
+ if (l >= 0x10000)
+ return NULL;
+ if (!l--)
+ /* header */
+ return (void*)1;
+
+ spin_lock(&all_mddevs_lock);
+ list_for_each(tmp,&all_mddevs)
+ if (!l--) {
+ mddev = list_entry(tmp, struct mddev, all_mddevs);
+ mddev_get(mddev);
+ spin_unlock(&all_mddevs_lock);
+ return mddev;
+ }
+ spin_unlock(&all_mddevs_lock);
+ if (!l--)
+ return (void*)2;/* tail */
+ return NULL;
+}
+
+static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct list_head *tmp;
+ struct mddev *next_mddev, *mddev = v;
+
+ ++*pos;
+ if (v == (void*)2)
+ return NULL;
+
+ spin_lock(&all_mddevs_lock);
+ if (v == (void*)1)
+ tmp = all_mddevs.next;
+ else
+ tmp = mddev->all_mddevs.next;
+ if (tmp != &all_mddevs)
+ next_mddev = mddev_get(list_entry(tmp,struct mddev,all_mddevs));
+ else {
+ next_mddev = (void*)2;
+ *pos = 0x10000;
+ }
+ spin_unlock(&all_mddevs_lock);
+
+ if (v != (void*)1)
+ mddev_put(mddev);
+ return next_mddev;
+
+}
+
+static void md_seq_stop(struct seq_file *seq, void *v)
+{
+ struct mddev *mddev = v;
+
+ if (mddev && v != (void*)1 && v != (void*)2)
+ mddev_put(mddev);
+}
+
+static int md_seq_show(struct seq_file *seq, void *v)
+{
+ struct mddev *mddev = v;
+ sector_t sectors;
+ struct md_rdev *rdev;
+
+ if (v == (void*)1) {
+ struct md_personality *pers;
+ seq_printf(seq, "Personalities : ");
+ spin_lock(&pers_lock);
+ list_for_each_entry(pers, &pers_list, list)
+ seq_printf(seq, "[%s] ", pers->name);
+
+ spin_unlock(&pers_lock);
+ seq_printf(seq, "\n");
+ seq->poll_event = atomic_read(&md_event_count);
+ return 0;
+ }
+ if (v == (void*)2) {
+ status_unused(seq);
+ return 0;
+ }
+
+ spin_lock(&mddev->lock);
+ if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) {
+ seq_printf(seq, "%s : %sactive", mdname(mddev),
+ mddev->pers ? "" : "in");
+ if (mddev->pers) {
+ if (mddev->ro==1)
+ seq_printf(seq, " (read-only)");
+ if (mddev->ro==2)
+ seq_printf(seq, " (auto-read-only)");
+ seq_printf(seq, " %s", mddev->pers->name);
+ }
+
+ sectors = 0;
+ rcu_read_lock();
+ rdev_for_each_rcu(rdev, mddev) {
+ char b[BDEVNAME_SIZE];
+ seq_printf(seq, " %s[%d]",
+ bdevname(rdev->bdev,b), rdev->desc_nr);
+ if (test_bit(WriteMostly, &rdev->flags))
+ seq_printf(seq, "(W)");
+ if (test_bit(Faulty, &rdev->flags)) {
+ seq_printf(seq, "(F)");
+ continue;
+ }
+ if (rdev->raid_disk < 0)
+ seq_printf(seq, "(S)"); /* spare */
+ if (test_bit(Replacement, &rdev->flags))
+ seq_printf(seq, "(R)");
+ sectors += rdev->sectors;
+ }
+ rcu_read_unlock();
+
+ if (!list_empty(&mddev->disks)) {
+ if (mddev->pers)
+ seq_printf(seq, "\n %llu blocks",
+ (unsigned long long)
+ mddev->array_sectors / 2);
+ else
+ seq_printf(seq, "\n %llu blocks",
+ (unsigned long long)sectors / 2);
+ }
+ if (mddev->persistent) {
+ if (mddev->major_version != 0 ||
+ mddev->minor_version != 90) {
+ seq_printf(seq," super %d.%d",
+ mddev->major_version,
+ mddev->minor_version);
+ }
+ } else if (mddev->external)
+ seq_printf(seq, " super external:%s",
+ mddev->metadata_type);
+ else
+ seq_printf(seq, " super non-persistent");
+
+ if (mddev->pers) {
+ mddev->pers->status(seq, mddev);
+ seq_printf(seq, "\n ");
+ if (mddev->pers->sync_request) {
+ if (mddev->curr_resync > 2) {
+ status_resync(seq, mddev);
+ seq_printf(seq, "\n ");
+ } else if (mddev->curr_resync >= 1)
+ seq_printf(seq, "\tresync=DELAYED\n ");
+ else if (mddev->recovery_cp < MaxSector)
+ seq_printf(seq, "\tresync=PENDING\n ");
+ }
+ } else
+ seq_printf(seq, "\n ");
+
+ bitmap_status(seq, mddev->bitmap);
+
+ seq_printf(seq, "\n");
+ }
+ spin_unlock(&mddev->lock);
+
+ return 0;
+}
+
+static const struct seq_operations md_seq_ops = {
+ .start = md_seq_start,
+ .next = md_seq_next,
+ .stop = md_seq_stop,
+ .show = md_seq_show,
+};
+
+static int md_seq_open(struct inode *inode, struct file *file)
+{
+ struct seq_file *seq;
+ int error;
+
+ error = seq_open(file, &md_seq_ops);
+ if (error)
+ return error;
+
+ seq = file->private_data;
+ seq->poll_event = atomic_read(&md_event_count);
+ return error;
+}
+
+static int md_unloading;
+static unsigned int mdstat_poll(struct file *filp, poll_table *wait)
+{
+ struct seq_file *seq = filp->private_data;
+ int mask;
+
+ if (md_unloading)
+ return POLLIN|POLLRDNORM|POLLERR|POLLPRI;
+ poll_wait(filp, &md_event_waiters, wait);
+
+ /* always allow read */
+ mask = POLLIN | POLLRDNORM;
+
+ if (seq->poll_event != atomic_read(&md_event_count))
+ mask |= POLLERR | POLLPRI;
+ return mask;
+}
+
+static const struct file_operations md_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = md_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_private,
+ .poll = mdstat_poll,
+};
+
+int register_md_personality(struct md_personality *p)
+{
+ printk(KERN_INFO "md: %s personality registered for level %d\n",
+ p->name, p->level);
+ spin_lock(&pers_lock);
+ list_add_tail(&p->list, &pers_list);
+ spin_unlock(&pers_lock);
+ return 0;
+}
+EXPORT_SYMBOL(register_md_personality);
+
+int unregister_md_personality(struct md_personality *p)
+{
+ printk(KERN_INFO "md: %s personality unregistered\n", p->name);
+ spin_lock(&pers_lock);
+ list_del_init(&p->list);
+ spin_unlock(&pers_lock);
+ return 0;
+}
+EXPORT_SYMBOL(unregister_md_personality);
+
+int register_md_cluster_operations(struct md_cluster_operations *ops, struct module *module)
+{
+ if (md_cluster_ops != NULL)
+ return -EALREADY;
+ spin_lock(&pers_lock);
+ md_cluster_ops = ops;
+ md_cluster_mod = module;
+ spin_unlock(&pers_lock);
+ return 0;
+}
+EXPORT_SYMBOL(register_md_cluster_operations);
+
+int unregister_md_cluster_operations(void)
+{
+ spin_lock(&pers_lock);
+ md_cluster_ops = NULL;
+ spin_unlock(&pers_lock);
+ return 0;
+}
+EXPORT_SYMBOL(unregister_md_cluster_operations);
+
+int md_setup_cluster(struct mddev *mddev, int nodes)
+{
+ int err;
+
+ err = request_module("md-cluster");
+ if (err) {
+ pr_err("md-cluster module not found.\n");
+ return -ENOENT;
+ }
+
+ spin_lock(&pers_lock);
+ if (!md_cluster_ops || !try_module_get(md_cluster_mod)) {
+ spin_unlock(&pers_lock);
+ return -ENOENT;
+ }
+ spin_unlock(&pers_lock);
+
+ return md_cluster_ops->join(mddev, nodes);
+}
+
+void md_cluster_stop(struct mddev *mddev)
+{
+ if (!md_cluster_ops)
+ return;
+ md_cluster_ops->leave(mddev);
+ module_put(md_cluster_mod);
+}
+
+static int is_mddev_idle(struct mddev *mddev, int init)
+{
+ struct md_rdev *rdev;
+ int idle;
+ int curr_events;
+
+ idle = 1;
+ rcu_read_lock();
+ rdev_for_each_rcu(rdev, mddev) {
+ struct gendisk *disk = rdev->bdev->bd_contains->bd_disk;
+ curr_events = (int)part_stat_read(&disk->part0, sectors[0]) +
+ (int)part_stat_read(&disk->part0, sectors[1]) -
+ atomic_read(&disk->sync_io);
+ /* sync IO will cause sync_io to increase before the disk_stats
+ * as sync_io is counted when a request starts, and
+ * disk_stats is counted when it completes.
+ * So resync activity will cause curr_events to be smaller than
+ * when there was no such activity.
+ * non-sync IO will cause disk_stat to increase without
+ * increasing sync_io so curr_events will (eventually)
+ * be larger than it was before. Once it becomes
+ * substantially larger, the test below will cause
+ * the array to appear non-idle, and resync will slow
+ * down.
+ * If there is a lot of outstanding resync activity when
+ * we set last_event to curr_events, then all that activity
+ * completing might cause the array to appear non-idle
+ * and resync will be slowed down even though there might
+ * not have been non-resync activity. This will only
+ * happen once though. 'last_events' will soon reflect
+ * the state where there is little or no outstanding
+ * resync requests, and further resync activity will
+ * always make curr_events less than last_events.
+ *
+ */
+ if (init || curr_events - rdev->last_events > 64) {
+ rdev->last_events = curr_events;
+ idle = 0;
+ }
+ }
+ rcu_read_unlock();
+ return idle;
+}
+
+void md_done_sync(struct mddev *mddev, int blocks, int ok)
+{
+ /* another "blocks" (512byte) blocks have been synced */
+ atomic_sub(blocks, &mddev->recovery_active);
+ wake_up(&mddev->recovery_wait);
+ if (!ok) {
+ set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+ set_bit(MD_RECOVERY_ERROR, &mddev->recovery);
+ md_wakeup_thread(mddev->thread);
+ // stop recovery, signal do_sync ....
+ }
+}
+EXPORT_SYMBOL(md_done_sync);
+
+/* md_write_start(mddev, bi)
+ * If we need to update some array metadata (e.g. 'active' flag
+ * in superblock) before writing, schedule a superblock update
+ * and wait for it to complete.
+ */
+void md_write_start(struct mddev *mddev, struct bio *bi)
+{
+ int did_change = 0;
+ if (bio_data_dir(bi) != WRITE)
+ return;
+
+ BUG_ON(mddev->ro == 1);
+ if (mddev->ro == 2) {
+ /* need to switch to read/write */
+ mddev->ro = 0;
+ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+ md_wakeup_thread(mddev->thread);
+ md_wakeup_thread(mddev->sync_thread);
+ did_change = 1;
+ }
+ atomic_inc(&mddev->writes_pending);
+ if (mddev->safemode == 1)
+ mddev->safemode = 0;
+ if (mddev->in_sync) {
+ spin_lock(&mddev->lock);
+ if (mddev->in_sync) {
+ mddev->in_sync = 0;
+ set_bit(MD_CHANGE_CLEAN, &mddev->flags);
+ set_bit(MD_CHANGE_PENDING, &mddev->flags);
+ md_wakeup_thread(mddev->thread);
+ did_change = 1;
+ }
+ spin_unlock(&mddev->lock);
+ }
+ if (did_change)
+ sysfs_notify_dirent_safe(mddev->sysfs_state);
+ wait_event(mddev->sb_wait,
+ !test_bit(MD_CHANGE_PENDING, &mddev->flags));
+}
+EXPORT_SYMBOL(md_write_start);
+
+void md_write_end(struct mddev *mddev)
+{
+ if (atomic_dec_and_test(&mddev->writes_pending)) {
+ if (mddev->safemode == 2)
+ md_wakeup_thread(mddev->thread);
+ else if (mddev->safemode_delay)
+ mod_timer(&mddev->safemode_timer, jiffies + mddev->safemode_delay);
+ }
+}
+EXPORT_SYMBOL(md_write_end);
+
+/* md_allow_write(mddev)
+ * Calling this ensures that the array is marked 'active' so that writes
+ * may proceed without blocking. It is important to call this before
+ * attempting a GFP_KERNEL allocation while holding the mddev lock.
+ * Must be called with mddev_lock held.
+ *
+ * In the ->external case MD_CHANGE_CLEAN can not be cleared until mddev->lock
+ * is dropped, so return -EAGAIN after notifying userspace.
+ */
+int md_allow_write(struct mddev *mddev)
+{
+ if (!mddev->pers)
+ return 0;
+ if (mddev->ro)
+ return 0;
+ if (!mddev->pers->sync_request)
+ return 0;
+
+ spin_lock(&mddev->lock);
+ if (mddev->in_sync) {
+ mddev->in_sync = 0;
+ set_bit(MD_CHANGE_CLEAN, &mddev->flags);
+ set_bit(MD_CHANGE_PENDING, &mddev->flags);
+ if (mddev->safemode_delay &&
+ mddev->safemode == 0)
+ mddev->safemode = 1;
+ spin_unlock(&mddev->lock);
+ if (mddev_is_clustered(mddev))
+ md_cluster_ops->metadata_update_start(mddev);
+ md_update_sb(mddev, 0);
+ if (mddev_is_clustered(mddev))
+ md_cluster_ops->metadata_update_finish(mddev);
+ sysfs_notify_dirent_safe(mddev->sysfs_state);
+ } else
+ spin_unlock(&mddev->lock);
+
+ if (test_bit(MD_CHANGE_PENDING, &mddev->flags))
+ return -EAGAIN;
+ else
+ return 0;
+}
+EXPORT_SYMBOL_GPL(md_allow_write);
+
+#define SYNC_MARKS 10
+#define SYNC_MARK_STEP (3*HZ)
+#define UPDATE_FREQUENCY (5*60*HZ)
+void md_do_sync(struct md_thread *thread)
+{
+ struct mddev *mddev = thread->mddev;
+ struct mddev *mddev2;
+ unsigned int currspeed = 0,
+ window;
+ sector_t max_sectors,j, io_sectors, recovery_done;
+ unsigned long mark[SYNC_MARKS];
+ unsigned long update_time;
+ sector_t mark_cnt[SYNC_MARKS];
+ int last_mark,m;
+ struct list_head *tmp;
+ sector_t last_check;
+ int skipped = 0;
+ struct md_rdev *rdev;
+ char *desc, *action = NULL;
+ struct blk_plug plug;
+
+ /* just incase thread restarts... */
+ if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
+ return;
+ if (mddev->ro) {/* never try to sync a read-only array */
+ set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+ return;
+ }
+
+ if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
+ if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) {
+ desc = "data-check";
+ action = "check";
+ } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
+ desc = "requested-resync";
+ action = "repair";
+ } else
+ desc = "resync";
+ } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
+ desc = "reshape";
+ else
+ desc = "recovery";
+
+ mddev->last_sync_action = action ?: desc;
+
+ /* we overload curr_resync somewhat here.
+ * 0 == not engaged in resync at all
+ * 2 == checking that there is no conflict with another sync
+ * 1 == like 2, but have yielded to allow conflicting resync to
+ * commense
+ * other == active in resync - this many blocks
+ *
+ * Before starting a resync we must have set curr_resync to
+ * 2, and then checked that every "conflicting" array has curr_resync
+ * less than ours. When we find one that is the same or higher
+ * we wait on resync_wait. To avoid deadlock, we reduce curr_resync
+ * to 1 if we choose to yield (based arbitrarily on address of mddev structure).
+ * This will mean we have to start checking from the beginning again.
+ *
+ */
+
+ do {
+ mddev->curr_resync = 2;
+
+ try_again:
+ if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
+ goto skip;
+ for_each_mddev(mddev2, tmp) {
+ if (mddev2 == mddev)
+ continue;
+ if (!mddev->parallel_resync
+ && mddev2->curr_resync
+ && match_mddev_units(mddev, mddev2)) {
+ DEFINE_WAIT(wq);
+ if (mddev < mddev2 && mddev->curr_resync == 2) {
+ /* arbitrarily yield */
+ mddev->curr_resync = 1;
+ wake_up(&resync_wait);
+ }
+ if (mddev > mddev2 && mddev->curr_resync == 1)
+ /* no need to wait here, we can wait the next
+ * time 'round when curr_resync == 2
+ */
+ continue;
+ /* We need to wait 'interruptible' so as not to
+ * contribute to the load average, and not to
+ * be caught by 'softlockup'
+ */
+ prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE);
+ if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
+ mddev2->curr_resync >= mddev->curr_resync) {
+ printk(KERN_INFO "md: delaying %s of %s"
+ " until %s has finished (they"
+ " share one or more physical units)\n",
+ desc, mdname(mddev), mdname(mddev2));
+ mddev_put(mddev2);
+ if (signal_pending(current))
+ flush_signals(current);
+ schedule();
+ finish_wait(&resync_wait, &wq);
+ goto try_again;
+ }
+ finish_wait(&resync_wait, &wq);
+ }
+ }
+ } while (mddev->curr_resync < 2);
+
+ j = 0;
+ if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
+ /* resync follows the size requested by the personality,
+ * which defaults to physical size, but can be virtual size
+ */
+ max_sectors = mddev->resync_max_sectors;
+ atomic64_set(&mddev->resync_mismatches, 0);
+ /* we don't use the checkpoint if there's a bitmap */
+ if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
+ j = mddev->resync_min;
+ else if (!mddev->bitmap)
+ j = mddev->recovery_cp;
+
+ } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
+ max_sectors = mddev->resync_max_sectors;
+ else {
+ /* recovery follows the physical size of devices */
+ max_sectors = mddev->dev_sectors;
+ j = MaxSector;
+ rcu_read_lock();
+ rdev_for_each_rcu(rdev, mddev)
+ if (rdev->raid_disk >= 0 &&
+ !test_bit(Faulty, &rdev->flags) &&
+ !test_bit(In_sync, &rdev->flags) &&
+ rdev->recovery_offset < j)
+ j = rdev->recovery_offset;
+ rcu_read_unlock();
+
+ /* If there is a bitmap, we need to make sure all
+ * writes that started before we added a spare
+ * complete before we start doing a recovery.
+ * Otherwise the write might complete and (via
+ * bitmap_endwrite) set a bit in the bitmap after the
+ * recovery has checked that bit and skipped that
+ * region.
+ */
+ if (mddev->bitmap) {
+ mddev->pers->quiesce(mddev, 1);
+ mddev->pers->quiesce(mddev, 0);
+ }
+ }
+
+ printk(KERN_INFO "md: %s of RAID array %s\n", desc, mdname(mddev));
+ printk(KERN_INFO "md: minimum _guaranteed_ speed:"
+ " %d KB/sec/disk.\n", speed_min(mddev));
+ printk(KERN_INFO "md: using maximum available idle IO bandwidth "
+ "(but not more than %d KB/sec) for %s.\n",
+ speed_max(mddev), desc);
+
+ is_mddev_idle(mddev, 1); /* this initializes IO event counters */
+
+ io_sectors = 0;
+ for (m = 0; m < SYNC_MARKS; m++) {
+ mark[m] = jiffies;
+ mark_cnt[m] = io_sectors;
+ }
+ last_mark = 0;
+ mddev->resync_mark = mark[last_mark];
+ mddev->resync_mark_cnt = mark_cnt[last_mark];
+
+ /*
+ * Tune reconstruction:
+ */
+ window = 32*(PAGE_SIZE/512);
+ printk(KERN_INFO "md: using %dk window, over a total of %lluk.\n",
+ window/2, (unsigned long long)max_sectors/2);
+
+ atomic_set(&mddev->recovery_active, 0);
+ last_check = 0;
+
+ if (j>2) {
+ printk(KERN_INFO
+ "md: resuming %s of %s from checkpoint.\n",
+ desc, mdname(mddev));
+ mddev->curr_resync = j;
+ } else
+ mddev->curr_resync = 3; /* no longer delayed */
+ mddev->curr_resync_completed = j;
+ sysfs_notify(&mddev->kobj, NULL, "sync_completed");
+ md_new_event(mddev);
+ update_time = jiffies;
+
+ if (mddev_is_clustered(mddev))
+ md_cluster_ops->resync_start(mddev, j, max_sectors);
+
+ blk_start_plug(&plug);
+ while (j < max_sectors) {
+ sector_t sectors;
+
+ skipped = 0;
+
+ if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
+ ((mddev->curr_resync > mddev->curr_resync_completed &&
+ (mddev->curr_resync - mddev->curr_resync_completed)
+ > (max_sectors >> 4)) ||
+ time_after_eq(jiffies, update_time + UPDATE_FREQUENCY) ||
+ (j - mddev->curr_resync_completed)*2
+ >= mddev->resync_max - mddev->curr_resync_completed
+ )) {
+ /* time to update curr_resync_completed */
+ wait_event(mddev->recovery_wait,
+ atomic_read(&mddev->recovery_active) == 0);
+ mddev->curr_resync_completed = j;
+ if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
+ j > mddev->recovery_cp)
+ mddev->recovery_cp = j;
+ update_time = jiffies;
+ set_bit(MD_CHANGE_CLEAN, &mddev->flags);
+ sysfs_notify(&mddev->kobj, NULL, "sync_completed");
+ }
+
+ while (j >= mddev->resync_max &&
+ !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
+ /* As this condition is controlled by user-space,
+ * we can block indefinitely, so use '_interruptible'
+ * to avoid triggering warnings.
+ */
+ flush_signals(current); /* just in case */
+ wait_event_interruptible(mddev->recovery_wait,
+ mddev->resync_max > j
+ || test_bit(MD_RECOVERY_INTR,
+ &mddev->recovery));
+ }
+
+ if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
+ break;
+
+ sectors = mddev->pers->sync_request(mddev, j, &skipped);
+ if (sectors == 0) {
+ set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+ break;
+ }
+
+ if (!skipped) { /* actual IO requested */
+ io_sectors += sectors;
+ atomic_add(sectors, &mddev->recovery_active);
+ }
+
+ if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
+ break;
+
+ j += sectors;
+ if (j > 2)
+ mddev->curr_resync = j;
+ if (mddev_is_clustered(mddev))
+ md_cluster_ops->resync_info_update(mddev, j, max_sectors);
+ mddev->curr_mark_cnt = io_sectors;
+ if (last_check == 0)
+ /* this is the earliest that rebuild will be
+ * visible in /proc/mdstat
+ */
+ md_new_event(mddev);
+
+ if (last_check + window > io_sectors || j == max_sectors)
+ continue;
+
+ last_check = io_sectors;
+ repeat:
+ if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) {
+ /* step marks */
+ int next = (last_mark+1) % SYNC_MARKS;
+
+ mddev->resync_mark = mark[next];
+ mddev->resync_mark_cnt = mark_cnt[next];
+ mark[next] = jiffies;
+ mark_cnt[next] = io_sectors - atomic_read(&mddev->recovery_active);
+ last_mark = next;
+ }
+
+ if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
+ break;
+
+ /*
+ * this loop exits only if either when we are slower than
+ * the 'hard' speed limit, or the system was IO-idle for
+ * a jiffy.
+ * the system might be non-idle CPU-wise, but we only care
+ * about not overloading the IO subsystem. (things like an
+ * e2fsck being done on the RAID array should execute fast)
+ */
+ cond_resched();
+
+ recovery_done = io_sectors - atomic_read(&mddev->recovery_active);
+ currspeed = ((unsigned long)(recovery_done - mddev->resync_mark_cnt))/2
+ /((jiffies-mddev->resync_mark)/HZ +1) +1;
+
+ if (currspeed > speed_min(mddev)) {
+ if (currspeed > speed_max(mddev)) {
+ msleep(500);
+ goto repeat;
+ }
+ if (!is_mddev_idle(mddev, 0)) {
+ /*
+ * Give other IO more of a chance.
+ * The faster the devices, the less we wait.
+ */
+ wait_event(mddev->recovery_wait,
+ !atomic_read(&mddev->recovery_active));
+ }
+ }
+ }
+ printk(KERN_INFO "md: %s: %s %s.\n",mdname(mddev), desc,
+ test_bit(MD_RECOVERY_INTR, &mddev->recovery)
+ ? "interrupted" : "done");
+ /*
+ * this also signals 'finished resyncing' to md_stop
+ */
+ blk_finish_plug(&plug);
+ wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));
+
+ /* tell personality that we are finished */
+ mddev->pers->sync_request(mddev, max_sectors, &skipped);
+
+ if (mddev_is_clustered(mddev))
+ md_cluster_ops->resync_finish(mddev);
+
+ if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
+ mddev->curr_resync > 2) {
+ if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
+ if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
+ if (mddev->curr_resync >= mddev->recovery_cp) {
+ printk(KERN_INFO
+ "md: checkpointing %s of %s.\n",
+ desc, mdname(mddev));
+ if (test_bit(MD_RECOVERY_ERROR,
+ &mddev->recovery))
+ mddev->recovery_cp =
+ mddev->curr_resync_completed;
+ else
+ mddev->recovery_cp =
+ mddev->curr_resync;
+ }
+ } else
+ mddev->recovery_cp = MaxSector;
+ } else {
+ if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
+ mddev->curr_resync = MaxSector;
+ rcu_read_lock();
+ rdev_for_each_rcu(rdev, mddev)
+ if (rdev->raid_disk >= 0 &&
+ mddev->delta_disks >= 0 &&
+ !test_bit(Faulty, &rdev->flags) &&
+ !test_bit(In_sync, &rdev->flags) &&
+ rdev->recovery_offset < mddev->curr_resync)
+ rdev->recovery_offset = mddev->curr_resync;
+ rcu_read_unlock();
+ }
+ }
+ skip:
+ set_bit(MD_CHANGE_DEVS, &mddev->flags);
+
+ spin_lock(&mddev->lock);
+ if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
+ /* We completed so min/max setting can be forgotten if used. */
+ if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
+ mddev->resync_min = 0;
+ mddev->resync_max = MaxSector;
+ } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
+ mddev->resync_min = mddev->curr_resync_completed;
+ mddev->curr_resync = 0;
+ spin_unlock(&mddev->lock);
+
+ wake_up(&resync_wait);
+ set_bit(MD_RECOVERY_DONE, &mddev->recovery);
+ md_wakeup_thread(mddev->thread);
+ return;
+}
+EXPORT_SYMBOL_GPL(md_do_sync);
+
+static int remove_and_add_spares(struct mddev *mddev,
+ struct md_rdev *this)
+{
+ struct md_rdev *rdev;
+ int spares = 0;
+ int removed = 0;
+
+ rdev_for_each(rdev, mddev)
+ if ((this == NULL || rdev == this) &&
+ rdev->raid_disk >= 0 &&
+ !test_bit(Blocked, &rdev->flags) &&
+ (test_bit(Faulty, &rdev->flags) ||
+ ! test_bit(In_sync, &rdev->flags)) &&
+ atomic_read(&rdev->nr_pending)==0) {
+ if (mddev->pers->hot_remove_disk(
+ mddev, rdev) == 0) {
+ sysfs_unlink_rdev(mddev, rdev);
+ rdev->raid_disk = -1;
+ removed++;
+ }
+ }
+ if (removed && mddev->kobj.sd)
+ sysfs_notify(&mddev->kobj, NULL, "degraded");
+
+ if (this)
+ goto no_add;
+
+ rdev_for_each(rdev, mddev) {
+ if (rdev->raid_disk >= 0 &&
+ !test_bit(In_sync, &rdev->flags) &&
+ !test_bit(Faulty, &rdev->flags))
+ spares++;
+ if (rdev->raid_disk >= 0)
+ continue;
+ if (test_bit(Faulty, &rdev->flags))
+ continue;
+ if (mddev->ro &&
+ ! (rdev->saved_raid_disk >= 0 &&
+ !test_bit(Bitmap_sync, &rdev->flags)))
+ continue;
+
+ if (rdev->saved_raid_disk < 0)
+ rdev->recovery_offset = 0;
+ if (mddev->pers->
+ hot_add_disk(mddev, rdev) == 0) {
+ if (sysfs_link_rdev(mddev, rdev))
+ /* failure here is OK */;
+ spares++;
+ md_new_event(mddev);
+ set_bit(MD_CHANGE_DEVS, &mddev->flags);
+ }
+ }
+no_add:
+ if (removed)
+ set_bit(MD_CHANGE_DEVS, &mddev->flags);
+ return spares;
+}
+
+static void md_start_sync(struct work_struct *ws)
+{
+ struct mddev *mddev = container_of(ws, struct mddev, del_work);
+
+ mddev->sync_thread = md_register_thread(md_do_sync,
+ mddev,
+ "resync");
+ if (!mddev->sync_thread) {
+ printk(KERN_ERR "%s: could not start resync"
+ " thread...\n",
+ mdname(mddev));
+ /* leave the spares where they are, it shouldn't hurt */
+ clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
+ clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
+ clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
+ clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
+ clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
+ wake_up(&resync_wait);
+ if (test_and_clear_bit(MD_RECOVERY_RECOVER,
+ &mddev->recovery))
+ if (mddev->sysfs_action)
+ sysfs_notify_dirent_safe(mddev->sysfs_action);
+ } else
+ md_wakeup_thread(mddev->sync_thread);
+ sysfs_notify_dirent_safe(mddev->sysfs_action);
+ md_new_event(mddev);
+}
+
+/*
+ * This routine is regularly called by all per-raid-array threads to
+ * deal with generic issues like resync and super-block update.
+ * Raid personalities that don't have a thread (linear/raid0) do not
+ * need this as they never do any recovery or update the superblock.
+ *
+ * It does not do any resync itself, but rather "forks" off other threads
+ * to do that as needed.
+ * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in
+ * "->recovery" and create a thread at ->sync_thread.
+ * When the thread finishes it sets MD_RECOVERY_DONE
+ * and wakeups up this thread which will reap the thread and finish up.
+ * This thread also removes any faulty devices (with nr_pending == 0).
+ *
+ * The overall approach is:
+ * 1/ if the superblock needs updating, update it.
+ * 2/ If a recovery thread is running, don't do anything else.
+ * 3/ If recovery has finished, clean up, possibly marking spares active.
+ * 4/ If there are any faulty devices, remove them.
+ * 5/ If array is degraded, try to add spares devices
+ * 6/ If array has spares or is not in-sync, start a resync thread.
+ */
+void md_check_recovery(struct mddev *mddev)
+{
+ if (mddev->suspended)
+ return;
+
+ if (mddev->bitmap)
+ bitmap_daemon_work(mddev);
+
+ if (signal_pending(current)) {
+ if (mddev->pers->sync_request && !mddev->external) {
+ printk(KERN_INFO "md: %s in immediate safe mode\n",
+ mdname(mddev));
+ mddev->safemode = 2;
+ }
+ flush_signals(current);
+ }
+
+ if (mddev->ro && !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
+ return;
+ if ( ! (
+ (mddev->flags & MD_UPDATE_SB_FLAGS & ~ (1<<MD_CHANGE_PENDING)) ||
+ test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
+ test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
+ (mddev->external == 0 && mddev->safemode == 1) ||
+ (mddev->safemode == 2 && ! atomic_read(&mddev->writes_pending)
+ && !mddev->in_sync && mddev->recovery_cp == MaxSector)
+ ))
+ return;
+
+ if (mddev_trylock(mddev)) {
+ int spares = 0;
+
+ if (mddev->ro) {
+ /* On a read-only array we can:
+ * - remove failed devices
+ * - add already-in_sync devices if the array itself
+ * is in-sync.
+ * As we only add devices that are already in-sync,
+ * we can activate the spares immediately.
+ */
+ remove_and_add_spares(mddev, NULL);
+ /* There is no thread, but we need to call
+ * ->spare_active and clear saved_raid_disk
+ */
+ set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+ md_reap_sync_thread(mddev);
+ clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+ goto unlock;
+ }
+
+ if (!mddev->external) {
+ int did_change = 0;
+ spin_lock(&mddev->lock);
+ if (mddev->safemode &&
+ !atomic_read(&mddev->writes_pending) &&
+ !mddev->in_sync &&
+ mddev->recovery_cp == MaxSector) {
+ mddev->in_sync = 1;
+ did_change = 1;
+ set_bit(MD_CHANGE_CLEAN, &mddev->flags);
+ }
+ if (mddev->safemode == 1)
+ mddev->safemode = 0;
+ spin_unlock(&mddev->lock);
+ if (did_change)
+ sysfs_notify_dirent_safe(mddev->sysfs_state);
+ }
+
+ if (mddev->flags & MD_UPDATE_SB_FLAGS) {
+ if (mddev_is_clustered(mddev))
+ md_cluster_ops->metadata_update_start(mddev);
+ md_update_sb(mddev, 0);
+ if (mddev_is_clustered(mddev))
+ md_cluster_ops->metadata_update_finish(mddev);
+ }
+
+ if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
+ !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
+ /* resync/recovery still happening */
+ clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+ goto unlock;
+ }
+ if (mddev->sync_thread) {
+ md_reap_sync_thread(mddev);
+ goto unlock;
+ }
+ /* Set RUNNING before clearing NEEDED to avoid
+ * any transients in the value of "sync_action".
+ */
+ mddev->curr_resync_completed = 0;
+ spin_lock(&mddev->lock);
+ set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
+ spin_unlock(&mddev->lock);
+ /* Clear some bits that don't mean anything, but
+ * might be left set
+ */
+ clear_bit(MD_RECOVERY_INTR, &mddev->recovery);
+ clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
+
+ if (!test_and_clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
+ test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
+ goto not_running;
+ /* no recovery is running.
+ * remove any failed drives, then
+ * add spares if possible.
+ * Spares are also removed and re-added, to allow
+ * the personality to fail the re-add.
+ */
+
+ if (mddev->reshape_position != MaxSector) {
+ if (mddev->pers->check_reshape == NULL ||
+ mddev->pers->check_reshape(mddev) != 0)
+ /* Cannot proceed */
+ goto not_running;
+ set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
+ clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
+ } else if ((spares = remove_and_add_spares(mddev, NULL))) {
+ clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
+ clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
+ clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
+ set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
+ } else if (mddev->recovery_cp < MaxSector) {
+ set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
+ clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
+ } else if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
+ /* nothing to be done ... */
+ goto not_running;
+
+ if (mddev->pers->sync_request) {
+ if (spares) {
+ /* We are adding a device or devices to an array
+ * which has the bitmap stored on all devices.
+ * So make sure all bitmap pages get written
+ */
+ bitmap_write_all(mddev->bitmap);
+ }
+ INIT_WORK(&mddev->del_work, md_start_sync);
+ queue_work(md_misc_wq, &mddev->del_work);
+ goto unlock;
+ }
+ not_running:
+ if (!mddev->sync_thread) {
+ clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
+ wake_up(&resync_wait);
+ if (test_and_clear_bit(MD_RECOVERY_RECOVER,
+ &mddev->recovery))
+ if (mddev->sysfs_action)
+ sysfs_notify_dirent_safe(mddev->sysfs_action);
+ }
+ unlock:
+ wake_up(&mddev->sb_wait);
+ mddev_unlock(mddev);
+ }
+}
+EXPORT_SYMBOL(md_check_recovery);
+
+void md_reap_sync_thread(struct mddev *mddev)
+{
+ struct md_rdev *rdev;
+
+ /* resync has finished, collect result */
+ md_unregister_thread(&mddev->sync_thread);
+ if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
+ !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
+ /* success...*/
+ /* activate any spares */
+ if (mddev->pers->spare_active(mddev)) {
+ sysfs_notify(&mddev->kobj, NULL,
+ "degraded");
+ set_bit(MD_CHANGE_DEVS, &mddev->flags);
+ }
+ }
+ if (mddev_is_clustered(mddev))
+ md_cluster_ops->metadata_update_start(mddev);
+ if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
+ mddev->pers->finish_reshape)
+ mddev->pers->finish_reshape(mddev);
+
+ /* If array is no-longer degraded, then any saved_raid_disk
+ * information must be scrapped.
+ */
+ if (!mddev->degraded)
+ rdev_for_each(rdev, mddev)
+ rdev->saved_raid_disk = -1;
+
+ md_update_sb(mddev, 1);
+ if (mddev_is_clustered(mddev))
+ md_cluster_ops->metadata_update_finish(mddev);
+ clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
+ clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
+ clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
+ clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
+ clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
+ clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
+ wake_up(&resync_wait);
+ /* flag recovery needed just to double check */
+ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+ sysfs_notify_dirent_safe(mddev->sysfs_action);
+ md_new_event(mddev);
+ if (mddev->event_work.func)
+ queue_work(md_misc_wq, &mddev->event_work);
+}
+EXPORT_SYMBOL(md_reap_sync_thread);
+
+void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev)
+{
+ sysfs_notify_dirent_safe(rdev->sysfs_state);
+ wait_event_timeout(rdev->blocked_wait,
+ !test_bit(Blocked, &rdev->flags) &&
+ !test_bit(BlockedBadBlocks, &rdev->flags),
+ msecs_to_jiffies(5000));
+ rdev_dec_pending(rdev, mddev);
+}
+EXPORT_SYMBOL(md_wait_for_blocked_rdev);
+
+void md_finish_reshape(struct mddev *mddev)
+{
+ /* called be personality module when reshape completes. */
+ struct md_rdev *rdev;
+
+ rdev_for_each(rdev, mddev) {
+ if (rdev->data_offset > rdev->new_data_offset)
+ rdev->sectors += rdev->data_offset - rdev->new_data_offset;
+ else
+ rdev->sectors -= rdev->new_data_offset - rdev->data_offset;
+ rdev->data_offset = rdev->new_data_offset;
+ }
+}
+EXPORT_SYMBOL(md_finish_reshape);
+
+/* Bad block management.
+ * We can record which blocks on each device are 'bad' and so just
+ * fail those blocks, or that stripe, rather than the whole device.
+ * Entries in the bad-block table are 64bits wide. This comprises:
+ * Length of bad-range, in sectors: 0-511 for lengths 1-512
+ * Start of bad-range, sector offset, 54 bits (allows 8 exbibytes)
+ * A 'shift' can be set so that larger blocks are tracked and
+ * consequently larger devices can be covered.
+ * 'Acknowledged' flag - 1 bit. - the most significant bit.
+ *
+ * Locking of the bad-block table uses a seqlock so md_is_badblock
+ * might need to retry if it is very unlucky.
+ * We will sometimes want to check for bad blocks in a bi_end_io function,
+ * so we use the write_seqlock_irq variant.
+ *
+ * When looking for a bad block we specify a range and want to
+ * know if any block in the range is bad. So we binary-search
+ * to the last range that starts at-or-before the given endpoint,
+ * (or "before the sector after the target range")
+ * then see if it ends after the given start.
+ * We return
+ * 0 if there are no known bad blocks in the range
+ * 1 if there are known bad block which are all acknowledged
+ * -1 if there are bad blocks which have not yet been acknowledged in metadata.
+ * plus the start/length of the first bad section we overlap.
+ */
+int md_is_badblock(struct badblocks *bb, sector_t s, int sectors,
+ sector_t *first_bad, int *bad_sectors)
+{
+ int hi;
+ int lo;
+ u64 *p = bb->page;
+ int rv;
+ sector_t target = s + sectors;
+ unsigned seq;
+
+ if (bb->shift > 0) {
+ /* round the start down, and the end up */
+ s >>= bb->shift;
+ target += (1<<bb->shift) - 1;
+ target >>= bb->shift;
+ sectors = target - s;
+ }
+ /* 'target' is now the first block after the bad range */
+
+retry:
+ seq = read_seqbegin(&bb->lock);
+ lo = 0;
+ rv = 0;
+ hi = bb->count;
+
+ /* Binary search between lo and hi for 'target'
+ * i.e. for the last range that starts before 'target'
+ */
+ /* INVARIANT: ranges before 'lo' and at-or-after 'hi'
+ * are known not to be the last range before target.
+ * VARIANT: hi-lo is the number of possible
+ * ranges, and decreases until it reaches 1
+ */
+ while (hi - lo > 1) {
+ int mid = (lo + hi) / 2;
+ sector_t a = BB_OFFSET(p[mid]);
+ if (a < target)
+ /* This could still be the one, earlier ranges
+ * could not. */
+ lo = mid;
+ else
+ /* This and later ranges are definitely out. */
+ hi = mid;
+ }
+ /* 'lo' might be the last that started before target, but 'hi' isn't */
+ if (hi > lo) {
+ /* need to check all range that end after 's' to see if
+ * any are unacknowledged.
+ */
+ while (lo >= 0 &&
+ BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) {
+ if (BB_OFFSET(p[lo]) < target) {
+ /* starts before the end, and finishes after
+ * the start, so they must overlap
+ */
+ if (rv != -1 && BB_ACK(p[lo]))
+ rv = 1;
+ else
+ rv = -1;
+ *first_bad = BB_OFFSET(p[lo]);
+ *bad_sectors = BB_LEN(p[lo]);
+ }
+ lo--;
+ }
+ }
+
+ if (read_seqretry(&bb->lock, seq))
+ goto retry;
+
+ return rv;
+}
+EXPORT_SYMBOL_GPL(md_is_badblock);
+
+/*
+ * Add a range of bad blocks to the table.
+ * This might extend the table, or might contract it
+ * if two adjacent ranges can be merged.
+ * We binary-search to find the 'insertion' point, then
+ * decide how best to handle it.
+ */
+static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors,
+ int acknowledged)
+{
+ u64 *p;
+ int lo, hi;
+ int rv = 1;
+ unsigned long flags;
+
+ if (bb->shift < 0)
+ /* badblocks are disabled */
+ return 0;
+
+ if (bb->shift) {
+ /* round the start down, and the end up */
+ sector_t next = s + sectors;
+ s >>= bb->shift;
+ next += (1<<bb->shift) - 1;
+ next >>= bb->shift;
+ sectors = next - s;
+ }
+
+ write_seqlock_irqsave(&bb->lock, flags);
+
+ p = bb->page;
+ lo = 0;
+ hi = bb->count;
+ /* Find the last range that starts at-or-before 's' */
+ while (hi - lo > 1) {
+ int mid = (lo + hi) / 2;
+ sector_t a = BB_OFFSET(p[mid]);
+ if (a <= s)
+ lo = mid;
+ else
+ hi = mid;
+ }
+ if (hi > lo && BB_OFFSET(p[lo]) > s)
+ hi = lo;
+
+ if (hi > lo) {
+ /* we found a range that might merge with the start
+ * of our new range
+ */
+ sector_t a = BB_OFFSET(p[lo]);
+ sector_t e = a + BB_LEN(p[lo]);
+ int ack = BB_ACK(p[lo]);
+ if (e >= s) {
+ /* Yes, we can merge with a previous range */
+ if (s == a && s + sectors >= e)
+ /* new range covers old */
+ ack = acknowledged;
+ else
+ ack = ack && acknowledged;
+
+ if (e < s + sectors)
+ e = s + sectors;
+ if (e - a <= BB_MAX_LEN) {
+ p[lo] = BB_MAKE(a, e-a, ack);
+ s = e;
+ } else {
+ /* does not all fit in one range,
+ * make p[lo] maximal
+ */
+ if (BB_LEN(p[lo]) != BB_MAX_LEN)
+ p[lo] = BB_MAKE(a, BB_MAX_LEN, ack);
+ s = a + BB_MAX_LEN;
+ }
+ sectors = e - s;
+ }
+ }
+ if (sectors && hi < bb->count) {
+ /* 'hi' points to the first range that starts after 's'.
+ * Maybe we can merge with the start of that range */
+ sector_t a = BB_OFFSET(p[hi]);
+ sector_t e = a + BB_LEN(p[hi]);
+ int ack = BB_ACK(p[hi]);
+ if (a <= s + sectors) {
+ /* merging is possible */
+ if (e <= s + sectors) {
+ /* full overlap */
+ e = s + sectors;
+ ack = acknowledged;
+ } else
+ ack = ack && acknowledged;
+
+ a = s;
+ if (e - a <= BB_MAX_LEN) {
+ p[hi] = BB_MAKE(a, e-a, ack);
+ s = e;
+ } else {
+ p[hi] = BB_MAKE(a, BB_MAX_LEN, ack);
+ s = a + BB_MAX_LEN;
+ }
+ sectors = e - s;
+ lo = hi;
+ hi++;
+ }
+ }
+ if (sectors == 0 && hi < bb->count) {
+ /* we might be able to combine lo and hi */
+ /* Note: 's' is at the end of 'lo' */
+ sector_t a = BB_OFFSET(p[hi]);
+ int lolen = BB_LEN(p[lo]);
+ int hilen = BB_LEN(p[hi]);
+ int newlen = lolen + hilen - (s - a);
+ if (s >= a && newlen < BB_MAX_LEN) {
+ /* yes, we can combine them */
+ int ack = BB_ACK(p[lo]) && BB_ACK(p[hi]);
+ p[lo] = BB_MAKE(BB_OFFSET(p[lo]), newlen, ack);
+ memmove(p + hi, p + hi + 1,
+ (bb->count - hi - 1) * 8);
+ bb->count--;
+ }
+ }
+ while (sectors) {
+ /* didn't merge (it all).
+ * Need to add a range just before 'hi' */
+ if (bb->count >= MD_MAX_BADBLOCKS) {
+ /* No room for more */
+ rv = 0;
+ break;
+ } else {
+ int this_sectors = sectors;
+ memmove(p + hi + 1, p + hi,
+ (bb->count - hi) * 8);
+ bb->count++;
+
+ if (this_sectors > BB_MAX_LEN)
+ this_sectors = BB_MAX_LEN;
+ p[hi] = BB_MAKE(s, this_sectors, acknowledged);
+ sectors -= this_sectors;
+ s += this_sectors;
+ }
+ }
+
+ bb->changed = 1;
+ if (!acknowledged)
+ bb->unacked_exist = 1;
+ write_sequnlock_irqrestore(&bb->lock, flags);
+
+ return rv;
+}
+
+int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
+ int is_new)
+{
+ int rv;
+ if (is_new)
+ s += rdev->new_data_offset;
+ else
+ s += rdev->data_offset;
+ rv = md_set_badblocks(&rdev->badblocks,
+ s, sectors, 0);
+ if (rv) {
+ /* Make sure they get written out promptly */
+ sysfs_notify_dirent_safe(rdev->sysfs_state);
+ set_bit(MD_CHANGE_CLEAN, &rdev->mddev->flags);
+ md_wakeup_thread(rdev->mddev->thread);
+ }
+ return rv;
+}
+EXPORT_SYMBOL_GPL(rdev_set_badblocks);
+
+/*
+ * Remove a range of bad blocks from the table.
+ * This may involve extending the table if we spilt a region,
+ * but it must not fail. So if the table becomes full, we just
+ * drop the remove request.
+ */
+static int md_clear_badblocks(struct badblocks *bb, sector_t s, int sectors)
+{
+ u64 *p;
+ int lo, hi;
+ sector_t target = s + sectors;
+ int rv = 0;
+
+ if (bb->shift > 0) {
+ /* When clearing we round the start up and the end down.
+ * This should not matter as the shift should align with
+ * the block size and no rounding should ever be needed.
+ * However it is better the think a block is bad when it
+ * isn't than to think a block is not bad when it is.
+ */
+ s += (1<<bb->shift) - 1;
+ s >>= bb->shift;
+ target >>= bb->shift;
+ sectors = target - s;
+ }
+
+ write_seqlock_irq(&bb->lock);
+
+ p = bb->page;
+ lo = 0;
+ hi = bb->count;
+ /* Find the last range that starts before 'target' */
+ while (hi - lo > 1) {
+ int mid = (lo + hi) / 2;
+ sector_t a = BB_OFFSET(p[mid]);
+ if (a < target)
+ lo = mid;
+ else
+ hi = mid;
+ }
+ if (hi > lo) {
+ /* p[lo] is the last range that could overlap the
+ * current range. Earlier ranges could also overlap,
+ * but only this one can overlap the end of the range.
+ */
+ if (BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > target) {
+ /* Partial overlap, leave the tail of this range */
+ int ack = BB_ACK(p[lo]);
+ sector_t a = BB_OFFSET(p[lo]);
+ sector_t end = a + BB_LEN(p[lo]);
+
+ if (a < s) {
+ /* we need to split this range */
+ if (bb->count >= MD_MAX_BADBLOCKS) {
+ rv = -ENOSPC;
+ goto out;
+ }
+ memmove(p+lo+1, p+lo, (bb->count - lo) * 8);
+ bb->count++;
+ p[lo] = BB_MAKE(a, s-a, ack);
+ lo++;
+ }
+ p[lo] = BB_MAKE(target, end - target, ack);
+ /* there is no longer an overlap */
+ hi = lo;
+ lo--;
+ }
+ while (lo >= 0 &&
+ BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) {
+ /* This range does overlap */
+ if (BB_OFFSET(p[lo]) < s) {
+ /* Keep the early parts of this range. */
+ int ack = BB_ACK(p[lo]);
+ sector_t start = BB_OFFSET(p[lo]);
+ p[lo] = BB_MAKE(start, s - start, ack);
+ /* now low doesn't overlap, so.. */
+ break;
+ }
+ lo--;
+ }
+ /* 'lo' is strictly before, 'hi' is strictly after,
+ * anything between needs to be discarded
+ */
+ if (hi - lo > 1) {
+ memmove(p+lo+1, p+hi, (bb->count - hi) * 8);
+ bb->count -= (hi - lo - 1);
+ }
+ }
+
+ bb->changed = 1;
+out:
+ write_sequnlock_irq(&bb->lock);
+ return rv;
+}
+
+int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
+ int is_new)
+{
+ if (is_new)
+ s += rdev->new_data_offset;
+ else
+ s += rdev->data_offset;
+ return md_clear_badblocks(&rdev->badblocks,
+ s, sectors);
+}
+EXPORT_SYMBOL_GPL(rdev_clear_badblocks);
+
+/*
+ * Acknowledge all bad blocks in a list.
+ * This only succeeds if ->changed is clear. It is used by
+ * in-kernel metadata updates
+ */
+void md_ack_all_badblocks(struct badblocks *bb)
+{
+ if (bb->page == NULL || bb->changed)
+ /* no point even trying */
+ return;
+ write_seqlock_irq(&bb->lock);
+
+ if (bb->changed == 0 && bb->unacked_exist) {
+ u64 *p = bb->page;
+ int i;
+ for (i = 0; i < bb->count ; i++) {
+ if (!BB_ACK(p[i])) {
+ sector_t start = BB_OFFSET(p[i]);
+ int len = BB_LEN(p[i]);
+ p[i] = BB_MAKE(start, len, 1);
+ }
+ }
+ bb->unacked_exist = 0;
+ }
+ write_sequnlock_irq(&bb->lock);
+}
+EXPORT_SYMBOL_GPL(md_ack_all_badblocks);
+
+/* sysfs access to bad-blocks list.
+ * We present two files.
+ * 'bad-blocks' lists sector numbers and lengths of ranges that
+ * are recorded as bad. The list is truncated to fit within
+ * the one-page limit of sysfs.
+ * Writing "sector length" to this file adds an acknowledged
+ * bad block list.
+ * 'unacknowledged-bad-blocks' lists bad blocks that have not yet
+ * been acknowledged. Writing to this file adds bad blocks
+ * without acknowledging them. This is largely for testing.
+ */
+
+static ssize_t
+badblocks_show(struct badblocks *bb, char *page, int unack)
+{
+ size_t len;
+ int i;
+ u64 *p = bb->page;
+ unsigned seq;
+
+ if (bb->shift < 0)
+ return 0;
+
+retry:
+ seq = read_seqbegin(&bb->lock);
+
+ len = 0;
+ i = 0;
+
+ while (len < PAGE_SIZE && i < bb->count) {
+ sector_t s = BB_OFFSET(p[i]);
+ unsigned int length = BB_LEN(p[i]);
+ int ack = BB_ACK(p[i]);
+ i++;
+
+ if (unack && ack)
+ continue;
+
+ len += snprintf(page+len, PAGE_SIZE-len, "%llu %u\n",
+ (unsigned long long)s << bb->shift,
+ length << bb->shift);
+ }
+ if (unack && len == 0)
+ bb->unacked_exist = 0;
+
+ if (read_seqretry(&bb->lock, seq))
+ goto retry;
+
+ return len;
+}
+
+#define DO_DEBUG 1
+
+static ssize_t
+badblocks_store(struct badblocks *bb, const char *page, size_t len, int unack)
+{
+ unsigned long long sector;
+ int length;
+ char newline;
+#ifdef DO_DEBUG
+ /* Allow clearing via sysfs *only* for testing/debugging.
+ * Normally only a successful write may clear a badblock
+ */
+ int clear = 0;
+ if (page[0] == '-') {
+ clear = 1;
+ page++;
+ }
+#endif /* DO_DEBUG */
+
+ switch (sscanf(page, "%llu %d%c", &sector, &length, &newline)) {
+ case 3:
+ if (newline != '\n')
+ return -EINVAL;
+ case 2:
+ if (length <= 0)
+ return -EINVAL;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+#ifdef DO_DEBUG
+ if (clear) {
+ md_clear_badblocks(bb, sector, length);
+ return len;
+ }
+#endif /* DO_DEBUG */
+ if (md_set_badblocks(bb, sector, length, !unack))
+ return len;
+ else
+ return -ENOSPC;
+}
+
+static int md_notify_reboot(struct notifier_block *this,
+ unsigned long code, void *x)
+{
+ struct list_head *tmp;
+ struct mddev *mddev;
+ int need_delay = 0;
+
+ for_each_mddev(mddev, tmp) {
+ if (mddev_trylock(mddev)) {
+ if (mddev->pers)
+ __md_stop_writes(mddev);
+ if (mddev->persistent)
+ mddev->safemode = 2;
+ mddev_unlock(mddev);
+ }
+ need_delay = 1;
+ }
+ /*
+ * certain more exotic SCSI devices are known to be
+ * volatile wrt too early system reboots. While the
+ * right place to handle this issue is the given
+ * driver, we do want to have a safe RAID driver ...
+ */
+ if (need_delay)
+ mdelay(1000*1);
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block md_notifier = {
+ .notifier_call = md_notify_reboot,
+ .next = NULL,
+ .priority = INT_MAX, /* before any real devices */
+};
+
+static void md_geninit(void)
+{
+ pr_debug("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t));
+
+ proc_create("mdstat", S_IRUGO, NULL, &md_seq_fops);
+}
+
+static int __init md_init(void)
+{
+ int ret = -ENOMEM;
+
+ md_wq = alloc_workqueue("md", WQ_MEM_RECLAIM, 0);
+ if (!md_wq)
+ goto err_wq;
+
+ md_misc_wq = alloc_workqueue("md_misc", 0, 0);
+ if (!md_misc_wq)
+ goto err_misc_wq;
+
+ if ((ret = register_blkdev(MD_MAJOR, "md")) < 0)
+ goto err_md;
+
+ if ((ret = register_blkdev(0, "mdp")) < 0)
+ goto err_mdp;
+ mdp_major = ret;
+
+ blk_register_region(MKDEV(MD_MAJOR, 0), 512, THIS_MODULE,
+ md_probe, NULL, NULL);
+ blk_register_region(MKDEV(mdp_major, 0), 1UL<<MINORBITS, THIS_MODULE,
+ md_probe, NULL, NULL);
+
+ register_reboot_notifier(&md_notifier);
+ raid_table_header = register_sysctl_table(raid_root_table);
+
+ md_geninit();
+ return 0;
+
+err_mdp:
+ unregister_blkdev(MD_MAJOR, "md");
+err_md:
+ destroy_workqueue(md_misc_wq);
+err_misc_wq:
+ destroy_workqueue(md_wq);
+err_wq:
+ return ret;
+}
+
+void md_reload_sb(struct mddev *mddev)
+{
+ struct md_rdev *rdev, *tmp;
+
+ rdev_for_each_safe(rdev, tmp, mddev) {
+ rdev->sb_loaded = 0;
+ ClearPageUptodate(rdev->sb_page);
+ }
+ mddev->raid_disks = 0;
+ analyze_sbs(mddev);
+ rdev_for_each_safe(rdev, tmp, mddev) {
+ struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
+ /* since we don't write to faulty devices, we figure out if the
+ * disk is faulty by comparing events
+ */
+ if (mddev->events > sb->events)
+ set_bit(Faulty, &rdev->flags);
+ }
+
+}
+EXPORT_SYMBOL(md_reload_sb);
+
+#ifndef MODULE
+
+/*
+ * Searches all registered partitions for autorun RAID arrays
+ * at boot time.
+ */
+
+static LIST_HEAD(all_detected_devices);
+struct detected_devices_node {
+ struct list_head list;
+ dev_t dev;
+};
+
+void md_autodetect_dev(dev_t dev)
+{
+ struct detected_devices_node *node_detected_dev;
+
+ node_detected_dev = kzalloc(sizeof(*node_detected_dev), GFP_KERNEL);
+ if (node_detected_dev) {
+ node_detected_dev->dev = dev;
+ list_add_tail(&node_detected_dev->list, &all_detected_devices);
+ } else {
+ printk(KERN_CRIT "md: md_autodetect_dev: kzalloc failed"
+ ", skipping dev(%d,%d)\n", MAJOR(dev), MINOR(dev));
+ }
+}
+
+static void autostart_arrays(int part)
+{
+ struct md_rdev *rdev;
+ struct detected_devices_node *node_detected_dev;
+ dev_t dev;
+ int i_scanned, i_passed;
+
+ i_scanned = 0;
+ i_passed = 0;
+
+ printk(KERN_INFO "md: Autodetecting RAID arrays.\n");
+
+ while (!list_empty(&all_detected_devices) && i_scanned < INT_MAX) {
+ i_scanned++;
+ node_detected_dev = list_entry(all_detected_devices.next,
+ struct detected_devices_node, list);
+ list_del(&node_detected_dev->list);
+ dev = node_detected_dev->dev;
+ kfree(node_detected_dev);
+ rdev = md_import_device(dev,0, 90);
+ if (IS_ERR(rdev))
+ continue;
+
+ if (test_bit(Faulty, &rdev->flags))
+ continue;
+
+ set_bit(AutoDetected, &rdev->flags);
+ list_add(&rdev->same_set, &pending_raid_disks);
+ i_passed++;
+ }
+
+ printk(KERN_INFO "md: Scanned %d and added %d devices.\n",
+ i_scanned, i_passed);
+
+ autorun_devices(part);
+}
+
+#endif /* !MODULE */
+
+static __exit void md_exit(void)
+{
+ struct mddev *mddev;
+ struct list_head *tmp;
+ int delay = 1;
+
+ blk_unregister_region(MKDEV(MD_MAJOR,0), 512);
+ blk_unregister_region(MKDEV(mdp_major,0), 1U << MINORBITS);
+
+ unregister_blkdev(MD_MAJOR,"md");
+ unregister_blkdev(mdp_major, "mdp");
+ unregister_reboot_notifier(&md_notifier);
+ unregister_sysctl_table(raid_table_header);
+
+ /* We cannot unload the modules while some process is
+ * waiting for us in select() or poll() - wake them up
+ */
+ md_unloading = 1;
+ while (waitqueue_active(&md_event_waiters)) {
+ /* not safe to leave yet */
+ wake_up(&md_event_waiters);
+ msleep(delay);
+ delay += delay;
+ }
+ remove_proc_entry("mdstat", NULL);
+
+ for_each_mddev(mddev, tmp) {
+ export_array(mddev);
+ mddev->hold_active = 0;
+ }
+ destroy_workqueue(md_misc_wq);
+ destroy_workqueue(md_wq);
+}
+
+subsys_initcall(md_init);
+module_exit(md_exit)
+
+static int get_ro(char *buffer, struct kernel_param *kp)
+{
+ return sprintf(buffer, "%d", start_readonly);
+}
+static int set_ro(const char *val, struct kernel_param *kp)
+{
+ char *e;
+ int num = simple_strtoul(val, &e, 10);
+ if (*val && (*e == '\0' || *e == '\n')) {
+ start_readonly = num;
+ return 0;
+ }
+ return -EINVAL;
+}
+
+module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR);
+module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR);
+module_param_call(new_array, add_named_array, NULL, NULL, S_IWUSR);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("MD RAID framework");
+MODULE_ALIAS("md");
+MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR);
diff --git a/drivers/md/md.h b/drivers/md/md.h
new file mode 100644
index 000000000..4046a6c6f
--- /dev/null
+++ b/drivers/md/md.h
@@ -0,0 +1,696 @@
+/*
+ md.h : kernel internal structure of the Linux MD driver
+ Copyright (C) 1996-98 Ingo Molnar, Gadi Oxman
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2, or (at your option)
+ any later version.
+
+ You should have received a copy of the GNU General Public License
+ (for example /usr/src/linux/COPYING); if not, write to the Free
+ Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+*/
+
+#ifndef _MD_MD_H
+#define _MD_MD_H
+
+#include <linux/blkdev.h>
+#include <linux/kobject.h>
+#include <linux/list.h>
+#include <linux/mm.h>
+#include <linux/mutex.h>
+#include <linux/timer.h>
+#include <linux/wait.h>
+#include <linux/workqueue.h>
+#include "md-cluster.h"
+
+#define MaxSector (~(sector_t)0)
+
+/* Bad block numbers are stored sorted in a single page.
+ * 64bits is used for each block or extent.
+ * 54 bits are sector number, 9 bits are extent size,
+ * 1 bit is an 'acknowledged' flag.
+ */
+#define MD_MAX_BADBLOCKS (PAGE_SIZE/8)
+
+/*
+ * MD's 'extended' device
+ */
+struct md_rdev {
+ struct list_head same_set; /* RAID devices within the same set */
+
+ sector_t sectors; /* Device size (in 512bytes sectors) */
+ struct mddev *mddev; /* RAID array if running */
+ int last_events; /* IO event timestamp */
+
+ /*
+ * If meta_bdev is non-NULL, it means that a separate device is
+ * being used to store the metadata (superblock/bitmap) which
+ * would otherwise be contained on the same device as the data (bdev).
+ */
+ struct block_device *meta_bdev;
+ struct block_device *bdev; /* block device handle */
+
+ struct page *sb_page, *bb_page;
+ int sb_loaded;
+ __u64 sb_events;
+ sector_t data_offset; /* start of data in array */
+ sector_t new_data_offset;/* only relevant while reshaping */
+ sector_t sb_start; /* offset of the super block (in 512byte sectors) */
+ int sb_size; /* bytes in the superblock */
+ int preferred_minor; /* autorun support */
+
+ struct kobject kobj;
+
+ /* A device can be in one of three states based on two flags:
+ * Not working: faulty==1 in_sync==0
+ * Fully working: faulty==0 in_sync==1
+ * Working, but not
+ * in sync with array
+ * faulty==0 in_sync==0
+ *
+ * It can never have faulty==1, in_sync==1
+ * This reduces the burden of testing multiple flags in many cases
+ */
+
+ unsigned long flags; /* bit set of 'enum flag_bits' bits. */
+ wait_queue_head_t blocked_wait;
+
+ int desc_nr; /* descriptor index in the superblock */
+ int raid_disk; /* role of device in array */
+ int new_raid_disk; /* role that the device will have in
+ * the array after a level-change completes.
+ */
+ int saved_raid_disk; /* role that device used to have in the
+ * array and could again if we did a partial
+ * resync from the bitmap
+ */
+ sector_t recovery_offset;/* If this device has been partially
+ * recovered, this is where we were
+ * up to.
+ */
+
+ atomic_t nr_pending; /* number of pending requests.
+ * only maintained for arrays that
+ * support hot removal
+ */
+ atomic_t read_errors; /* number of consecutive read errors that
+ * we have tried to ignore.
+ */
+ struct timespec last_read_error; /* monotonic time since our
+ * last read error
+ */
+ atomic_t corrected_errors; /* number of corrected read errors,
+ * for reporting to userspace and storing
+ * in superblock.
+ */
+ struct work_struct del_work; /* used for delayed sysfs removal */
+
+ struct kernfs_node *sysfs_state; /* handle for 'state'
+ * sysfs entry */
+
+ struct badblocks {
+ int count; /* count of bad blocks */
+ int unacked_exist; /* there probably are unacknowledged
+ * bad blocks. This is only cleared
+ * when a read discovers none
+ */
+ int shift; /* shift from sectors to block size
+ * a -ve shift means badblocks are
+ * disabled.*/
+ u64 *page; /* badblock list */
+ int changed;
+ seqlock_t lock;
+
+ sector_t sector;
+ sector_t size; /* in sectors */
+ } badblocks;
+};
+enum flag_bits {
+ Faulty, /* device is known to have a fault */
+ In_sync, /* device is in_sync with rest of array */
+ Bitmap_sync, /* ..actually, not quite In_sync. Need a
+ * bitmap-based recovery to get fully in sync
+ */
+ Unmerged, /* device is being added to array and should
+ * be considerred for bvec_merge_fn but not
+ * yet for actual IO
+ */
+ WriteMostly, /* Avoid reading if at all possible */
+ AutoDetected, /* added by auto-detect */
+ Blocked, /* An error occurred but has not yet
+ * been acknowledged by the metadata
+ * handler, so don't allow writes
+ * until it is cleared */
+ WriteErrorSeen, /* A write error has been seen on this
+ * device
+ */
+ FaultRecorded, /* Intermediate state for clearing
+ * Blocked. The Fault is/will-be
+ * recorded in the metadata, but that
+ * metadata hasn't been stored safely
+ * on disk yet.
+ */
+ BlockedBadBlocks, /* A writer is blocked because they
+ * found an unacknowledged bad-block.
+ * This can safely be cleared at any
+ * time, and the writer will re-check.
+ * It may be set at any time, and at
+ * worst the writer will timeout and
+ * re-check. So setting it as
+ * accurately as possible is good, but
+ * not absolutely critical.
+ */
+ WantReplacement, /* This device is a candidate to be
+ * hot-replaced, either because it has
+ * reported some faults, or because
+ * of explicit request.
+ */
+ Replacement, /* This device is a replacement for
+ * a want_replacement device with same
+ * raid_disk number.
+ */
+ Candidate, /* For clustered environments only:
+ * This device is seen locally but not
+ * by the whole cluster
+ */
+};
+
+#define BB_LEN_MASK (0x00000000000001FFULL)
+#define BB_OFFSET_MASK (0x7FFFFFFFFFFFFE00ULL)
+#define BB_ACK_MASK (0x8000000000000000ULL)
+#define BB_MAX_LEN 512
+#define BB_OFFSET(x) (((x) & BB_OFFSET_MASK) >> 9)
+#define BB_LEN(x) (((x) & BB_LEN_MASK) + 1)
+#define BB_ACK(x) (!!((x) & BB_ACK_MASK))
+#define BB_MAKE(a, l, ack) (((a)<<9) | ((l)-1) | ((u64)(!!(ack)) << 63))
+
+extern int md_is_badblock(struct badblocks *bb, sector_t s, int sectors,
+ sector_t *first_bad, int *bad_sectors);
+static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors,
+ sector_t *first_bad, int *bad_sectors)
+{
+ if (unlikely(rdev->badblocks.count)) {
+ int rv = md_is_badblock(&rdev->badblocks, rdev->data_offset + s,
+ sectors,
+ first_bad, bad_sectors);
+ if (rv)
+ *first_bad -= rdev->data_offset;
+ return rv;
+ }
+ return 0;
+}
+extern int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
+ int is_new);
+extern int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
+ int is_new);
+extern void md_ack_all_badblocks(struct badblocks *bb);
+
+struct md_cluster_info;
+
+struct mddev {
+ void *private;
+ struct md_personality *pers;
+ dev_t unit;
+ int md_minor;
+ struct list_head disks;
+ unsigned long flags;
+#define MD_CHANGE_DEVS 0 /* Some device status has changed */
+#define MD_CHANGE_CLEAN 1 /* transition to or from 'clean' */
+#define MD_CHANGE_PENDING 2 /* switch from 'clean' to 'active' in progress */
+#define MD_UPDATE_SB_FLAGS (1 | 2 | 4) /* If these are set, md_update_sb needed */
+#define MD_ARRAY_FIRST_USE 3 /* First use of array, needs initialization */
+#define MD_STILL_CLOSED 4 /* If set, then array has not been opened since
+ * md_ioctl checked on it.
+ */
+
+ int suspended;
+ atomic_t active_io;
+ int ro;
+ int sysfs_active; /* set when sysfs deletes
+ * are happening, so run/
+ * takeover/stop are not safe
+ */
+ int ready; /* See when safe to pass
+ * IO requests down */
+ struct gendisk *gendisk;
+
+ struct kobject kobj;
+ int hold_active;
+#define UNTIL_IOCTL 1
+#define UNTIL_STOP 2
+
+ /* Superblock information */
+ int major_version,
+ minor_version,
+ patch_version;
+ int persistent;
+ int external; /* metadata is
+ * managed externally */
+ char metadata_type[17]; /* externally set*/
+ int chunk_sectors;
+ time_t ctime, utime;
+ int level, layout;
+ char clevel[16];
+ int raid_disks;
+ int max_disks;
+ sector_t dev_sectors; /* used size of
+ * component devices */
+ sector_t array_sectors; /* exported array size */
+ int external_size; /* size managed
+ * externally */
+ __u64 events;
+ /* If the last 'event' was simply a clean->dirty transition, and
+ * we didn't write it to the spares, then it is safe and simple
+ * to just decrement the event count on a dirty->clean transition.
+ * So we record that possibility here.
+ */
+ int can_decrease_events;
+
+ char uuid[16];
+
+ /* If the array is being reshaped, we need to record the
+ * new shape and an indication of where we are up to.
+ * This is written to the superblock.
+ * If reshape_position is MaxSector, then no reshape is happening (yet).
+ */
+ sector_t reshape_position;
+ int delta_disks, new_level, new_layout;
+ int new_chunk_sectors;
+ int reshape_backwards;
+
+ struct md_thread *thread; /* management thread */
+ struct md_thread *sync_thread; /* doing resync or reconstruct */
+
+ /* 'last_sync_action' is initialized to "none". It is set when a
+ * sync operation (i.e "data-check", "requested-resync", "resync",
+ * "recovery", or "reshape") is started. It holds this value even
+ * when the sync thread is "frozen" (interrupted) or "idle" (stopped
+ * or finished). It is overwritten when a new sync operation is begun.
+ */
+ char *last_sync_action;
+ sector_t curr_resync; /* last block scheduled */
+ /* As resync requests can complete out of order, we cannot easily track
+ * how much resync has been completed. So we occasionally pause until
+ * everything completes, then set curr_resync_completed to curr_resync.
+ * As such it may be well behind the real resync mark, but it is a value
+ * we are certain of.
+ */
+ sector_t curr_resync_completed;
+ unsigned long resync_mark; /* a recent timestamp */
+ sector_t resync_mark_cnt;/* blocks written at resync_mark */
+ sector_t curr_mark_cnt; /* blocks scheduled now */
+
+ sector_t resync_max_sectors; /* may be set by personality */
+
+ atomic64_t resync_mismatches; /* count of sectors where
+ * parity/replica mismatch found
+ */
+
+ /* allow user-space to request suspension of IO to regions of the array */
+ sector_t suspend_lo;
+ sector_t suspend_hi;
+ /* if zero, use the system-wide default */
+ int sync_speed_min;
+ int sync_speed_max;
+
+ /* resync even though the same disks are shared among md-devices */
+ int parallel_resync;
+
+ int ok_start_degraded;
+ /* recovery/resync flags
+ * NEEDED: we might need to start a resync/recover
+ * RUNNING: a thread is running, or about to be started
+ * SYNC: actually doing a resync, not a recovery
+ * RECOVER: doing recovery, or need to try it.
+ * INTR: resync needs to be aborted for some reason
+ * DONE: thread is done and is waiting to be reaped
+ * REQUEST: user-space has requested a sync (used with SYNC)
+ * CHECK: user-space request for check-only, no repair
+ * RESHAPE: A reshape is happening
+ * ERROR: sync-action interrupted because io-error
+ *
+ * If neither SYNC or RESHAPE are set, then it is a recovery.
+ */
+#define MD_RECOVERY_RUNNING 0
+#define MD_RECOVERY_SYNC 1
+#define MD_RECOVERY_RECOVER 2
+#define MD_RECOVERY_INTR 3
+#define MD_RECOVERY_DONE 4
+#define MD_RECOVERY_NEEDED 5
+#define MD_RECOVERY_REQUESTED 6
+#define MD_RECOVERY_CHECK 7
+#define MD_RECOVERY_RESHAPE 8
+#define MD_RECOVERY_FROZEN 9
+#define MD_RECOVERY_ERROR 10
+
+ unsigned long recovery;
+ /* If a RAID personality determines that recovery (of a particular
+ * device) will fail due to a read error on the source device, it
+ * takes a copy of this number and does not attempt recovery again
+ * until this number changes.
+ */
+ int recovery_disabled;
+
+ int in_sync; /* know to not need resync */
+ /* 'open_mutex' avoids races between 'md_open' and 'do_md_stop', so
+ * that we are never stopping an array while it is open.
+ * 'reconfig_mutex' protects all other reconfiguration.
+ * These locks are separate due to conflicting interactions
+ * with bdev->bd_mutex.
+ * Lock ordering is:
+ * reconfig_mutex -> bd_mutex : e.g. do_md_run -> revalidate_disk
+ * bd_mutex -> open_mutex: e.g. __blkdev_get -> md_open
+ */
+ struct mutex open_mutex;
+ struct mutex reconfig_mutex;
+ atomic_t active; /* general refcount */
+ atomic_t openers; /* number of active opens */
+
+ int changed; /* True if we might need to
+ * reread partition info */
+ int degraded; /* whether md should consider
+ * adding a spare
+ */
+ int merge_check_needed; /* at least one
+ * member device
+ * has a
+ * merge_bvec_fn */
+
+ atomic_t recovery_active; /* blocks scheduled, but not written */
+ wait_queue_head_t recovery_wait;
+ sector_t recovery_cp;
+ sector_t resync_min; /* user requested sync
+ * starts here */
+ sector_t resync_max; /* resync should pause
+ * when it gets here */
+
+ struct kernfs_node *sysfs_state; /* handle for 'array_state'
+ * file in sysfs.
+ */
+ struct kernfs_node *sysfs_action; /* handle for 'sync_action' */
+
+ struct work_struct del_work; /* used for delayed sysfs removal */
+
+ /* "lock" protects:
+ * flush_bio transition from NULL to !NULL
+ * rdev superblocks, events
+ * clearing MD_CHANGE_*
+ * in_sync - and related safemode and MD_CHANGE changes
+ * pers (also protected by reconfig_mutex and pending IO).
+ * clearing ->bitmap
+ * clearing ->bitmap_info.file
+ * changing ->resync_{min,max}
+ * setting MD_RECOVERY_RUNNING (which interacts with resync_{min,max})
+ */
+ spinlock_t lock;
+ wait_queue_head_t sb_wait; /* for waiting on superblock updates */
+ atomic_t pending_writes; /* number of active superblock writes */
+
+ unsigned int safemode; /* if set, update "clean" superblock
+ * when no writes pending.
+ */
+ unsigned int safemode_delay;
+ struct timer_list safemode_timer;
+ atomic_t writes_pending;
+ struct request_queue *queue; /* for plugging ... */
+
+ struct bitmap *bitmap; /* the bitmap for the device */
+ struct {
+ struct file *file; /* the bitmap file */
+ loff_t offset; /* offset from superblock of
+ * start of bitmap. May be
+ * negative, but not '0'
+ * For external metadata, offset
+ * from start of device.
+ */
+ unsigned long space; /* space available at this offset */
+ loff_t default_offset; /* this is the offset to use when
+ * hot-adding a bitmap. It should
+ * eventually be settable by sysfs.
+ */
+ unsigned long default_space; /* space available at
+ * default offset */
+ struct mutex mutex;
+ unsigned long chunksize;
+ unsigned long daemon_sleep; /* how many jiffies between updates? */
+ unsigned long max_write_behind; /* write-behind mode */
+ int external;
+ int nodes; /* Maximum number of nodes in the cluster */
+ char cluster_name[64]; /* Name of the cluster */
+ } bitmap_info;
+
+ atomic_t max_corr_read_errors; /* max read retries */
+ struct list_head all_mddevs;
+
+ struct attribute_group *to_remove;
+
+ struct bio_set *bio_set;
+
+ /* Generic flush handling.
+ * The last to finish preflush schedules a worker to submit
+ * the rest of the request (without the REQ_FLUSH flag).
+ */
+ struct bio *flush_bio;
+ atomic_t flush_pending;
+ struct work_struct flush_work;
+ struct work_struct event_work; /* used by dm to report failure event */
+ void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev);
+ struct md_cluster_info *cluster_info;
+};
+
+static inline int __must_check mddev_lock(struct mddev *mddev)
+{
+ return mutex_lock_interruptible(&mddev->reconfig_mutex);
+}
+
+/* Sometimes we need to take the lock in a situation where
+ * failure due to interrupts is not acceptable.
+ */
+static inline void mddev_lock_nointr(struct mddev *mddev)
+{
+ mutex_lock(&mddev->reconfig_mutex);
+}
+
+static inline int mddev_is_locked(struct mddev *mddev)
+{
+ return mutex_is_locked(&mddev->reconfig_mutex);
+}
+
+static inline int mddev_trylock(struct mddev *mddev)
+{
+ return mutex_trylock(&mddev->reconfig_mutex);
+}
+extern void mddev_unlock(struct mddev *mddev);
+
+static inline void md_sync_acct(struct block_device *bdev, unsigned long nr_sectors)
+{
+ atomic_add(nr_sectors, &bdev->bd_contains->bd_disk->sync_io);
+}
+
+struct md_personality
+{
+ char *name;
+ int level;
+ struct list_head list;
+ struct module *owner;
+ void (*make_request)(struct mddev *mddev, struct bio *bio);
+ int (*run)(struct mddev *mddev);
+ void (*free)(struct mddev *mddev, void *priv);
+ void (*status)(struct seq_file *seq, struct mddev *mddev);
+ /* error_handler must set ->faulty and clear ->in_sync
+ * if appropriate, and should abort recovery if needed
+ */
+ void (*error_handler)(struct mddev *mddev, struct md_rdev *rdev);
+ int (*hot_add_disk) (struct mddev *mddev, struct md_rdev *rdev);
+ int (*hot_remove_disk) (struct mddev *mddev, struct md_rdev *rdev);
+ int (*spare_active) (struct mddev *mddev);
+ sector_t (*sync_request)(struct mddev *mddev, sector_t sector_nr, int *skipped);
+ int (*resize) (struct mddev *mddev, sector_t sectors);
+ sector_t (*size) (struct mddev *mddev, sector_t sectors, int raid_disks);
+ int (*check_reshape) (struct mddev *mddev);
+ int (*start_reshape) (struct mddev *mddev);
+ void (*finish_reshape) (struct mddev *mddev);
+ /* quiesce moves between quiescence states
+ * 0 - fully active
+ * 1 - no new requests allowed
+ * others - reserved
+ */
+ void (*quiesce) (struct mddev *mddev, int state);
+ /* takeover is used to transition an array from one
+ * personality to another. The new personality must be able
+ * to handle the data in the current layout.
+ * e.g. 2drive raid1 -> 2drive raid5
+ * ndrive raid5 -> degraded n+1drive raid6 with special layout
+ * If the takeover succeeds, a new 'private' structure is returned.
+ * This needs to be installed and then ->run used to activate the
+ * array.
+ */
+ void *(*takeover) (struct mddev *mddev);
+ /* congested implements bdi.congested_fn().
+ * Will not be called while array is 'suspended' */
+ int (*congested)(struct mddev *mddev, int bits);
+ /* mergeable_bvec is use to implement ->merge_bvec_fn */
+ int (*mergeable_bvec)(struct mddev *mddev,
+ struct bvec_merge_data *bvm,
+ struct bio_vec *biovec);
+};
+
+struct md_sysfs_entry {
+ struct attribute attr;
+ ssize_t (*show)(struct mddev *, char *);
+ ssize_t (*store)(struct mddev *, const char *, size_t);
+};
+extern struct attribute_group md_bitmap_group;
+
+static inline struct kernfs_node *sysfs_get_dirent_safe(struct kernfs_node *sd, char *name)
+{
+ if (sd)
+ return sysfs_get_dirent(sd, name);
+ return sd;
+}
+static inline void sysfs_notify_dirent_safe(struct kernfs_node *sd)
+{
+ if (sd)
+ sysfs_notify_dirent(sd);
+}
+
+static inline char * mdname (struct mddev * mddev)
+{
+ return mddev->gendisk ? mddev->gendisk->disk_name : "mdX";
+}
+
+static inline int sysfs_link_rdev(struct mddev *mddev, struct md_rdev *rdev)
+{
+ char nm[20];
+ if (!test_bit(Replacement, &rdev->flags) && mddev->kobj.sd) {
+ sprintf(nm, "rd%d", rdev->raid_disk);
+ return sysfs_create_link(&mddev->kobj, &rdev->kobj, nm);
+ } else
+ return 0;
+}
+
+static inline void sysfs_unlink_rdev(struct mddev *mddev, struct md_rdev *rdev)
+{
+ char nm[20];
+ if (!test_bit(Replacement, &rdev->flags) && mddev->kobj.sd) {
+ sprintf(nm, "rd%d", rdev->raid_disk);
+ sysfs_remove_link(&mddev->kobj, nm);
+ }
+}
+
+/*
+ * iterates through some rdev ringlist. It's safe to remove the
+ * current 'rdev'. Dont touch 'tmp' though.
+ */
+#define rdev_for_each_list(rdev, tmp, head) \
+ list_for_each_entry_safe(rdev, tmp, head, same_set)
+
+/*
+ * iterates through the 'same array disks' ringlist
+ */
+#define rdev_for_each(rdev, mddev) \
+ list_for_each_entry(rdev, &((mddev)->disks), same_set)
+
+#define rdev_for_each_safe(rdev, tmp, mddev) \
+ list_for_each_entry_safe(rdev, tmp, &((mddev)->disks), same_set)
+
+#define rdev_for_each_rcu(rdev, mddev) \
+ list_for_each_entry_rcu(rdev, &((mddev)->disks), same_set)
+
+struct md_thread {
+ void (*run) (struct md_thread *thread);
+ struct mddev *mddev;
+ wait_queue_head_t wqueue;
+ unsigned long flags;
+ struct task_struct *tsk;
+ unsigned long timeout;
+ void *private;
+};
+
+#define THREAD_WAKEUP 0
+
+static inline void safe_put_page(struct page *p)
+{
+ if (p) put_page(p);
+}
+
+extern int register_md_personality(struct md_personality *p);
+extern int unregister_md_personality(struct md_personality *p);
+extern int register_md_cluster_operations(struct md_cluster_operations *ops,
+ struct module *module);
+extern int unregister_md_cluster_operations(void);
+extern int md_setup_cluster(struct mddev *mddev, int nodes);
+extern void md_cluster_stop(struct mddev *mddev);
+extern struct md_thread *md_register_thread(
+ void (*run)(struct md_thread *thread),
+ struct mddev *mddev,
+ const char *name);
+extern void md_unregister_thread(struct md_thread **threadp);
+extern void md_wakeup_thread(struct md_thread *thread);
+extern void md_check_recovery(struct mddev *mddev);
+extern void md_reap_sync_thread(struct mddev *mddev);
+extern void md_write_start(struct mddev *mddev, struct bio *bi);
+extern void md_write_end(struct mddev *mddev);
+extern void md_done_sync(struct mddev *mddev, int blocks, int ok);
+extern void md_error(struct mddev *mddev, struct md_rdev *rdev);
+extern void md_finish_reshape(struct mddev *mddev);
+
+extern int mddev_congested(struct mddev *mddev, int bits);
+extern void md_flush_request(struct mddev *mddev, struct bio *bio);
+extern void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
+ sector_t sector, int size, struct page *page);
+extern void md_super_wait(struct mddev *mddev);
+extern int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
+ struct page *page, int rw, bool metadata_op);
+extern void md_do_sync(struct md_thread *thread);
+extern void md_new_event(struct mddev *mddev);
+extern int md_allow_write(struct mddev *mddev);
+extern void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev);
+extern void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors);
+extern int md_check_no_bitmap(struct mddev *mddev);
+extern int md_integrity_register(struct mddev *mddev);
+extern void md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev);
+extern int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale);
+
+extern void mddev_init(struct mddev *mddev);
+extern int md_run(struct mddev *mddev);
+extern void md_stop(struct mddev *mddev);
+extern void md_stop_writes(struct mddev *mddev);
+extern int md_rdev_init(struct md_rdev *rdev);
+extern void md_rdev_clear(struct md_rdev *rdev);
+
+extern void mddev_suspend(struct mddev *mddev);
+extern void mddev_resume(struct mddev *mddev);
+extern struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask,
+ struct mddev *mddev);
+extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
+ struct mddev *mddev);
+
+extern void md_unplug(struct blk_plug_cb *cb, bool from_schedule);
+extern void md_reload_sb(struct mddev *mddev);
+extern void md_update_sb(struct mddev *mddev, int force);
+extern void md_kick_rdev_from_array(struct md_rdev * rdev);
+struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr);
+static inline int mddev_check_plugged(struct mddev *mddev)
+{
+ return !!blk_check_plugged(md_unplug, mddev,
+ sizeof(struct blk_plug_cb));
+}
+
+static inline void rdev_dec_pending(struct md_rdev *rdev, struct mddev *mddev)
+{
+ int faulty = test_bit(Faulty, &rdev->flags);
+ if (atomic_dec_and_test(&rdev->nr_pending) && faulty) {
+ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+ md_wakeup_thread(mddev->thread);
+ }
+}
+
+extern struct md_cluster_operations *md_cluster_ops;
+static inline int mddev_is_clustered(struct mddev *mddev)
+{
+ return mddev->cluster_info && mddev->bitmap_info.nodes > 1;
+}
+#endif /* _MD_MD_H */
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
new file mode 100644
index 000000000..ac3ede2bd
--- /dev/null
+++ b/drivers/md/multipath.c
@@ -0,0 +1,544 @@
+/*
+ * multipath.c : Multiple Devices driver for Linux
+ *
+ * Copyright (C) 1999, 2000, 2001 Ingo Molnar, Red Hat
+ *
+ * Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman
+ *
+ * MULTIPATH management functions.
+ *
+ * derived from raid1.c.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * You should have received a copy of the GNU General Public License
+ * (for example /usr/src/linux/COPYING); if not, write to the Free
+ * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/blkdev.h>
+#include <linux/module.h>
+#include <linux/raid/md_u.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include "md.h"
+#include "multipath.h"
+
+#define MAX_WORK_PER_DISK 128
+
+#define NR_RESERVED_BUFS 32
+
+static int multipath_map (struct mpconf *conf)
+{
+ int i, disks = conf->raid_disks;
+
+ /*
+ * Later we do read balancing on the read side
+ * now we use the first available disk.
+ */
+
+ rcu_read_lock();
+ for (i = 0; i < disks; i++) {
+ struct md_rdev *rdev = rcu_dereference(conf->multipaths[i].rdev);
+ if (rdev && test_bit(In_sync, &rdev->flags)) {
+ atomic_inc(&rdev->nr_pending);
+ rcu_read_unlock();
+ return i;
+ }
+ }
+ rcu_read_unlock();
+
+ printk(KERN_ERR "multipath_map(): no more operational IO paths?\n");
+ return (-1);
+}
+
+static void multipath_reschedule_retry (struct multipath_bh *mp_bh)
+{
+ unsigned long flags;
+ struct mddev *mddev = mp_bh->mddev;
+ struct mpconf *conf = mddev->private;
+
+ spin_lock_irqsave(&conf->device_lock, flags);
+ list_add(&mp_bh->retry_list, &conf->retry_list);
+ spin_unlock_irqrestore(&conf->device_lock, flags);
+ md_wakeup_thread(mddev->thread);
+}
+
+/*
+ * multipath_end_bh_io() is called when we have finished servicing a multipathed
+ * operation and are ready to return a success/failure code to the buffer
+ * cache layer.
+ */
+static void multipath_end_bh_io (struct multipath_bh *mp_bh, int err)
+{
+ struct bio *bio = mp_bh->master_bio;
+ struct mpconf *conf = mp_bh->mddev->private;
+
+ bio_endio(bio, err);
+ mempool_free(mp_bh, conf->pool);
+}
+
+static void multipath_end_request(struct bio *bio, int error)
+{
+ int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+ struct multipath_bh *mp_bh = bio->bi_private;
+ struct mpconf *conf = mp_bh->mddev->private;
+ struct md_rdev *rdev = conf->multipaths[mp_bh->path].rdev;
+
+ if (uptodate)
+ multipath_end_bh_io(mp_bh, 0);
+ else if (!(bio->bi_rw & REQ_RAHEAD)) {
+ /*
+ * oops, IO error:
+ */
+ char b[BDEVNAME_SIZE];
+ md_error (mp_bh->mddev, rdev);
+ printk(KERN_ERR "multipath: %s: rescheduling sector %llu\n",
+ bdevname(rdev->bdev,b),
+ (unsigned long long)bio->bi_iter.bi_sector);
+ multipath_reschedule_retry(mp_bh);
+ } else
+ multipath_end_bh_io(mp_bh, error);
+ rdev_dec_pending(rdev, conf->mddev);
+}
+
+static void multipath_make_request(struct mddev *mddev, struct bio * bio)
+{
+ struct mpconf *conf = mddev->private;
+ struct multipath_bh * mp_bh;
+ struct multipath_info *multipath;
+
+ if (unlikely(bio->bi_rw & REQ_FLUSH)) {
+ md_flush_request(mddev, bio);
+ return;
+ }
+
+ mp_bh = mempool_alloc(conf->pool, GFP_NOIO);
+
+ mp_bh->master_bio = bio;
+ mp_bh->mddev = mddev;
+
+ mp_bh->path = multipath_map(conf);
+ if (mp_bh->path < 0) {
+ bio_endio(bio, -EIO);
+ mempool_free(mp_bh, conf->pool);
+ return;
+ }
+ multipath = conf->multipaths + mp_bh->path;
+
+ mp_bh->bio = *bio;
+ mp_bh->bio.bi_iter.bi_sector += multipath->rdev->data_offset;
+ mp_bh->bio.bi_bdev = multipath->rdev->bdev;
+ mp_bh->bio.bi_rw |= REQ_FAILFAST_TRANSPORT;
+ mp_bh->bio.bi_end_io = multipath_end_request;
+ mp_bh->bio.bi_private = mp_bh;
+ generic_make_request(&mp_bh->bio);
+ return;
+}
+
+static void multipath_status (struct seq_file *seq, struct mddev *mddev)
+{
+ struct mpconf *conf = mddev->private;
+ int i;
+
+ seq_printf (seq, " [%d/%d] [", conf->raid_disks,
+ conf->raid_disks - mddev->degraded);
+ for (i = 0; i < conf->raid_disks; i++)
+ seq_printf (seq, "%s",
+ conf->multipaths[i].rdev &&
+ test_bit(In_sync, &conf->multipaths[i].rdev->flags) ? "U" : "_");
+ seq_printf (seq, "]");
+}
+
+static int multipath_congested(struct mddev *mddev, int bits)
+{
+ struct mpconf *conf = mddev->private;
+ int i, ret = 0;
+
+ rcu_read_lock();
+ for (i = 0; i < mddev->raid_disks ; i++) {
+ struct md_rdev *rdev = rcu_dereference(conf->multipaths[i].rdev);
+ if (rdev && !test_bit(Faulty, &rdev->flags)) {
+ struct request_queue *q = bdev_get_queue(rdev->bdev);
+
+ ret |= bdi_congested(&q->backing_dev_info, bits);
+ /* Just like multipath_map, we just check the
+ * first available device
+ */
+ break;
+ }
+ }
+ rcu_read_unlock();
+ return ret;
+}
+
+/*
+ * Careful, this can execute in IRQ contexts as well!
+ */
+static void multipath_error (struct mddev *mddev, struct md_rdev *rdev)
+{
+ struct mpconf *conf = mddev->private;
+ char b[BDEVNAME_SIZE];
+
+ if (conf->raid_disks - mddev->degraded <= 1) {
+ /*
+ * Uh oh, we can do nothing if this is our last path, but
+ * first check if this is a queued request for a device
+ * which has just failed.
+ */
+ printk(KERN_ALERT
+ "multipath: only one IO path left and IO error.\n");
+ /* leave it active... it's all we have */
+ return;
+ }
+ /*
+ * Mark disk as unusable
+ */
+ if (test_and_clear_bit(In_sync, &rdev->flags)) {
+ unsigned long flags;
+ spin_lock_irqsave(&conf->device_lock, flags);
+ mddev->degraded++;
+ spin_unlock_irqrestore(&conf->device_lock, flags);
+ }
+ set_bit(Faulty, &rdev->flags);
+ set_bit(MD_CHANGE_DEVS, &mddev->flags);
+ printk(KERN_ALERT "multipath: IO failure on %s,"
+ " disabling IO path.\n"
+ "multipath: Operation continuing"
+ " on %d IO paths.\n",
+ bdevname(rdev->bdev, b),
+ conf->raid_disks - mddev->degraded);
+}
+
+static void print_multipath_conf (struct mpconf *conf)
+{
+ int i;
+ struct multipath_info *tmp;
+
+ printk("MULTIPATH conf printout:\n");
+ if (!conf) {
+ printk("(conf==NULL)\n");
+ return;
+ }
+ printk(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded,
+ conf->raid_disks);
+
+ for (i = 0; i < conf->raid_disks; i++) {
+ char b[BDEVNAME_SIZE];
+ tmp = conf->multipaths + i;
+ if (tmp->rdev)
+ printk(" disk%d, o:%d, dev:%s\n",
+ i,!test_bit(Faulty, &tmp->rdev->flags),
+ bdevname(tmp->rdev->bdev,b));
+ }
+}
+
+static int multipath_add_disk(struct mddev *mddev, struct md_rdev *rdev)
+{
+ struct mpconf *conf = mddev->private;
+ struct request_queue *q;
+ int err = -EEXIST;
+ int path;
+ struct multipath_info *p;
+ int first = 0;
+ int last = mddev->raid_disks - 1;
+
+ if (rdev->raid_disk >= 0)
+ first = last = rdev->raid_disk;
+
+ print_multipath_conf(conf);
+
+ for (path = first; path <= last; path++)
+ if ((p=conf->multipaths+path)->rdev == NULL) {
+ q = rdev->bdev->bd_disk->queue;
+ disk_stack_limits(mddev->gendisk, rdev->bdev,
+ rdev->data_offset << 9);
+
+ /* as we don't honour merge_bvec_fn, we must never risk
+ * violating it, so limit ->max_segments to one, lying
+ * within a single page.
+ * (Note: it is very unlikely that a device with
+ * merge_bvec_fn will be involved in multipath.)
+ */
+ if (q->merge_bvec_fn) {
+ blk_queue_max_segments(mddev->queue, 1);
+ blk_queue_segment_boundary(mddev->queue,
+ PAGE_CACHE_SIZE - 1);
+ }
+
+ spin_lock_irq(&conf->device_lock);
+ mddev->degraded--;
+ rdev->raid_disk = path;
+ set_bit(In_sync, &rdev->flags);
+ spin_unlock_irq(&conf->device_lock);
+ rcu_assign_pointer(p->rdev, rdev);
+ err = 0;
+ md_integrity_add_rdev(rdev, mddev);
+ break;
+ }
+
+ print_multipath_conf(conf);
+
+ return err;
+}
+
+static int multipath_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
+{
+ struct mpconf *conf = mddev->private;
+ int err = 0;
+ int number = rdev->raid_disk;
+ struct multipath_info *p = conf->multipaths + number;
+
+ print_multipath_conf(conf);
+
+ if (rdev == p->rdev) {
+ if (test_bit(In_sync, &rdev->flags) ||
+ atomic_read(&rdev->nr_pending)) {
+ printk(KERN_ERR "hot-remove-disk, slot %d is identified"
+ " but is still operational!\n", number);
+ err = -EBUSY;
+ goto abort;
+ }
+ p->rdev = NULL;
+ synchronize_rcu();
+ if (atomic_read(&rdev->nr_pending)) {
+ /* lost the race, try later */
+ err = -EBUSY;
+ p->rdev = rdev;
+ goto abort;
+ }
+ err = md_integrity_register(mddev);
+ }
+abort:
+
+ print_multipath_conf(conf);
+ return err;
+}
+
+/*
+ * This is a kernel thread which:
+ *
+ * 1. Retries failed read operations on working multipaths.
+ * 2. Updates the raid superblock when problems encounter.
+ * 3. Performs writes following reads for array syncronising.
+ */
+
+static void multipathd(struct md_thread *thread)
+{
+ struct mddev *mddev = thread->mddev;
+ struct multipath_bh *mp_bh;
+ struct bio *bio;
+ unsigned long flags;
+ struct mpconf *conf = mddev->private;
+ struct list_head *head = &conf->retry_list;
+
+ md_check_recovery(mddev);
+ for (;;) {
+ char b[BDEVNAME_SIZE];
+ spin_lock_irqsave(&conf->device_lock, flags);
+ if (list_empty(head))
+ break;
+ mp_bh = list_entry(head->prev, struct multipath_bh, retry_list);
+ list_del(head->prev);
+ spin_unlock_irqrestore(&conf->device_lock, flags);
+
+ bio = &mp_bh->bio;
+ bio->bi_iter.bi_sector = mp_bh->master_bio->bi_iter.bi_sector;
+
+ if ((mp_bh->path = multipath_map (conf))<0) {
+ printk(KERN_ALERT "multipath: %s: unrecoverable IO read"
+ " error for block %llu\n",
+ bdevname(bio->bi_bdev,b),
+ (unsigned long long)bio->bi_iter.bi_sector);
+ multipath_end_bh_io(mp_bh, -EIO);
+ } else {
+ printk(KERN_ERR "multipath: %s: redirecting sector %llu"
+ " to another IO path\n",
+ bdevname(bio->bi_bdev,b),
+ (unsigned long long)bio->bi_iter.bi_sector);
+ *bio = *(mp_bh->master_bio);
+ bio->bi_iter.bi_sector +=
+ conf->multipaths[mp_bh->path].rdev->data_offset;
+ bio->bi_bdev = conf->multipaths[mp_bh->path].rdev->bdev;
+ bio->bi_rw |= REQ_FAILFAST_TRANSPORT;
+ bio->bi_end_io = multipath_end_request;
+ bio->bi_private = mp_bh;
+ generic_make_request(bio);
+ }
+ }
+ spin_unlock_irqrestore(&conf->device_lock, flags);
+}
+
+static sector_t multipath_size(struct mddev *mddev, sector_t sectors, int raid_disks)
+{
+ WARN_ONCE(sectors || raid_disks,
+ "%s does not support generic reshape\n", __func__);
+
+ return mddev->dev_sectors;
+}
+
+static int multipath_run (struct mddev *mddev)
+{
+ struct mpconf *conf;
+ int disk_idx;
+ struct multipath_info *disk;
+ struct md_rdev *rdev;
+ int working_disks;
+
+ if (md_check_no_bitmap(mddev))
+ return -EINVAL;
+
+ if (mddev->level != LEVEL_MULTIPATH) {
+ printk("multipath: %s: raid level not set to multipath IO (%d)\n",
+ mdname(mddev), mddev->level);
+ goto out;
+ }
+ /*
+ * copy the already verified devices into our private MULTIPATH
+ * bookkeeping area. [whatever we allocate in multipath_run(),
+ * should be freed in multipath_free()]
+ */
+
+ conf = kzalloc(sizeof(struct mpconf), GFP_KERNEL);
+ mddev->private = conf;
+ if (!conf) {
+ printk(KERN_ERR
+ "multipath: couldn't allocate memory for %s\n",
+ mdname(mddev));
+ goto out;
+ }
+
+ conf->multipaths = kzalloc(sizeof(struct multipath_info)*mddev->raid_disks,
+ GFP_KERNEL);
+ if (!conf->multipaths) {
+ printk(KERN_ERR
+ "multipath: couldn't allocate memory for %s\n",
+ mdname(mddev));
+ goto out_free_conf;
+ }
+
+ working_disks = 0;
+ rdev_for_each(rdev, mddev) {
+ disk_idx = rdev->raid_disk;
+ if (disk_idx < 0 ||
+ disk_idx >= mddev->raid_disks)
+ continue;
+
+ disk = conf->multipaths + disk_idx;
+ disk->rdev = rdev;
+ disk_stack_limits(mddev->gendisk, rdev->bdev,
+ rdev->data_offset << 9);
+
+ /* as we don't honour merge_bvec_fn, we must never risk
+ * violating it, not that we ever expect a device with
+ * a merge_bvec_fn to be involved in multipath */
+ if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
+ blk_queue_max_segments(mddev->queue, 1);
+ blk_queue_segment_boundary(mddev->queue,
+ PAGE_CACHE_SIZE - 1);
+ }
+
+ if (!test_bit(Faulty, &rdev->flags))
+ working_disks++;
+ }
+
+ conf->raid_disks = mddev->raid_disks;
+ conf->mddev = mddev;
+ spin_lock_init(&conf->device_lock);
+ INIT_LIST_HEAD(&conf->retry_list);
+
+ if (!working_disks) {
+ printk(KERN_ERR "multipath: no operational IO paths for %s\n",
+ mdname(mddev));
+ goto out_free_conf;
+ }
+ mddev->degraded = conf->raid_disks - working_disks;
+
+ conf->pool = mempool_create_kmalloc_pool(NR_RESERVED_BUFS,
+ sizeof(struct multipath_bh));
+ if (conf->pool == NULL) {
+ printk(KERN_ERR
+ "multipath: couldn't allocate memory for %s\n",
+ mdname(mddev));
+ goto out_free_conf;
+ }
+
+ {
+ mddev->thread = md_register_thread(multipathd, mddev,
+ "multipath");
+ if (!mddev->thread) {
+ printk(KERN_ERR "multipath: couldn't allocate thread"
+ " for %s\n", mdname(mddev));
+ goto out_free_conf;
+ }
+ }
+
+ printk(KERN_INFO
+ "multipath: array %s active with %d out of %d IO paths\n",
+ mdname(mddev), conf->raid_disks - mddev->degraded,
+ mddev->raid_disks);
+ /*
+ * Ok, everything is just fine now
+ */
+ md_set_array_sectors(mddev, multipath_size(mddev, 0, 0));
+
+ if (md_integrity_register(mddev))
+ goto out_free_conf;
+
+ return 0;
+
+out_free_conf:
+ if (conf->pool)
+ mempool_destroy(conf->pool);
+ kfree(conf->multipaths);
+ kfree(conf);
+ mddev->private = NULL;
+out:
+ return -EIO;
+}
+
+static void multipath_free(struct mddev *mddev, void *priv)
+{
+ struct mpconf *conf = priv;
+
+ mempool_destroy(conf->pool);
+ kfree(conf->multipaths);
+ kfree(conf);
+}
+
+static struct md_personality multipath_personality =
+{
+ .name = "multipath",
+ .level = LEVEL_MULTIPATH,
+ .owner = THIS_MODULE,
+ .make_request = multipath_make_request,
+ .run = multipath_run,
+ .free = multipath_free,
+ .status = multipath_status,
+ .error_handler = multipath_error,
+ .hot_add_disk = multipath_add_disk,
+ .hot_remove_disk= multipath_remove_disk,
+ .size = multipath_size,
+ .congested = multipath_congested,
+};
+
+static int __init multipath_init (void)
+{
+ return register_md_personality (&multipath_personality);
+}
+
+static void __exit multipath_exit (void)
+{
+ unregister_md_personality (&multipath_personality);
+}
+
+module_init(multipath_init);
+module_exit(multipath_exit);
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("simple multi-path personality for MD");
+MODULE_ALIAS("md-personality-7"); /* MULTIPATH */
+MODULE_ALIAS("md-multipath");
+MODULE_ALIAS("md-level--4");
diff --git a/drivers/md/multipath.h b/drivers/md/multipath.h
new file mode 100644
index 000000000..717c60f62
--- /dev/null
+++ b/drivers/md/multipath.h
@@ -0,0 +1,31 @@
+#ifndef _MULTIPATH_H
+#define _MULTIPATH_H
+
+struct multipath_info {
+ struct md_rdev *rdev;
+};
+
+struct mpconf {
+ struct mddev *mddev;
+ struct multipath_info *multipaths;
+ int raid_disks;
+ spinlock_t device_lock;
+ struct list_head retry_list;
+
+ mempool_t *pool;
+};
+
+/*
+ * this is our 'private' 'collective' MULTIPATH buffer head.
+ * it contains information about what kind of IO operations were started
+ * for this MULTIPATH operation, and about their status:
+ */
+
+struct multipath_bh {
+ struct mddev *mddev;
+ struct bio *master_bio;
+ struct bio bio;
+ int path;
+ struct list_head retry_list;
+};
+#endif
diff --git a/drivers/md/persistent-data/Kconfig b/drivers/md/persistent-data/Kconfig
new file mode 100644
index 000000000..78c74bb71
--- /dev/null
+++ b/drivers/md/persistent-data/Kconfig
@@ -0,0 +1,18 @@
+config DM_PERSISTENT_DATA
+ tristate
+ depends on BLK_DEV_DM
+ select LIBCRC32C
+ select DM_BUFIO
+ ---help---
+ Library providing immutable on-disk data structure support for
+ device-mapper targets such as the thin provisioning target.
+
+config DM_DEBUG_BLOCK_STACK_TRACING
+ bool "Keep stack trace of persistent data block lock holders"
+ depends on STACKTRACE_SUPPORT && DM_PERSISTENT_DATA
+ select STACKTRACE
+ ---help---
+ Enable this for messages that may help debug problems with the
+ block manager locking used by thin provisioning and caching.
+
+ If unsure, say N.
diff --git a/drivers/md/persistent-data/Makefile b/drivers/md/persistent-data/Makefile
new file mode 100644
index 000000000..ff528792c
--- /dev/null
+++ b/drivers/md/persistent-data/Makefile
@@ -0,0 +1,12 @@
+obj-$(CONFIG_DM_PERSISTENT_DATA) += dm-persistent-data.o
+dm-persistent-data-objs := \
+ dm-array.o \
+ dm-bitset.o \
+ dm-block-manager.o \
+ dm-space-map-common.o \
+ dm-space-map-disk.o \
+ dm-space-map-metadata.o \
+ dm-transaction-manager.o \
+ dm-btree.o \
+ dm-btree-remove.o \
+ dm-btree-spine.o
diff --git a/drivers/md/persistent-data/dm-array.c b/drivers/md/persistent-data/dm-array.c
new file mode 100644
index 000000000..e64b61ad0
--- /dev/null
+++ b/drivers/md/persistent-data/dm-array.c
@@ -0,0 +1,821 @@
+/*
+ * Copyright (C) 2012 Red Hat, Inc.
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm-array.h"
+#include "dm-space-map.h"
+#include "dm-transaction-manager.h"
+
+#include <linux/export.h>
+#include <linux/device-mapper.h>
+
+#define DM_MSG_PREFIX "array"
+
+/*----------------------------------------------------------------*/
+
+/*
+ * The array is implemented as a fully populated btree, which points to
+ * blocks that contain the packed values. This is more space efficient
+ * than just using a btree since we don't store 1 key per value.
+ */
+struct array_block {
+ __le32 csum;
+ __le32 max_entries;
+ __le32 nr_entries;
+ __le32 value_size;
+ __le64 blocknr; /* Block this node is supposed to live in. */
+} __packed;
+
+/*----------------------------------------------------------------*/
+
+/*
+ * Validator methods. As usual we calculate a checksum, and also write the
+ * block location into the header (paranoia about ssds remapping areas by
+ * mistake).
+ */
+#define CSUM_XOR 595846735
+
+static void array_block_prepare_for_write(struct dm_block_validator *v,
+ struct dm_block *b,
+ size_t size_of_block)
+{
+ struct array_block *bh_le = dm_block_data(b);
+
+ bh_le->blocknr = cpu_to_le64(dm_block_location(b));
+ bh_le->csum = cpu_to_le32(dm_bm_checksum(&bh_le->max_entries,
+ size_of_block - sizeof(__le32),
+ CSUM_XOR));
+}
+
+static int array_block_check(struct dm_block_validator *v,
+ struct dm_block *b,
+ size_t size_of_block)
+{
+ struct array_block *bh_le = dm_block_data(b);
+ __le32 csum_disk;
+
+ if (dm_block_location(b) != le64_to_cpu(bh_le->blocknr)) {
+ DMERR_LIMIT("array_block_check failed: blocknr %llu != wanted %llu",
+ (unsigned long long) le64_to_cpu(bh_le->blocknr),
+ (unsigned long long) dm_block_location(b));
+ return -ENOTBLK;
+ }
+
+ csum_disk = cpu_to_le32(dm_bm_checksum(&bh_le->max_entries,
+ size_of_block - sizeof(__le32),
+ CSUM_XOR));
+ if (csum_disk != bh_le->csum) {
+ DMERR_LIMIT("array_block_check failed: csum %u != wanted %u",
+ (unsigned) le32_to_cpu(csum_disk),
+ (unsigned) le32_to_cpu(bh_le->csum));
+ return -EILSEQ;
+ }
+
+ return 0;
+}
+
+static struct dm_block_validator array_validator = {
+ .name = "array",
+ .prepare_for_write = array_block_prepare_for_write,
+ .check = array_block_check
+};
+
+/*----------------------------------------------------------------*/
+
+/*
+ * Functions for manipulating the array blocks.
+ */
+
+/*
+ * Returns a pointer to a value within an array block.
+ *
+ * index - The index into _this_ specific block.
+ */
+static void *element_at(struct dm_array_info *info, struct array_block *ab,
+ unsigned index)
+{
+ unsigned char *entry = (unsigned char *) (ab + 1);
+
+ entry += index * info->value_type.size;
+
+ return entry;
+}
+
+/*
+ * Utility function that calls one of the value_type methods on every value
+ * in an array block.
+ */
+static void on_entries(struct dm_array_info *info, struct array_block *ab,
+ void (*fn)(void *, const void *))
+{
+ unsigned i, nr_entries = le32_to_cpu(ab->nr_entries);
+
+ for (i = 0; i < nr_entries; i++)
+ fn(info->value_type.context, element_at(info, ab, i));
+}
+
+/*
+ * Increment every value in an array block.
+ */
+static void inc_ablock_entries(struct dm_array_info *info, struct array_block *ab)
+{
+ struct dm_btree_value_type *vt = &info->value_type;
+
+ if (vt->inc)
+ on_entries(info, ab, vt->inc);
+}
+
+/*
+ * Decrement every value in an array block.
+ */
+static void dec_ablock_entries(struct dm_array_info *info, struct array_block *ab)
+{
+ struct dm_btree_value_type *vt = &info->value_type;
+
+ if (vt->dec)
+ on_entries(info, ab, vt->dec);
+}
+
+/*
+ * Each array block can hold this many values.
+ */
+static uint32_t calc_max_entries(size_t value_size, size_t size_of_block)
+{
+ return (size_of_block - sizeof(struct array_block)) / value_size;
+}
+
+/*
+ * Allocate a new array block. The caller will need to unlock block.
+ */
+static int alloc_ablock(struct dm_array_info *info, size_t size_of_block,
+ uint32_t max_entries,
+ struct dm_block **block, struct array_block **ab)
+{
+ int r;
+
+ r = dm_tm_new_block(info->btree_info.tm, &array_validator, block);
+ if (r)
+ return r;
+
+ (*ab) = dm_block_data(*block);
+ (*ab)->max_entries = cpu_to_le32(max_entries);
+ (*ab)->nr_entries = cpu_to_le32(0);
+ (*ab)->value_size = cpu_to_le32(info->value_type.size);
+
+ return 0;
+}
+
+/*
+ * Pad an array block out with a particular value. Every instance will
+ * cause an increment of the value_type. new_nr must always be more than
+ * the current number of entries.
+ */
+static void fill_ablock(struct dm_array_info *info, struct array_block *ab,
+ const void *value, unsigned new_nr)
+{
+ unsigned i;
+ uint32_t nr_entries;
+ struct dm_btree_value_type *vt = &info->value_type;
+
+ BUG_ON(new_nr > le32_to_cpu(ab->max_entries));
+ BUG_ON(new_nr < le32_to_cpu(ab->nr_entries));
+
+ nr_entries = le32_to_cpu(ab->nr_entries);
+ for (i = nr_entries; i < new_nr; i++) {
+ if (vt->inc)
+ vt->inc(vt->context, value);
+ memcpy(element_at(info, ab, i), value, vt->size);
+ }
+ ab->nr_entries = cpu_to_le32(new_nr);
+}
+
+/*
+ * Remove some entries from the back of an array block. Every value
+ * removed will be decremented. new_nr must be <= the current number of
+ * entries.
+ */
+static void trim_ablock(struct dm_array_info *info, struct array_block *ab,
+ unsigned new_nr)
+{
+ unsigned i;
+ uint32_t nr_entries;
+ struct dm_btree_value_type *vt = &info->value_type;
+
+ BUG_ON(new_nr > le32_to_cpu(ab->max_entries));
+ BUG_ON(new_nr > le32_to_cpu(ab->nr_entries));
+
+ nr_entries = le32_to_cpu(ab->nr_entries);
+ for (i = nr_entries; i > new_nr; i--)
+ if (vt->dec)
+ vt->dec(vt->context, element_at(info, ab, i - 1));
+ ab->nr_entries = cpu_to_le32(new_nr);
+}
+
+/*
+ * Read locks a block, and coerces it to an array block. The caller must
+ * unlock 'block' when finished.
+ */
+static int get_ablock(struct dm_array_info *info, dm_block_t b,
+ struct dm_block **block, struct array_block **ab)
+{
+ int r;
+
+ r = dm_tm_read_lock(info->btree_info.tm, b, &array_validator, block);
+ if (r)
+ return r;
+
+ *ab = dm_block_data(*block);
+ return 0;
+}
+
+/*
+ * Unlocks an array block.
+ */
+static int unlock_ablock(struct dm_array_info *info, struct dm_block *block)
+{
+ return dm_tm_unlock(info->btree_info.tm, block);
+}
+
+/*----------------------------------------------------------------*/
+
+/*
+ * Btree manipulation.
+ */
+
+/*
+ * Looks up an array block in the btree, and then read locks it.
+ *
+ * index is the index of the index of the array_block, (ie. the array index
+ * / max_entries).
+ */
+static int lookup_ablock(struct dm_array_info *info, dm_block_t root,
+ unsigned index, struct dm_block **block,
+ struct array_block **ab)
+{
+ int r;
+ uint64_t key = index;
+ __le64 block_le;
+
+ r = dm_btree_lookup(&info->btree_info, root, &key, &block_le);
+ if (r)
+ return r;
+
+ return get_ablock(info, le64_to_cpu(block_le), block, ab);
+}
+
+/*
+ * Insert an array block into the btree. The block is _not_ unlocked.
+ */
+static int insert_ablock(struct dm_array_info *info, uint64_t index,
+ struct dm_block *block, dm_block_t *root)
+{
+ __le64 block_le = cpu_to_le64(dm_block_location(block));
+
+ __dm_bless_for_disk(block_le);
+ return dm_btree_insert(&info->btree_info, *root, &index, &block_le, root);
+}
+
+/*
+ * Looks up an array block in the btree. Then shadows it, and updates the
+ * btree to point to this new shadow. 'root' is an input/output parameter
+ * for both the current root block, and the new one.
+ */
+static int shadow_ablock(struct dm_array_info *info, dm_block_t *root,
+ unsigned index, struct dm_block **block,
+ struct array_block **ab)
+{
+ int r, inc;
+ uint64_t key = index;
+ dm_block_t b;
+ __le64 block_le;
+
+ /*
+ * lookup
+ */
+ r = dm_btree_lookup(&info->btree_info, *root, &key, &block_le);
+ if (r)
+ return r;
+ b = le64_to_cpu(block_le);
+
+ /*
+ * shadow
+ */
+ r = dm_tm_shadow_block(info->btree_info.tm, b,
+ &array_validator, block, &inc);
+ if (r)
+ return r;
+
+ *ab = dm_block_data(*block);
+ if (inc)
+ inc_ablock_entries(info, *ab);
+
+ /*
+ * Reinsert.
+ *
+ * The shadow op will often be a noop. Only insert if it really
+ * copied data.
+ */
+ if (dm_block_location(*block) != b) {
+ /*
+ * dm_tm_shadow_block will have already decremented the old
+ * block, but it is still referenced by the btree. We
+ * increment to stop the insert decrementing it below zero
+ * when overwriting the old value.
+ */
+ dm_tm_inc(info->btree_info.tm, b);
+ r = insert_ablock(info, index, *block, root);
+ }
+
+ return r;
+}
+
+/*
+ * Allocate an new array block, and fill it with some values.
+ */
+static int insert_new_ablock(struct dm_array_info *info, size_t size_of_block,
+ uint32_t max_entries,
+ unsigned block_index, uint32_t nr,
+ const void *value, dm_block_t *root)
+{
+ int r;
+ struct dm_block *block;
+ struct array_block *ab;
+
+ r = alloc_ablock(info, size_of_block, max_entries, &block, &ab);
+ if (r)
+ return r;
+
+ fill_ablock(info, ab, value, nr);
+ r = insert_ablock(info, block_index, block, root);
+ unlock_ablock(info, block);
+
+ return r;
+}
+
+static int insert_full_ablocks(struct dm_array_info *info, size_t size_of_block,
+ unsigned begin_block, unsigned end_block,
+ unsigned max_entries, const void *value,
+ dm_block_t *root)
+{
+ int r = 0;
+
+ for (; !r && begin_block != end_block; begin_block++)
+ r = insert_new_ablock(info, size_of_block, max_entries, begin_block, max_entries, value, root);
+
+ return r;
+}
+
+/*
+ * There are a bunch of functions involved with resizing an array. This
+ * structure holds information that commonly needed by them. Purely here
+ * to reduce parameter count.
+ */
+struct resize {
+ /*
+ * Describes the array.
+ */
+ struct dm_array_info *info;
+
+ /*
+ * The current root of the array. This gets updated.
+ */
+ dm_block_t root;
+
+ /*
+ * Metadata block size. Used to calculate the nr entries in an
+ * array block.
+ */
+ size_t size_of_block;
+
+ /*
+ * Maximum nr entries in an array block.
+ */
+ unsigned max_entries;
+
+ /*
+ * nr of completely full blocks in the array.
+ *
+ * 'old' refers to before the resize, 'new' after.
+ */
+ unsigned old_nr_full_blocks, new_nr_full_blocks;
+
+ /*
+ * Number of entries in the final block. 0 iff only full blocks in
+ * the array.
+ */
+ unsigned old_nr_entries_in_last_block, new_nr_entries_in_last_block;
+
+ /*
+ * The default value used when growing the array.
+ */
+ const void *value;
+};
+
+/*
+ * Removes a consecutive set of array blocks from the btree. The values
+ * in block are decremented as a side effect of the btree remove.
+ *
+ * begin_index - the index of the first array block to remove.
+ * end_index - the one-past-the-end value. ie. this block is not removed.
+ */
+static int drop_blocks(struct resize *resize, unsigned begin_index,
+ unsigned end_index)
+{
+ int r;
+
+ while (begin_index != end_index) {
+ uint64_t key = begin_index++;
+ r = dm_btree_remove(&resize->info->btree_info, resize->root,
+ &key, &resize->root);
+ if (r)
+ return r;
+ }
+
+ return 0;
+}
+
+/*
+ * Calculates how many blocks are needed for the array.
+ */
+static unsigned total_nr_blocks_needed(unsigned nr_full_blocks,
+ unsigned nr_entries_in_last_block)
+{
+ return nr_full_blocks + (nr_entries_in_last_block ? 1 : 0);
+}
+
+/*
+ * Shrink an array.
+ */
+static int shrink(struct resize *resize)
+{
+ int r;
+ unsigned begin, end;
+ struct dm_block *block;
+ struct array_block *ab;
+
+ /*
+ * Lose some blocks from the back?
+ */
+ if (resize->new_nr_full_blocks < resize->old_nr_full_blocks) {
+ begin = total_nr_blocks_needed(resize->new_nr_full_blocks,
+ resize->new_nr_entries_in_last_block);
+ end = total_nr_blocks_needed(resize->old_nr_full_blocks,
+ resize->old_nr_entries_in_last_block);
+
+ r = drop_blocks(resize, begin, end);
+ if (r)
+ return r;
+ }
+
+ /*
+ * Trim the new tail block
+ */
+ if (resize->new_nr_entries_in_last_block) {
+ r = shadow_ablock(resize->info, &resize->root,
+ resize->new_nr_full_blocks, &block, &ab);
+ if (r)
+ return r;
+
+ trim_ablock(resize->info, ab, resize->new_nr_entries_in_last_block);
+ unlock_ablock(resize->info, block);
+ }
+
+ return 0;
+}
+
+/*
+ * Grow an array.
+ */
+static int grow_extend_tail_block(struct resize *resize, uint32_t new_nr_entries)
+{
+ int r;
+ struct dm_block *block;
+ struct array_block *ab;
+
+ r = shadow_ablock(resize->info, &resize->root,
+ resize->old_nr_full_blocks, &block, &ab);
+ if (r)
+ return r;
+
+ fill_ablock(resize->info, ab, resize->value, new_nr_entries);
+ unlock_ablock(resize->info, block);
+
+ return r;
+}
+
+static int grow_add_tail_block(struct resize *resize)
+{
+ return insert_new_ablock(resize->info, resize->size_of_block,
+ resize->max_entries,
+ resize->new_nr_full_blocks,
+ resize->new_nr_entries_in_last_block,
+ resize->value, &resize->root);
+}
+
+static int grow_needs_more_blocks(struct resize *resize)
+{
+ int r;
+ unsigned old_nr_blocks = resize->old_nr_full_blocks;
+
+ if (resize->old_nr_entries_in_last_block > 0) {
+ old_nr_blocks++;
+
+ r = grow_extend_tail_block(resize, resize->max_entries);
+ if (r)
+ return r;
+ }
+
+ r = insert_full_ablocks(resize->info, resize->size_of_block,
+ old_nr_blocks,
+ resize->new_nr_full_blocks,
+ resize->max_entries, resize->value,
+ &resize->root);
+ if (r)
+ return r;
+
+ if (resize->new_nr_entries_in_last_block)
+ r = grow_add_tail_block(resize);
+
+ return r;
+}
+
+static int grow(struct resize *resize)
+{
+ if (resize->new_nr_full_blocks > resize->old_nr_full_blocks)
+ return grow_needs_more_blocks(resize);
+
+ else if (resize->old_nr_entries_in_last_block)
+ return grow_extend_tail_block(resize, resize->new_nr_entries_in_last_block);
+
+ else
+ return grow_add_tail_block(resize);
+}
+
+/*----------------------------------------------------------------*/
+
+/*
+ * These are the value_type functions for the btree elements, which point
+ * to array blocks.
+ */
+static void block_inc(void *context, const void *value)
+{
+ __le64 block_le;
+ struct dm_array_info *info = context;
+
+ memcpy(&block_le, value, sizeof(block_le));
+ dm_tm_inc(info->btree_info.tm, le64_to_cpu(block_le));
+}
+
+static void block_dec(void *context, const void *value)
+{
+ int r;
+ uint64_t b;
+ __le64 block_le;
+ uint32_t ref_count;
+ struct dm_block *block;
+ struct array_block *ab;
+ struct dm_array_info *info = context;
+
+ memcpy(&block_le, value, sizeof(block_le));
+ b = le64_to_cpu(block_le);
+
+ r = dm_tm_ref(info->btree_info.tm, b, &ref_count);
+ if (r) {
+ DMERR_LIMIT("couldn't get reference count for block %llu",
+ (unsigned long long) b);
+ return;
+ }
+
+ if (ref_count == 1) {
+ /*
+ * We're about to drop the last reference to this ablock.
+ * So we need to decrement the ref count of the contents.
+ */
+ r = get_ablock(info, b, &block, &ab);
+ if (r) {
+ DMERR_LIMIT("couldn't get array block %llu",
+ (unsigned long long) b);
+ return;
+ }
+
+ dec_ablock_entries(info, ab);
+ unlock_ablock(info, block);
+ }
+
+ dm_tm_dec(info->btree_info.tm, b);
+}
+
+static int block_equal(void *context, const void *value1, const void *value2)
+{
+ return !memcmp(value1, value2, sizeof(__le64));
+}
+
+/*----------------------------------------------------------------*/
+
+void dm_array_info_init(struct dm_array_info *info,
+ struct dm_transaction_manager *tm,
+ struct dm_btree_value_type *vt)
+{
+ struct dm_btree_value_type *bvt = &info->btree_info.value_type;
+
+ memcpy(&info->value_type, vt, sizeof(info->value_type));
+ info->btree_info.tm = tm;
+ info->btree_info.levels = 1;
+
+ bvt->context = info;
+ bvt->size = sizeof(__le64);
+ bvt->inc = block_inc;
+ bvt->dec = block_dec;
+ bvt->equal = block_equal;
+}
+EXPORT_SYMBOL_GPL(dm_array_info_init);
+
+int dm_array_empty(struct dm_array_info *info, dm_block_t *root)
+{
+ return dm_btree_empty(&info->btree_info, root);
+}
+EXPORT_SYMBOL_GPL(dm_array_empty);
+
+static int array_resize(struct dm_array_info *info, dm_block_t root,
+ uint32_t old_size, uint32_t new_size,
+ const void *value, dm_block_t *new_root)
+{
+ int r;
+ struct resize resize;
+
+ if (old_size == new_size) {
+ *new_root = root;
+ return 0;
+ }
+
+ resize.info = info;
+ resize.root = root;
+ resize.size_of_block = dm_bm_block_size(dm_tm_get_bm(info->btree_info.tm));
+ resize.max_entries = calc_max_entries(info->value_type.size,
+ resize.size_of_block);
+
+ resize.old_nr_full_blocks = old_size / resize.max_entries;
+ resize.old_nr_entries_in_last_block = old_size % resize.max_entries;
+ resize.new_nr_full_blocks = new_size / resize.max_entries;
+ resize.new_nr_entries_in_last_block = new_size % resize.max_entries;
+ resize.value = value;
+
+ r = ((new_size > old_size) ? grow : shrink)(&resize);
+ if (r)
+ return r;
+
+ *new_root = resize.root;
+ return 0;
+}
+
+int dm_array_resize(struct dm_array_info *info, dm_block_t root,
+ uint32_t old_size, uint32_t new_size,
+ const void *value, dm_block_t *new_root)
+ __dm_written_to_disk(value)
+{
+ int r = array_resize(info, root, old_size, new_size, value, new_root);
+ __dm_unbless_for_disk(value);
+ return r;
+}
+EXPORT_SYMBOL_GPL(dm_array_resize);
+
+int dm_array_del(struct dm_array_info *info, dm_block_t root)
+{
+ return dm_btree_del(&info->btree_info, root);
+}
+EXPORT_SYMBOL_GPL(dm_array_del);
+
+int dm_array_get_value(struct dm_array_info *info, dm_block_t root,
+ uint32_t index, void *value_le)
+{
+ int r;
+ struct dm_block *block;
+ struct array_block *ab;
+ size_t size_of_block;
+ unsigned entry, max_entries;
+
+ size_of_block = dm_bm_block_size(dm_tm_get_bm(info->btree_info.tm));
+ max_entries = calc_max_entries(info->value_type.size, size_of_block);
+
+ r = lookup_ablock(info, root, index / max_entries, &block, &ab);
+ if (r)
+ return r;
+
+ entry = index % max_entries;
+ if (entry >= le32_to_cpu(ab->nr_entries))
+ r = -ENODATA;
+ else
+ memcpy(value_le, element_at(info, ab, entry),
+ info->value_type.size);
+
+ unlock_ablock(info, block);
+ return r;
+}
+EXPORT_SYMBOL_GPL(dm_array_get_value);
+
+static int array_set_value(struct dm_array_info *info, dm_block_t root,
+ uint32_t index, const void *value, dm_block_t *new_root)
+{
+ int r;
+ struct dm_block *block;
+ struct array_block *ab;
+ size_t size_of_block;
+ unsigned max_entries;
+ unsigned entry;
+ void *old_value;
+ struct dm_btree_value_type *vt = &info->value_type;
+
+ size_of_block = dm_bm_block_size(dm_tm_get_bm(info->btree_info.tm));
+ max_entries = calc_max_entries(info->value_type.size, size_of_block);
+
+ r = shadow_ablock(info, &root, index / max_entries, &block, &ab);
+ if (r)
+ return r;
+ *new_root = root;
+
+ entry = index % max_entries;
+ if (entry >= le32_to_cpu(ab->nr_entries)) {
+ r = -ENODATA;
+ goto out;
+ }
+
+ old_value = element_at(info, ab, entry);
+ if (vt->dec &&
+ (!vt->equal || !vt->equal(vt->context, old_value, value))) {
+ vt->dec(vt->context, old_value);
+ if (vt->inc)
+ vt->inc(vt->context, value);
+ }
+
+ memcpy(old_value, value, info->value_type.size);
+
+out:
+ unlock_ablock(info, block);
+ return r;
+}
+
+int dm_array_set_value(struct dm_array_info *info, dm_block_t root,
+ uint32_t index, const void *value, dm_block_t *new_root)
+ __dm_written_to_disk(value)
+{
+ int r;
+
+ r = array_set_value(info, root, index, value, new_root);
+ __dm_unbless_for_disk(value);
+ return r;
+}
+EXPORT_SYMBOL_GPL(dm_array_set_value);
+
+struct walk_info {
+ struct dm_array_info *info;
+ int (*fn)(void *context, uint64_t key, void *leaf);
+ void *context;
+};
+
+static int walk_ablock(void *context, uint64_t *keys, void *leaf)
+{
+ struct walk_info *wi = context;
+
+ int r;
+ unsigned i;
+ __le64 block_le;
+ unsigned nr_entries, max_entries;
+ struct dm_block *block;
+ struct array_block *ab;
+
+ memcpy(&block_le, leaf, sizeof(block_le));
+ r = get_ablock(wi->info, le64_to_cpu(block_le), &block, &ab);
+ if (r)
+ return r;
+
+ max_entries = le32_to_cpu(ab->max_entries);
+ nr_entries = le32_to_cpu(ab->nr_entries);
+ for (i = 0; i < nr_entries; i++) {
+ r = wi->fn(wi->context, keys[0] * max_entries + i,
+ element_at(wi->info, ab, i));
+
+ if (r)
+ break;
+ }
+
+ unlock_ablock(wi->info, block);
+ return r;
+}
+
+int dm_array_walk(struct dm_array_info *info, dm_block_t root,
+ int (*fn)(void *, uint64_t key, void *leaf),
+ void *context)
+{
+ struct walk_info wi;
+
+ wi.info = info;
+ wi.fn = fn;
+ wi.context = context;
+
+ return dm_btree_walk(&info->btree_info, root, walk_ablock, &wi);
+}
+EXPORT_SYMBOL_GPL(dm_array_walk);
+
+/*----------------------------------------------------------------*/
diff --git a/drivers/md/persistent-data/dm-array.h b/drivers/md/persistent-data/dm-array.h
new file mode 100644
index 000000000..ea177d6fa
--- /dev/null
+++ b/drivers/md/persistent-data/dm-array.h
@@ -0,0 +1,166 @@
+/*
+ * Copyright (C) 2012 Red Hat, Inc.
+ *
+ * This file is released under the GPL.
+ */
+#ifndef _LINUX_DM_ARRAY_H
+#define _LINUX_DM_ARRAY_H
+
+#include "dm-btree.h"
+
+/*----------------------------------------------------------------*/
+
+/*
+ * The dm-array is a persistent version of an array. It packs the data
+ * more efficiently than a btree which will result in less disk space use,
+ * and a performance boost. The element get and set operations are still
+ * O(ln(n)), but with a much smaller constant.
+ *
+ * The value type structure is reused from the btree type to support proper
+ * reference counting of values.
+ *
+ * The arrays implicitly know their length, and bounds are checked for
+ * lookups and updated. It doesn't store this in an accessible place
+ * because it would waste a whole metadata block. Make sure you store the
+ * size along with the array root in your encompassing data.
+ *
+ * Array entries are indexed via an unsigned integer starting from zero.
+ * Arrays are not sparse; if you resize an array to have 'n' entries then
+ * 'n - 1' will be the last valid index.
+ *
+ * Typical use:
+ *
+ * a) initialise a dm_array_info structure. This describes the array
+ * values and ties it into a specific transaction manager. It holds no
+ * instance data; the same info can be used for many similar arrays if
+ * you wish.
+ *
+ * b) Get yourself a root. The root is the index of a block of data on the
+ * disk that holds a particular instance of an array. You may have a
+ * pre existing root in your metadata that you wish to use, or you may
+ * want to create a brand new, empty array with dm_array_empty().
+ *
+ * Like the other data structures in this library, dm_array objects are
+ * immutable between transactions. Update functions will return you the
+ * root for a _new_ array. If you've incremented the old root, via
+ * dm_tm_inc(), before calling the update function you may continue to use
+ * it in parallel with the new root.
+ *
+ * c) resize an array with dm_array_resize().
+ *
+ * d) Get a value from the array with dm_array_get_value().
+ *
+ * e) Set a value in the array with dm_array_set_value().
+ *
+ * f) Walk an array of values in index order with dm_array_walk(). More
+ * efficient than making many calls to dm_array_get_value().
+ *
+ * g) Destroy the array with dm_array_del(). This tells the transaction
+ * manager that you're no longer using this data structure so it can
+ * recycle it's blocks. (dm_array_dec() would be a better name for it,
+ * but del is in keeping with dm_btree_del()).
+ */
+
+/*
+ * Describes an array. Don't initialise this structure yourself, use the
+ * init function below.
+ */
+struct dm_array_info {
+ struct dm_transaction_manager *tm;
+ struct dm_btree_value_type value_type;
+ struct dm_btree_info btree_info;
+};
+
+/*
+ * Sets up a dm_array_info structure. You don't need to do anything with
+ * this structure when you finish using it.
+ *
+ * info - the structure being filled in.
+ * tm - the transaction manager that should supervise this structure.
+ * vt - describes the leaf values.
+ */
+void dm_array_info_init(struct dm_array_info *info,
+ struct dm_transaction_manager *tm,
+ struct dm_btree_value_type *vt);
+
+/*
+ * Create an empty, zero length array.
+ *
+ * info - describes the array
+ * root - on success this will be filled out with the root block
+ */
+int dm_array_empty(struct dm_array_info *info, dm_block_t *root);
+
+/*
+ * Resizes the array.
+ *
+ * info - describes the array
+ * root - the root block of the array on disk
+ * old_size - the caller is responsible for remembering the size of
+ * the array
+ * new_size - can be bigger or smaller than old_size
+ * value - if we're growing the array the new entries will have this value
+ * new_root - on success, points to the new root block
+ *
+ * If growing the inc function for 'value' will be called the appropriate
+ * number of times. So if the caller is holding a reference they may want
+ * to drop it.
+ */
+int dm_array_resize(struct dm_array_info *info, dm_block_t root,
+ uint32_t old_size, uint32_t new_size,
+ const void *value, dm_block_t *new_root)
+ __dm_written_to_disk(value);
+
+/*
+ * Frees a whole array. The value_type's decrement operation will be called
+ * for all values in the array
+ */
+int dm_array_del(struct dm_array_info *info, dm_block_t root);
+
+/*
+ * Lookup a value in the array
+ *
+ * info - describes the array
+ * root - root block of the array
+ * index - array index
+ * value - the value to be read. Will be in on-disk format of course.
+ *
+ * -ENODATA will be returned if the index is out of bounds.
+ */
+int dm_array_get_value(struct dm_array_info *info, dm_block_t root,
+ uint32_t index, void *value);
+
+/*
+ * Set an entry in the array.
+ *
+ * info - describes the array
+ * root - root block of the array
+ * index - array index
+ * value - value to be written to disk. Make sure you confirm the value is
+ * in on-disk format with__dm_bless_for_disk() before calling.
+ * new_root - the new root block
+ *
+ * The old value being overwritten will be decremented, the new value
+ * incremented.
+ *
+ * -ENODATA will be returned if the index is out of bounds.
+ */
+int dm_array_set_value(struct dm_array_info *info, dm_block_t root,
+ uint32_t index, const void *value, dm_block_t *new_root)
+ __dm_written_to_disk(value);
+
+/*
+ * Walk through all the entries in an array.
+ *
+ * info - describes the array
+ * root - root block of the array
+ * fn - called back for every element
+ * context - passed to the callback
+ */
+int dm_array_walk(struct dm_array_info *info, dm_block_t root,
+ int (*fn)(void *context, uint64_t key, void *leaf),
+ void *context);
+
+/*----------------------------------------------------------------*/
+
+#endif /* _LINUX_DM_ARRAY_H */
diff --git a/drivers/md/persistent-data/dm-bitset.c b/drivers/md/persistent-data/dm-bitset.c
new file mode 100644
index 000000000..36f7cc2c7
--- /dev/null
+++ b/drivers/md/persistent-data/dm-bitset.c
@@ -0,0 +1,171 @@
+/*
+ * Copyright (C) 2012 Red Hat, Inc.
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm-bitset.h"
+#include "dm-transaction-manager.h"
+
+#include <linux/export.h>
+#include <linux/device-mapper.h>
+
+#define DM_MSG_PREFIX "bitset"
+#define BITS_PER_ARRAY_ENTRY 64
+
+/*----------------------------------------------------------------*/
+
+static struct dm_btree_value_type bitset_bvt = {
+ .context = NULL,
+ .size = sizeof(__le64),
+ .inc = NULL,
+ .dec = NULL,
+ .equal = NULL,
+};
+
+/*----------------------------------------------------------------*/
+
+void dm_disk_bitset_init(struct dm_transaction_manager *tm,
+ struct dm_disk_bitset *info)
+{
+ dm_array_info_init(&info->array_info, tm, &bitset_bvt);
+ info->current_index_set = false;
+}
+EXPORT_SYMBOL_GPL(dm_disk_bitset_init);
+
+int dm_bitset_empty(struct dm_disk_bitset *info, dm_block_t *root)
+{
+ return dm_array_empty(&info->array_info, root);
+}
+EXPORT_SYMBOL_GPL(dm_bitset_empty);
+
+int dm_bitset_resize(struct dm_disk_bitset *info, dm_block_t root,
+ uint32_t old_nr_entries, uint32_t new_nr_entries,
+ bool default_value, dm_block_t *new_root)
+{
+ uint32_t old_blocks = dm_div_up(old_nr_entries, BITS_PER_ARRAY_ENTRY);
+ uint32_t new_blocks = dm_div_up(new_nr_entries, BITS_PER_ARRAY_ENTRY);
+ __le64 value = default_value ? cpu_to_le64(~0) : cpu_to_le64(0);
+
+ __dm_bless_for_disk(&value);
+ return dm_array_resize(&info->array_info, root, old_blocks, new_blocks,
+ &value, new_root);
+}
+EXPORT_SYMBOL_GPL(dm_bitset_resize);
+
+int dm_bitset_del(struct dm_disk_bitset *info, dm_block_t root)
+{
+ return dm_array_del(&info->array_info, root);
+}
+EXPORT_SYMBOL_GPL(dm_bitset_del);
+
+int dm_bitset_flush(struct dm_disk_bitset *info, dm_block_t root,
+ dm_block_t *new_root)
+{
+ int r;
+ __le64 value;
+
+ if (!info->current_index_set || !info->dirty)
+ return 0;
+
+ value = cpu_to_le64(info->current_bits);
+
+ __dm_bless_for_disk(&value);
+ r = dm_array_set_value(&info->array_info, root, info->current_index,
+ &value, new_root);
+ if (r)
+ return r;
+
+ info->current_index_set = false;
+ info->dirty = false;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(dm_bitset_flush);
+
+static int read_bits(struct dm_disk_bitset *info, dm_block_t root,
+ uint32_t array_index)
+{
+ int r;
+ __le64 value;
+
+ r = dm_array_get_value(&info->array_info, root, array_index, &value);
+ if (r)
+ return r;
+
+ info->current_bits = le64_to_cpu(value);
+ info->current_index_set = true;
+ info->current_index = array_index;
+ info->dirty = false;
+
+ return 0;
+}
+
+static int get_array_entry(struct dm_disk_bitset *info, dm_block_t root,
+ uint32_t index, dm_block_t *new_root)
+{
+ int r;
+ unsigned array_index = index / BITS_PER_ARRAY_ENTRY;
+
+ if (info->current_index_set) {
+ if (info->current_index == array_index)
+ return 0;
+
+ r = dm_bitset_flush(info, root, new_root);
+ if (r)
+ return r;
+ }
+
+ return read_bits(info, root, array_index);
+}
+
+int dm_bitset_set_bit(struct dm_disk_bitset *info, dm_block_t root,
+ uint32_t index, dm_block_t *new_root)
+{
+ int r;
+ unsigned b = index % BITS_PER_ARRAY_ENTRY;
+
+ r = get_array_entry(info, root, index, new_root);
+ if (r)
+ return r;
+
+ set_bit(b, (unsigned long *) &info->current_bits);
+ info->dirty = true;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(dm_bitset_set_bit);
+
+int dm_bitset_clear_bit(struct dm_disk_bitset *info, dm_block_t root,
+ uint32_t index, dm_block_t *new_root)
+{
+ int r;
+ unsigned b = index % BITS_PER_ARRAY_ENTRY;
+
+ r = get_array_entry(info, root, index, new_root);
+ if (r)
+ return r;
+
+ clear_bit(b, (unsigned long *) &info->current_bits);
+ info->dirty = true;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(dm_bitset_clear_bit);
+
+int dm_bitset_test_bit(struct dm_disk_bitset *info, dm_block_t root,
+ uint32_t index, dm_block_t *new_root, bool *result)
+{
+ int r;
+ unsigned b = index % BITS_PER_ARRAY_ENTRY;
+
+ r = get_array_entry(info, root, index, new_root);
+ if (r)
+ return r;
+
+ *result = test_bit(b, (unsigned long *) &info->current_bits);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(dm_bitset_test_bit);
+
+/*----------------------------------------------------------------*/
diff --git a/drivers/md/persistent-data/dm-bitset.h b/drivers/md/persistent-data/dm-bitset.h
new file mode 100644
index 000000000..c2287d672
--- /dev/null
+++ b/drivers/md/persistent-data/dm-bitset.h
@@ -0,0 +1,166 @@
+/*
+ * Copyright (C) 2012 Red Hat, Inc.
+ *
+ * This file is released under the GPL.
+ */
+#ifndef _LINUX_DM_BITSET_H
+#define _LINUX_DM_BITSET_H
+
+#include "dm-array.h"
+
+/*----------------------------------------------------------------*/
+
+/*
+ * This bitset type is a thin wrapper round a dm_array of 64bit words. It
+ * uses a tiny, one word cache to reduce the number of array lookups and so
+ * increase performance.
+ *
+ * Like the dm-array that it's based on, the caller needs to keep track of
+ * the size of the bitset separately. The underlying dm-array implicitly
+ * knows how many words it's storing and will return -ENODATA if you try
+ * and access an out of bounds word. However, an out of bounds bit in the
+ * final word will _not_ be detected, you have been warned.
+ *
+ * Bits are indexed from zero.
+
+ * Typical use:
+ *
+ * a) Initialise a dm_disk_bitset structure with dm_disk_bitset_init().
+ * This describes the bitset and includes the cache. It's not called it
+ * dm_bitset_info in line with other data structures because it does
+ * include instance data.
+ *
+ * b) Get yourself a root. The root is the index of a block of data on the
+ * disk that holds a particular instance of an bitset. You may have a
+ * pre existing root in your metadata that you wish to use, or you may
+ * want to create a brand new, empty bitset with dm_bitset_empty().
+ *
+ * Like the other data structures in this library, dm_bitset objects are
+ * immutable between transactions. Update functions will return you the
+ * root for a _new_ array. If you've incremented the old root, via
+ * dm_tm_inc(), before calling the update function you may continue to use
+ * it in parallel with the new root.
+ *
+ * Even read operations may trigger the cache to be flushed and as such
+ * return a root for a new, updated bitset.
+ *
+ * c) resize a bitset with dm_bitset_resize().
+ *
+ * d) Set a bit with dm_bitset_set_bit().
+ *
+ * e) Clear a bit with dm_bitset_clear_bit().
+ *
+ * f) Test a bit with dm_bitset_test_bit().
+ *
+ * g) Flush all updates from the cache with dm_bitset_flush().
+ *
+ * h) Destroy the bitset with dm_bitset_del(). This tells the transaction
+ * manager that you're no longer using this data structure so it can
+ * recycle it's blocks. (dm_bitset_dec() would be a better name for it,
+ * but del is in keeping with dm_btree_del()).
+ */
+
+/*
+ * Opaque object. Unlike dm_array_info, you should have one of these per
+ * bitset. Initialise with dm_disk_bitset_init().
+ */
+struct dm_disk_bitset {
+ struct dm_array_info array_info;
+
+ uint32_t current_index;
+ uint64_t current_bits;
+
+ bool current_index_set:1;
+ bool dirty:1;
+};
+
+/*
+ * Sets up a dm_disk_bitset structure. You don't need to do anything with
+ * this structure when you finish using it.
+ *
+ * tm - the transaction manager that should supervise this structure
+ * info - the structure being initialised
+ */
+void dm_disk_bitset_init(struct dm_transaction_manager *tm,
+ struct dm_disk_bitset *info);
+
+/*
+ * Create an empty, zero length bitset.
+ *
+ * info - describes the bitset
+ * new_root - on success, points to the new root block
+ */
+int dm_bitset_empty(struct dm_disk_bitset *info, dm_block_t *new_root);
+
+/*
+ * Resize the bitset.
+ *
+ * info - describes the bitset
+ * old_root - the root block of the array on disk
+ * old_nr_entries - the number of bits in the old bitset
+ * new_nr_entries - the number of bits you want in the new bitset
+ * default_value - the value for any new bits
+ * new_root - on success, points to the new root block
+ */
+int dm_bitset_resize(struct dm_disk_bitset *info, dm_block_t old_root,
+ uint32_t old_nr_entries, uint32_t new_nr_entries,
+ bool default_value, dm_block_t *new_root);
+
+/*
+ * Frees the bitset.
+ */
+int dm_bitset_del(struct dm_disk_bitset *info, dm_block_t root);
+
+/*
+ * Set a bit.
+ *
+ * info - describes the bitset
+ * root - the root block of the bitset
+ * index - the bit index
+ * new_root - on success, points to the new root block
+ *
+ * -ENODATA will be returned if the index is out of bounds.
+ */
+int dm_bitset_set_bit(struct dm_disk_bitset *info, dm_block_t root,
+ uint32_t index, dm_block_t *new_root);
+
+/*
+ * Clears a bit.
+ *
+ * info - describes the bitset
+ * root - the root block of the bitset
+ * index - the bit index
+ * new_root - on success, points to the new root block
+ *
+ * -ENODATA will be returned if the index is out of bounds.
+ */
+int dm_bitset_clear_bit(struct dm_disk_bitset *info, dm_block_t root,
+ uint32_t index, dm_block_t *new_root);
+
+/*
+ * Tests a bit.
+ *
+ * info - describes the bitset
+ * root - the root block of the bitset
+ * index - the bit index
+ * new_root - on success, points to the new root block (cached values may have been written)
+ * result - the bit value you're after
+ *
+ * -ENODATA will be returned if the index is out of bounds.
+ */
+int dm_bitset_test_bit(struct dm_disk_bitset *info, dm_block_t root,
+ uint32_t index, dm_block_t *new_root, bool *result);
+
+/*
+ * Flush any cached changes to disk.
+ *
+ * info - describes the bitset
+ * root - the root block of the bitset
+ * new_root - on success, points to the new root block
+ */
+int dm_bitset_flush(struct dm_disk_bitset *info, dm_block_t root,
+ dm_block_t *new_root);
+
+/*----------------------------------------------------------------*/
+
+#endif /* _LINUX_DM_BITSET_H */
diff --git a/drivers/md/persistent-data/dm-block-manager.c b/drivers/md/persistent-data/dm-block-manager.c
new file mode 100644
index 000000000..087411c95
--- /dev/null
+++ b/drivers/md/persistent-data/dm-block-manager.c
@@ -0,0 +1,636 @@
+/*
+ * Copyright (C) 2011 Red Hat, Inc.
+ *
+ * This file is released under the GPL.
+ */
+#include "dm-block-manager.h"
+#include "dm-persistent-data-internal.h"
+#include "../dm-bufio.h"
+
+#include <linux/crc32c.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/rwsem.h>
+#include <linux/device-mapper.h>
+#include <linux/stacktrace.h>
+
+#define DM_MSG_PREFIX "block manager"
+
+/*----------------------------------------------------------------*/
+
+/*
+ * This is a read/write semaphore with a couple of differences.
+ *
+ * i) There is a restriction on the number of concurrent read locks that
+ * may be held at once. This is just an implementation detail.
+ *
+ * ii) Recursive locking attempts are detected and return EINVAL. A stack
+ * trace is also emitted for the previous lock acquisition.
+ *
+ * iii) Priority is given to write locks.
+ */
+#define MAX_HOLDERS 4
+#define MAX_STACK 10
+
+typedef unsigned long stack_entries[MAX_STACK];
+
+struct block_lock {
+ spinlock_t lock;
+ __s32 count;
+ struct list_head waiters;
+ struct task_struct *holders[MAX_HOLDERS];
+
+#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
+ struct stack_trace traces[MAX_HOLDERS];
+ stack_entries entries[MAX_HOLDERS];
+#endif
+};
+
+struct waiter {
+ struct list_head list;
+ struct task_struct *task;
+ int wants_write;
+};
+
+static unsigned __find_holder(struct block_lock *lock,
+ struct task_struct *task)
+{
+ unsigned i;
+
+ for (i = 0; i < MAX_HOLDERS; i++)
+ if (lock->holders[i] == task)
+ break;
+
+ BUG_ON(i == MAX_HOLDERS);
+ return i;
+}
+
+/* call this *after* you increment lock->count */
+static void __add_holder(struct block_lock *lock, struct task_struct *task)
+{
+ unsigned h = __find_holder(lock, NULL);
+#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
+ struct stack_trace *t;
+#endif
+
+ get_task_struct(task);
+ lock->holders[h] = task;
+
+#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
+ t = lock->traces + h;
+ t->nr_entries = 0;
+ t->max_entries = MAX_STACK;
+ t->entries = lock->entries[h];
+ t->skip = 2;
+ save_stack_trace(t);
+#endif
+}
+
+/* call this *before* you decrement lock->count */
+static void __del_holder(struct block_lock *lock, struct task_struct *task)
+{
+ unsigned h = __find_holder(lock, task);
+ lock->holders[h] = NULL;
+ put_task_struct(task);
+}
+
+static int __check_holder(struct block_lock *lock)
+{
+ unsigned i;
+#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
+ static struct stack_trace t;
+ static stack_entries entries;
+#endif
+
+ for (i = 0; i < MAX_HOLDERS; i++) {
+ if (lock->holders[i] == current) {
+ DMERR("recursive lock detected in metadata");
+#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
+ DMERR("previously held here:");
+ print_stack_trace(lock->traces + i, 4);
+
+ DMERR("subsequent acquisition attempted here:");
+ t.nr_entries = 0;
+ t.max_entries = MAX_STACK;
+ t.entries = entries;
+ t.skip = 3;
+ save_stack_trace(&t);
+ print_stack_trace(&t, 4);
+#endif
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+static void __wait(struct waiter *w)
+{
+ for (;;) {
+ set_task_state(current, TASK_UNINTERRUPTIBLE);
+
+ if (!w->task)
+ break;
+
+ schedule();
+ }
+
+ set_task_state(current, TASK_RUNNING);
+}
+
+static void __wake_waiter(struct waiter *w)
+{
+ struct task_struct *task;
+
+ list_del(&w->list);
+ task = w->task;
+ smp_mb();
+ w->task = NULL;
+ wake_up_process(task);
+}
+
+/*
+ * We either wake a few readers or a single writer.
+ */
+static void __wake_many(struct block_lock *lock)
+{
+ struct waiter *w, *tmp;
+
+ BUG_ON(lock->count < 0);
+ list_for_each_entry_safe(w, tmp, &lock->waiters, list) {
+ if (lock->count >= MAX_HOLDERS)
+ return;
+
+ if (w->wants_write) {
+ if (lock->count > 0)
+ return; /* still read locked */
+
+ lock->count = -1;
+ __add_holder(lock, w->task);
+ __wake_waiter(w);
+ return;
+ }
+
+ lock->count++;
+ __add_holder(lock, w->task);
+ __wake_waiter(w);
+ }
+}
+
+static void bl_init(struct block_lock *lock)
+{
+ int i;
+
+ spin_lock_init(&lock->lock);
+ lock->count = 0;
+ INIT_LIST_HEAD(&lock->waiters);
+ for (i = 0; i < MAX_HOLDERS; i++)
+ lock->holders[i] = NULL;
+}
+
+static int __available_for_read(struct block_lock *lock)
+{
+ return lock->count >= 0 &&
+ lock->count < MAX_HOLDERS &&
+ list_empty(&lock->waiters);
+}
+
+static int bl_down_read(struct block_lock *lock)
+{
+ int r;
+ struct waiter w;
+
+ spin_lock(&lock->lock);
+ r = __check_holder(lock);
+ if (r) {
+ spin_unlock(&lock->lock);
+ return r;
+ }
+
+ if (__available_for_read(lock)) {
+ lock->count++;
+ __add_holder(lock, current);
+ spin_unlock(&lock->lock);
+ return 0;
+ }
+
+ get_task_struct(current);
+
+ w.task = current;
+ w.wants_write = 0;
+ list_add_tail(&w.list, &lock->waiters);
+ spin_unlock(&lock->lock);
+
+ __wait(&w);
+ put_task_struct(current);
+ return 0;
+}
+
+static int bl_down_read_nonblock(struct block_lock *lock)
+{
+ int r;
+
+ spin_lock(&lock->lock);
+ r = __check_holder(lock);
+ if (r)
+ goto out;
+
+ if (__available_for_read(lock)) {
+ lock->count++;
+ __add_holder(lock, current);
+ r = 0;
+ } else
+ r = -EWOULDBLOCK;
+
+out:
+ spin_unlock(&lock->lock);
+ return r;
+}
+
+static void bl_up_read(struct block_lock *lock)
+{
+ spin_lock(&lock->lock);
+ BUG_ON(lock->count <= 0);
+ __del_holder(lock, current);
+ --lock->count;
+ if (!list_empty(&lock->waiters))
+ __wake_many(lock);
+ spin_unlock(&lock->lock);
+}
+
+static int bl_down_write(struct block_lock *lock)
+{
+ int r;
+ struct waiter w;
+
+ spin_lock(&lock->lock);
+ r = __check_holder(lock);
+ if (r) {
+ spin_unlock(&lock->lock);
+ return r;
+ }
+
+ if (lock->count == 0 && list_empty(&lock->waiters)) {
+ lock->count = -1;
+ __add_holder(lock, current);
+ spin_unlock(&lock->lock);
+ return 0;
+ }
+
+ get_task_struct(current);
+ w.task = current;
+ w.wants_write = 1;
+
+ /*
+ * Writers given priority. We know there's only one mutator in the
+ * system, so ignoring the ordering reversal.
+ */
+ list_add(&w.list, &lock->waiters);
+ spin_unlock(&lock->lock);
+
+ __wait(&w);
+ put_task_struct(current);
+
+ return 0;
+}
+
+static void bl_up_write(struct block_lock *lock)
+{
+ spin_lock(&lock->lock);
+ __del_holder(lock, current);
+ lock->count = 0;
+ if (!list_empty(&lock->waiters))
+ __wake_many(lock);
+ spin_unlock(&lock->lock);
+}
+
+static void report_recursive_bug(dm_block_t b, int r)
+{
+ if (r == -EINVAL)
+ DMERR("recursive acquisition of block %llu requested.",
+ (unsigned long long) b);
+}
+
+/*----------------------------------------------------------------*/
+
+/*
+ * Block manager is currently implemented using dm-bufio. struct
+ * dm_block_manager and struct dm_block map directly onto a couple of
+ * structs in the bufio interface. I want to retain the freedom to move
+ * away from bufio in the future. So these structs are just cast within
+ * this .c file, rather than making it through to the public interface.
+ */
+static struct dm_buffer *to_buffer(struct dm_block *b)
+{
+ return (struct dm_buffer *) b;
+}
+
+dm_block_t dm_block_location(struct dm_block *b)
+{
+ return dm_bufio_get_block_number(to_buffer(b));
+}
+EXPORT_SYMBOL_GPL(dm_block_location);
+
+void *dm_block_data(struct dm_block *b)
+{
+ return dm_bufio_get_block_data(to_buffer(b));
+}
+EXPORT_SYMBOL_GPL(dm_block_data);
+
+struct buffer_aux {
+ struct dm_block_validator *validator;
+ struct block_lock lock;
+ int write_locked;
+};
+
+static void dm_block_manager_alloc_callback(struct dm_buffer *buf)
+{
+ struct buffer_aux *aux = dm_bufio_get_aux_data(buf);
+ aux->validator = NULL;
+ bl_init(&aux->lock);
+}
+
+static void dm_block_manager_write_callback(struct dm_buffer *buf)
+{
+ struct buffer_aux *aux = dm_bufio_get_aux_data(buf);
+ if (aux->validator) {
+ aux->validator->prepare_for_write(aux->validator, (struct dm_block *) buf,
+ dm_bufio_get_block_size(dm_bufio_get_client(buf)));
+ }
+}
+
+/*----------------------------------------------------------------
+ * Public interface
+ *--------------------------------------------------------------*/
+struct dm_block_manager {
+ struct dm_bufio_client *bufio;
+ bool read_only:1;
+};
+
+struct dm_block_manager *dm_block_manager_create(struct block_device *bdev,
+ unsigned block_size,
+ unsigned cache_size,
+ unsigned max_held_per_thread)
+{
+ int r;
+ struct dm_block_manager *bm;
+
+ bm = kmalloc(sizeof(*bm), GFP_KERNEL);
+ if (!bm) {
+ r = -ENOMEM;
+ goto bad;
+ }
+
+ bm->bufio = dm_bufio_client_create(bdev, block_size, max_held_per_thread,
+ sizeof(struct buffer_aux),
+ dm_block_manager_alloc_callback,
+ dm_block_manager_write_callback);
+ if (IS_ERR(bm->bufio)) {
+ r = PTR_ERR(bm->bufio);
+ kfree(bm);
+ goto bad;
+ }
+
+ bm->read_only = false;
+
+ return bm;
+
+bad:
+ return ERR_PTR(r);
+}
+EXPORT_SYMBOL_GPL(dm_block_manager_create);
+
+void dm_block_manager_destroy(struct dm_block_manager *bm)
+{
+ dm_bufio_client_destroy(bm->bufio);
+ kfree(bm);
+}
+EXPORT_SYMBOL_GPL(dm_block_manager_destroy);
+
+unsigned dm_bm_block_size(struct dm_block_manager *bm)
+{
+ return dm_bufio_get_block_size(bm->bufio);
+}
+EXPORT_SYMBOL_GPL(dm_bm_block_size);
+
+dm_block_t dm_bm_nr_blocks(struct dm_block_manager *bm)
+{
+ return dm_bufio_get_device_size(bm->bufio);
+}
+
+static int dm_bm_validate_buffer(struct dm_block_manager *bm,
+ struct dm_buffer *buf,
+ struct buffer_aux *aux,
+ struct dm_block_validator *v)
+{
+ if (unlikely(!aux->validator)) {
+ int r;
+ if (!v)
+ return 0;
+ r = v->check(v, (struct dm_block *) buf, dm_bufio_get_block_size(bm->bufio));
+ if (unlikely(r)) {
+ DMERR_LIMIT("%s validator check failed for block %llu", v->name,
+ (unsigned long long) dm_bufio_get_block_number(buf));
+ return r;
+ }
+ aux->validator = v;
+ } else {
+ if (unlikely(aux->validator != v)) {
+ DMERR_LIMIT("validator mismatch (old=%s vs new=%s) for block %llu",
+ aux->validator->name, v ? v->name : "NULL",
+ (unsigned long long) dm_bufio_get_block_number(buf));
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+int dm_bm_read_lock(struct dm_block_manager *bm, dm_block_t b,
+ struct dm_block_validator *v,
+ struct dm_block **result)
+{
+ struct buffer_aux *aux;
+ void *p;
+ int r;
+
+ p = dm_bufio_read(bm->bufio, b, (struct dm_buffer **) result);
+ if (unlikely(IS_ERR(p)))
+ return PTR_ERR(p);
+
+ aux = dm_bufio_get_aux_data(to_buffer(*result));
+ r = bl_down_read(&aux->lock);
+ if (unlikely(r)) {
+ dm_bufio_release(to_buffer(*result));
+ report_recursive_bug(b, r);
+ return r;
+ }
+
+ aux->write_locked = 0;
+
+ r = dm_bm_validate_buffer(bm, to_buffer(*result), aux, v);
+ if (unlikely(r)) {
+ bl_up_read(&aux->lock);
+ dm_bufio_release(to_buffer(*result));
+ return r;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(dm_bm_read_lock);
+
+int dm_bm_write_lock(struct dm_block_manager *bm,
+ dm_block_t b, struct dm_block_validator *v,
+ struct dm_block **result)
+{
+ struct buffer_aux *aux;
+ void *p;
+ int r;
+
+ if (bm->read_only)
+ return -EPERM;
+
+ p = dm_bufio_read(bm->bufio, b, (struct dm_buffer **) result);
+ if (unlikely(IS_ERR(p)))
+ return PTR_ERR(p);
+
+ aux = dm_bufio_get_aux_data(to_buffer(*result));
+ r = bl_down_write(&aux->lock);
+ if (r) {
+ dm_bufio_release(to_buffer(*result));
+ report_recursive_bug(b, r);
+ return r;
+ }
+
+ aux->write_locked = 1;
+
+ r = dm_bm_validate_buffer(bm, to_buffer(*result), aux, v);
+ if (unlikely(r)) {
+ bl_up_write(&aux->lock);
+ dm_bufio_release(to_buffer(*result));
+ return r;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(dm_bm_write_lock);
+
+int dm_bm_read_try_lock(struct dm_block_manager *bm,
+ dm_block_t b, struct dm_block_validator *v,
+ struct dm_block **result)
+{
+ struct buffer_aux *aux;
+ void *p;
+ int r;
+
+ p = dm_bufio_get(bm->bufio, b, (struct dm_buffer **) result);
+ if (unlikely(IS_ERR(p)))
+ return PTR_ERR(p);
+ if (unlikely(!p))
+ return -EWOULDBLOCK;
+
+ aux = dm_bufio_get_aux_data(to_buffer(*result));
+ r = bl_down_read_nonblock(&aux->lock);
+ if (r < 0) {
+ dm_bufio_release(to_buffer(*result));
+ report_recursive_bug(b, r);
+ return r;
+ }
+ aux->write_locked = 0;
+
+ r = dm_bm_validate_buffer(bm, to_buffer(*result), aux, v);
+ if (unlikely(r)) {
+ bl_up_read(&aux->lock);
+ dm_bufio_release(to_buffer(*result));
+ return r;
+ }
+
+ return 0;
+}
+
+int dm_bm_write_lock_zero(struct dm_block_manager *bm,
+ dm_block_t b, struct dm_block_validator *v,
+ struct dm_block **result)
+{
+ int r;
+ struct buffer_aux *aux;
+ void *p;
+
+ if (bm->read_only)
+ return -EPERM;
+
+ p = dm_bufio_new(bm->bufio, b, (struct dm_buffer **) result);
+ if (unlikely(IS_ERR(p)))
+ return PTR_ERR(p);
+
+ memset(p, 0, dm_bm_block_size(bm));
+
+ aux = dm_bufio_get_aux_data(to_buffer(*result));
+ r = bl_down_write(&aux->lock);
+ if (r) {
+ dm_bufio_release(to_buffer(*result));
+ return r;
+ }
+
+ aux->write_locked = 1;
+ aux->validator = v;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(dm_bm_write_lock_zero);
+
+int dm_bm_unlock(struct dm_block *b)
+{
+ struct buffer_aux *aux;
+ aux = dm_bufio_get_aux_data(to_buffer(b));
+
+ if (aux->write_locked) {
+ dm_bufio_mark_buffer_dirty(to_buffer(b));
+ bl_up_write(&aux->lock);
+ } else
+ bl_up_read(&aux->lock);
+
+ dm_bufio_release(to_buffer(b));
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(dm_bm_unlock);
+
+int dm_bm_flush(struct dm_block_manager *bm)
+{
+ if (bm->read_only)
+ return -EPERM;
+
+ return dm_bufio_write_dirty_buffers(bm->bufio);
+}
+EXPORT_SYMBOL_GPL(dm_bm_flush);
+
+void dm_bm_prefetch(struct dm_block_manager *bm, dm_block_t b)
+{
+ dm_bufio_prefetch(bm->bufio, b, 1);
+}
+
+void dm_bm_set_read_only(struct dm_block_manager *bm)
+{
+ bm->read_only = true;
+}
+EXPORT_SYMBOL_GPL(dm_bm_set_read_only);
+
+void dm_bm_set_read_write(struct dm_block_manager *bm)
+{
+ bm->read_only = false;
+}
+EXPORT_SYMBOL_GPL(dm_bm_set_read_write);
+
+u32 dm_bm_checksum(const void *data, size_t len, u32 init_xor)
+{
+ return crc32c(~(u32) 0, data, len) ^ init_xor;
+}
+EXPORT_SYMBOL_GPL(dm_bm_checksum);
+
+/*----------------------------------------------------------------*/
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
+MODULE_DESCRIPTION("Immutable metadata library for dm");
+
+/*----------------------------------------------------------------*/
diff --git a/drivers/md/persistent-data/dm-block-manager.h b/drivers/md/persistent-data/dm-block-manager.h
new file mode 100644
index 000000000..1b95dfc17
--- /dev/null
+++ b/drivers/md/persistent-data/dm-block-manager.h
@@ -0,0 +1,133 @@
+/*
+ * Copyright (C) 2011 Red Hat, Inc.
+ *
+ * This file is released under the GPL.
+ */
+
+#ifndef _LINUX_DM_BLOCK_MANAGER_H
+#define _LINUX_DM_BLOCK_MANAGER_H
+
+#include <linux/types.h>
+#include <linux/blkdev.h>
+
+/*----------------------------------------------------------------*/
+
+/*
+ * Block number.
+ */
+typedef uint64_t dm_block_t;
+struct dm_block;
+
+dm_block_t dm_block_location(struct dm_block *b);
+void *dm_block_data(struct dm_block *b);
+
+/*----------------------------------------------------------------*/
+
+/*
+ * @name should be a unique identifier for the block manager, no longer
+ * than 32 chars.
+ *
+ * @max_held_per_thread should be the maximum number of locks, read or
+ * write, that an individual thread holds at any one time.
+ */
+struct dm_block_manager;
+struct dm_block_manager *dm_block_manager_create(
+ struct block_device *bdev, unsigned block_size,
+ unsigned cache_size, unsigned max_held_per_thread);
+void dm_block_manager_destroy(struct dm_block_manager *bm);
+
+unsigned dm_bm_block_size(struct dm_block_manager *bm);
+dm_block_t dm_bm_nr_blocks(struct dm_block_manager *bm);
+
+/*----------------------------------------------------------------*/
+
+/*
+ * The validator allows the caller to verify newly-read data and modify
+ * the data just before writing, e.g. to calculate checksums. It's
+ * important to be consistent with your use of validators. The only time
+ * you can change validators is if you call dm_bm_write_lock_zero.
+ */
+struct dm_block_validator {
+ const char *name;
+ void (*prepare_for_write)(struct dm_block_validator *v, struct dm_block *b, size_t block_size);
+
+ /*
+ * Return 0 if the checksum is valid or < 0 on error.
+ */
+ int (*check)(struct dm_block_validator *v, struct dm_block *b, size_t block_size);
+};
+
+/*----------------------------------------------------------------*/
+
+/*
+ * You can have multiple concurrent readers or a single writer holding a
+ * block lock.
+ */
+
+/*
+ * dm_bm_lock() locks a block and returns through @result a pointer to
+ * memory that holds a copy of that block. If you have write-locked the
+ * block then any changes you make to memory pointed to by @result will be
+ * written back to the disk sometime after dm_bm_unlock is called.
+ */
+int dm_bm_read_lock(struct dm_block_manager *bm, dm_block_t b,
+ struct dm_block_validator *v,
+ struct dm_block **result);
+
+int dm_bm_write_lock(struct dm_block_manager *bm, dm_block_t b,
+ struct dm_block_validator *v,
+ struct dm_block **result);
+
+/*
+ * The *_try_lock variants return -EWOULDBLOCK if the block isn't
+ * available immediately.
+ */
+int dm_bm_read_try_lock(struct dm_block_manager *bm, dm_block_t b,
+ struct dm_block_validator *v,
+ struct dm_block **result);
+
+/*
+ * Use dm_bm_write_lock_zero() when you know you're going to
+ * overwrite the block completely. It saves a disk read.
+ */
+int dm_bm_write_lock_zero(struct dm_block_manager *bm, dm_block_t b,
+ struct dm_block_validator *v,
+ struct dm_block **result);
+
+int dm_bm_unlock(struct dm_block *b);
+
+/*
+ * It's a common idiom to have a superblock that should be committed last.
+ *
+ * @superblock should be write-locked on entry. It will be unlocked during
+ * this function. All dirty blocks are guaranteed to be written and flushed
+ * before the superblock.
+ *
+ * This method always blocks.
+ */
+int dm_bm_flush(struct dm_block_manager *bm);
+
+/*
+ * Request data is prefetched into the cache.
+ */
+void dm_bm_prefetch(struct dm_block_manager *bm, dm_block_t b);
+
+/*
+ * Switches the bm to a read only mode. Once read-only mode
+ * has been entered the following functions will return -EPERM.
+ *
+ * dm_bm_write_lock
+ * dm_bm_write_lock_zero
+ * dm_bm_flush_and_unlock
+ *
+ * Additionally you should not use dm_bm_unlock_move, however no error will
+ * be returned if you do.
+ */
+void dm_bm_set_read_only(struct dm_block_manager *bm);
+void dm_bm_set_read_write(struct dm_block_manager *bm);
+
+u32 dm_bm_checksum(const void *data, size_t len, u32 init_xor);
+
+/*----------------------------------------------------------------*/
+
+#endif /* _LINUX_DM_BLOCK_MANAGER_H */
diff --git a/drivers/md/persistent-data/dm-btree-internal.h b/drivers/md/persistent-data/dm-btree-internal.h
new file mode 100644
index 000000000..bf2b80d5c
--- /dev/null
+++ b/drivers/md/persistent-data/dm-btree-internal.h
@@ -0,0 +1,141 @@
+/*
+ * Copyright (C) 2011 Red Hat, Inc.
+ *
+ * This file is released under the GPL.
+ */
+
+#ifndef DM_BTREE_INTERNAL_H
+#define DM_BTREE_INTERNAL_H
+
+#include "dm-btree.h"
+
+/*----------------------------------------------------------------*/
+
+/*
+ * We'll need 2 accessor functions for n->csum and n->blocknr
+ * to support dm-btree-spine.c in that case.
+ */
+
+enum node_flags {
+ INTERNAL_NODE = 1,
+ LEAF_NODE = 1 << 1
+};
+
+/*
+ * Every btree node begins with this structure. Make sure it's a multiple
+ * of 8-bytes in size, otherwise the 64bit keys will be mis-aligned.
+ */
+struct node_header {
+ __le32 csum;
+ __le32 flags;
+ __le64 blocknr; /* Block this node is supposed to live in. */
+
+ __le32 nr_entries;
+ __le32 max_entries;
+ __le32 value_size;
+ __le32 padding;
+} __packed;
+
+struct btree_node {
+ struct node_header header;
+ __le64 keys[0];
+} __packed;
+
+
+/*
+ * Locks a block using the btree node validator.
+ */
+int bn_read_lock(struct dm_btree_info *info, dm_block_t b,
+ struct dm_block **result);
+
+void inc_children(struct dm_transaction_manager *tm, struct btree_node *n,
+ struct dm_btree_value_type *vt);
+
+int new_block(struct dm_btree_info *info, struct dm_block **result);
+int unlock_block(struct dm_btree_info *info, struct dm_block *b);
+
+/*
+ * Spines keep track of the rolling locks. There are 2 variants, read-only
+ * and one that uses shadowing. These are separate structs to allow the
+ * type checker to spot misuse, for example accidentally calling read_lock
+ * on a shadow spine.
+ */
+struct ro_spine {
+ struct dm_btree_info *info;
+
+ int count;
+ struct dm_block *nodes[2];
+};
+
+void init_ro_spine(struct ro_spine *s, struct dm_btree_info *info);
+int exit_ro_spine(struct ro_spine *s);
+int ro_step(struct ro_spine *s, dm_block_t new_child);
+void ro_pop(struct ro_spine *s);
+struct btree_node *ro_node(struct ro_spine *s);
+
+struct shadow_spine {
+ struct dm_btree_info *info;
+
+ int count;
+ struct dm_block *nodes[2];
+
+ dm_block_t root;
+};
+
+void init_shadow_spine(struct shadow_spine *s, struct dm_btree_info *info);
+int exit_shadow_spine(struct shadow_spine *s);
+
+int shadow_step(struct shadow_spine *s, dm_block_t b,
+ struct dm_btree_value_type *vt);
+
+/*
+ * The spine must have at least one entry before calling this.
+ */
+struct dm_block *shadow_current(struct shadow_spine *s);
+
+/*
+ * The spine must have at least two entries before calling this.
+ */
+struct dm_block *shadow_parent(struct shadow_spine *s);
+
+int shadow_has_parent(struct shadow_spine *s);
+
+int shadow_root(struct shadow_spine *s);
+
+/*
+ * Some inlines.
+ */
+static inline __le64 *key_ptr(struct btree_node *n, uint32_t index)
+{
+ return n->keys + index;
+}
+
+static inline void *value_base(struct btree_node *n)
+{
+ return &n->keys[le32_to_cpu(n->header.max_entries)];
+}
+
+static inline void *value_ptr(struct btree_node *n, uint32_t index)
+{
+ uint32_t value_size = le32_to_cpu(n->header.value_size);
+ return value_base(n) + (value_size * index);
+}
+
+/*
+ * Assumes the values are suitably-aligned and converts to core format.
+ */
+static inline uint64_t value64(struct btree_node *n, uint32_t index)
+{
+ __le64 *values_le = value_base(n);
+
+ return le64_to_cpu(values_le[index]);
+}
+
+/*
+ * Searching for a key within a single node.
+ */
+int lower_bound(struct btree_node *n, uint64_t key);
+
+extern struct dm_block_validator btree_node_validator;
+
+#endif /* DM_BTREE_INTERNAL_H */
diff --git a/drivers/md/persistent-data/dm-btree-remove.c b/drivers/md/persistent-data/dm-btree-remove.c
new file mode 100644
index 000000000..a03178e91
--- /dev/null
+++ b/drivers/md/persistent-data/dm-btree-remove.c
@@ -0,0 +1,592 @@
+/*
+ * Copyright (C) 2011 Red Hat, Inc.
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm-btree.h"
+#include "dm-btree-internal.h"
+#include "dm-transaction-manager.h"
+
+#include <linux/export.h>
+
+/*
+ * Removing an entry from a btree
+ * ==============================
+ *
+ * A very important constraint for our btree is that no node, except the
+ * root, may have fewer than a certain number of entries.
+ * (MIN_ENTRIES <= nr_entries <= MAX_ENTRIES).
+ *
+ * Ensuring this is complicated by the way we want to only ever hold the
+ * locks on 2 nodes concurrently, and only change nodes in a top to bottom
+ * fashion.
+ *
+ * Each node may have a left or right sibling. When decending the spine,
+ * if a node contains only MIN_ENTRIES then we try and increase this to at
+ * least MIN_ENTRIES + 1. We do this in the following ways:
+ *
+ * [A] No siblings => this can only happen if the node is the root, in which
+ * case we copy the childs contents over the root.
+ *
+ * [B] No left sibling
+ * ==> rebalance(node, right sibling)
+ *
+ * [C] No right sibling
+ * ==> rebalance(left sibling, node)
+ *
+ * [D] Both siblings, total_entries(left, node, right) <= DEL_THRESHOLD
+ * ==> delete node adding it's contents to left and right
+ *
+ * [E] Both siblings, total_entries(left, node, right) > DEL_THRESHOLD
+ * ==> rebalance(left, node, right)
+ *
+ * After these operations it's possible that the our original node no
+ * longer contains the desired sub tree. For this reason this rebalancing
+ * is performed on the children of the current node. This also avoids
+ * having a special case for the root.
+ *
+ * Once this rebalancing has occurred we can then step into the child node
+ * for internal nodes. Or delete the entry for leaf nodes.
+ */
+
+/*
+ * Some little utilities for moving node data around.
+ */
+static void node_shift(struct btree_node *n, int shift)
+{
+ uint32_t nr_entries = le32_to_cpu(n->header.nr_entries);
+ uint32_t value_size = le32_to_cpu(n->header.value_size);
+
+ if (shift < 0) {
+ shift = -shift;
+ BUG_ON(shift > nr_entries);
+ BUG_ON((void *) key_ptr(n, shift) >= value_ptr(n, shift));
+ memmove(key_ptr(n, 0),
+ key_ptr(n, shift),
+ (nr_entries - shift) * sizeof(__le64));
+ memmove(value_ptr(n, 0),
+ value_ptr(n, shift),
+ (nr_entries - shift) * value_size);
+ } else {
+ BUG_ON(nr_entries + shift > le32_to_cpu(n->header.max_entries));
+ memmove(key_ptr(n, shift),
+ key_ptr(n, 0),
+ nr_entries * sizeof(__le64));
+ memmove(value_ptr(n, shift),
+ value_ptr(n, 0),
+ nr_entries * value_size);
+ }
+}
+
+static void node_copy(struct btree_node *left, struct btree_node *right, int shift)
+{
+ uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
+ uint32_t value_size = le32_to_cpu(left->header.value_size);
+ BUG_ON(value_size != le32_to_cpu(right->header.value_size));
+
+ if (shift < 0) {
+ shift = -shift;
+ BUG_ON(nr_left + shift > le32_to_cpu(left->header.max_entries));
+ memcpy(key_ptr(left, nr_left),
+ key_ptr(right, 0),
+ shift * sizeof(__le64));
+ memcpy(value_ptr(left, nr_left),
+ value_ptr(right, 0),
+ shift * value_size);
+ } else {
+ BUG_ON(shift > le32_to_cpu(right->header.max_entries));
+ memcpy(key_ptr(right, 0),
+ key_ptr(left, nr_left - shift),
+ shift * sizeof(__le64));
+ memcpy(value_ptr(right, 0),
+ value_ptr(left, nr_left - shift),
+ shift * value_size);
+ }
+}
+
+/*
+ * Delete a specific entry from a leaf node.
+ */
+static void delete_at(struct btree_node *n, unsigned index)
+{
+ unsigned nr_entries = le32_to_cpu(n->header.nr_entries);
+ unsigned nr_to_copy = nr_entries - (index + 1);
+ uint32_t value_size = le32_to_cpu(n->header.value_size);
+ BUG_ON(index >= nr_entries);
+
+ if (nr_to_copy) {
+ memmove(key_ptr(n, index),
+ key_ptr(n, index + 1),
+ nr_to_copy * sizeof(__le64));
+
+ memmove(value_ptr(n, index),
+ value_ptr(n, index + 1),
+ nr_to_copy * value_size);
+ }
+
+ n->header.nr_entries = cpu_to_le32(nr_entries - 1);
+}
+
+static unsigned merge_threshold(struct btree_node *n)
+{
+ return le32_to_cpu(n->header.max_entries) / 3;
+}
+
+struct child {
+ unsigned index;
+ struct dm_block *block;
+ struct btree_node *n;
+};
+
+static int init_child(struct dm_btree_info *info, struct dm_btree_value_type *vt,
+ struct btree_node *parent,
+ unsigned index, struct child *result)
+{
+ int r, inc;
+ dm_block_t root;
+
+ result->index = index;
+ root = value64(parent, index);
+
+ r = dm_tm_shadow_block(info->tm, root, &btree_node_validator,
+ &result->block, &inc);
+ if (r)
+ return r;
+
+ result->n = dm_block_data(result->block);
+
+ if (inc)
+ inc_children(info->tm, result->n, vt);
+
+ *((__le64 *) value_ptr(parent, index)) =
+ cpu_to_le64(dm_block_location(result->block));
+
+ return 0;
+}
+
+static int exit_child(struct dm_btree_info *info, struct child *c)
+{
+ return dm_tm_unlock(info->tm, c->block);
+}
+
+static void shift(struct btree_node *left, struct btree_node *right, int count)
+{
+ uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
+ uint32_t nr_right = le32_to_cpu(right->header.nr_entries);
+ uint32_t max_entries = le32_to_cpu(left->header.max_entries);
+ uint32_t r_max_entries = le32_to_cpu(right->header.max_entries);
+
+ BUG_ON(max_entries != r_max_entries);
+ BUG_ON(nr_left - count > max_entries);
+ BUG_ON(nr_right + count > max_entries);
+
+ if (!count)
+ return;
+
+ if (count > 0) {
+ node_shift(right, count);
+ node_copy(left, right, count);
+ } else {
+ node_copy(left, right, count);
+ node_shift(right, count);
+ }
+
+ left->header.nr_entries = cpu_to_le32(nr_left - count);
+ right->header.nr_entries = cpu_to_le32(nr_right + count);
+}
+
+static void __rebalance2(struct dm_btree_info *info, struct btree_node *parent,
+ struct child *l, struct child *r)
+{
+ struct btree_node *left = l->n;
+ struct btree_node *right = r->n;
+ uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
+ uint32_t nr_right = le32_to_cpu(right->header.nr_entries);
+ unsigned threshold = 2 * merge_threshold(left) + 1;
+
+ if (nr_left + nr_right < threshold) {
+ /*
+ * Merge
+ */
+ node_copy(left, right, -nr_right);
+ left->header.nr_entries = cpu_to_le32(nr_left + nr_right);
+ delete_at(parent, r->index);
+
+ /*
+ * We need to decrement the right block, but not it's
+ * children, since they're still referenced by left.
+ */
+ dm_tm_dec(info->tm, dm_block_location(r->block));
+ } else {
+ /*
+ * Rebalance.
+ */
+ unsigned target_left = (nr_left + nr_right) / 2;
+ shift(left, right, nr_left - target_left);
+ *key_ptr(parent, r->index) = right->keys[0];
+ }
+}
+
+static int rebalance2(struct shadow_spine *s, struct dm_btree_info *info,
+ struct dm_btree_value_type *vt, unsigned left_index)
+{
+ int r;
+ struct btree_node *parent;
+ struct child left, right;
+
+ parent = dm_block_data(shadow_current(s));
+
+ r = init_child(info, vt, parent, left_index, &left);
+ if (r)
+ return r;
+
+ r = init_child(info, vt, parent, left_index + 1, &right);
+ if (r) {
+ exit_child(info, &left);
+ return r;
+ }
+
+ __rebalance2(info, parent, &left, &right);
+
+ r = exit_child(info, &left);
+ if (r) {
+ exit_child(info, &right);
+ return r;
+ }
+
+ return exit_child(info, &right);
+}
+
+/*
+ * We dump as many entries from center as possible into left, then the rest
+ * in right, then rebalance2. This wastes some cpu, but I want something
+ * simple atm.
+ */
+static void delete_center_node(struct dm_btree_info *info, struct btree_node *parent,
+ struct child *l, struct child *c, struct child *r,
+ struct btree_node *left, struct btree_node *center, struct btree_node *right,
+ uint32_t nr_left, uint32_t nr_center, uint32_t nr_right)
+{
+ uint32_t max_entries = le32_to_cpu(left->header.max_entries);
+ unsigned shift = min(max_entries - nr_left, nr_center);
+
+ BUG_ON(nr_left + shift > max_entries);
+ node_copy(left, center, -shift);
+ left->header.nr_entries = cpu_to_le32(nr_left + shift);
+
+ if (shift != nr_center) {
+ shift = nr_center - shift;
+ BUG_ON((nr_right + shift) > max_entries);
+ node_shift(right, shift);
+ node_copy(center, right, shift);
+ right->header.nr_entries = cpu_to_le32(nr_right + shift);
+ }
+ *key_ptr(parent, r->index) = right->keys[0];
+
+ delete_at(parent, c->index);
+ r->index--;
+
+ dm_tm_dec(info->tm, dm_block_location(c->block));
+ __rebalance2(info, parent, l, r);
+}
+
+/*
+ * Redistributes entries among 3 sibling nodes.
+ */
+static void redistribute3(struct dm_btree_info *info, struct btree_node *parent,
+ struct child *l, struct child *c, struct child *r,
+ struct btree_node *left, struct btree_node *center, struct btree_node *right,
+ uint32_t nr_left, uint32_t nr_center, uint32_t nr_right)
+{
+ int s;
+ uint32_t max_entries = le32_to_cpu(left->header.max_entries);
+ unsigned target = (nr_left + nr_center + nr_right) / 3;
+ BUG_ON(target > max_entries);
+
+ if (nr_left < nr_right) {
+ s = nr_left - target;
+
+ if (s < 0 && nr_center < -s) {
+ /* not enough in central node */
+ shift(left, center, -nr_center);
+ s += nr_center;
+ shift(left, right, s);
+ nr_right += s;
+ } else
+ shift(left, center, s);
+
+ shift(center, right, target - nr_right);
+
+ } else {
+ s = target - nr_right;
+ if (s > 0 && nr_center < s) {
+ /* not enough in central node */
+ shift(center, right, nr_center);
+ s -= nr_center;
+ shift(left, right, s);
+ nr_left -= s;
+ } else
+ shift(center, right, s);
+
+ shift(left, center, nr_left - target);
+ }
+
+ *key_ptr(parent, c->index) = center->keys[0];
+ *key_ptr(parent, r->index) = right->keys[0];
+}
+
+static void __rebalance3(struct dm_btree_info *info, struct btree_node *parent,
+ struct child *l, struct child *c, struct child *r)
+{
+ struct btree_node *left = l->n;
+ struct btree_node *center = c->n;
+ struct btree_node *right = r->n;
+
+ uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
+ uint32_t nr_center = le32_to_cpu(center->header.nr_entries);
+ uint32_t nr_right = le32_to_cpu(right->header.nr_entries);
+
+ unsigned threshold = merge_threshold(left) * 4 + 1;
+
+ BUG_ON(left->header.max_entries != center->header.max_entries);
+ BUG_ON(center->header.max_entries != right->header.max_entries);
+
+ if ((nr_left + nr_center + nr_right) < threshold)
+ delete_center_node(info, parent, l, c, r, left, center, right,
+ nr_left, nr_center, nr_right);
+ else
+ redistribute3(info, parent, l, c, r, left, center, right,
+ nr_left, nr_center, nr_right);
+}
+
+static int rebalance3(struct shadow_spine *s, struct dm_btree_info *info,
+ struct dm_btree_value_type *vt, unsigned left_index)
+{
+ int r;
+ struct btree_node *parent = dm_block_data(shadow_current(s));
+ struct child left, center, right;
+
+ /*
+ * FIXME: fill out an array?
+ */
+ r = init_child(info, vt, parent, left_index, &left);
+ if (r)
+ return r;
+
+ r = init_child(info, vt, parent, left_index + 1, &center);
+ if (r) {
+ exit_child(info, &left);
+ return r;
+ }
+
+ r = init_child(info, vt, parent, left_index + 2, &right);
+ if (r) {
+ exit_child(info, &left);
+ exit_child(info, &center);
+ return r;
+ }
+
+ __rebalance3(info, parent, &left, &center, &right);
+
+ r = exit_child(info, &left);
+ if (r) {
+ exit_child(info, &center);
+ exit_child(info, &right);
+ return r;
+ }
+
+ r = exit_child(info, &center);
+ if (r) {
+ exit_child(info, &right);
+ return r;
+ }
+
+ r = exit_child(info, &right);
+ if (r)
+ return r;
+
+ return 0;
+}
+
+static int get_nr_entries(struct dm_transaction_manager *tm,
+ dm_block_t b, uint32_t *result)
+{
+ int r;
+ struct dm_block *block;
+ struct btree_node *n;
+
+ r = dm_tm_read_lock(tm, b, &btree_node_validator, &block);
+ if (r)
+ return r;
+
+ n = dm_block_data(block);
+ *result = le32_to_cpu(n->header.nr_entries);
+
+ return dm_tm_unlock(tm, block);
+}
+
+static int rebalance_children(struct shadow_spine *s,
+ struct dm_btree_info *info,
+ struct dm_btree_value_type *vt, uint64_t key)
+{
+ int i, r, has_left_sibling, has_right_sibling;
+ uint32_t child_entries;
+ struct btree_node *n;
+
+ n = dm_block_data(shadow_current(s));
+
+ if (le32_to_cpu(n->header.nr_entries) == 1) {
+ struct dm_block *child;
+ dm_block_t b = value64(n, 0);
+
+ r = dm_tm_read_lock(info->tm, b, &btree_node_validator, &child);
+ if (r)
+ return r;
+
+ memcpy(n, dm_block_data(child),
+ dm_bm_block_size(dm_tm_get_bm(info->tm)));
+ r = dm_tm_unlock(info->tm, child);
+ if (r)
+ return r;
+
+ dm_tm_dec(info->tm, dm_block_location(child));
+ return 0;
+ }
+
+ i = lower_bound(n, key);
+ if (i < 0)
+ return -ENODATA;
+
+ r = get_nr_entries(info->tm, value64(n, i), &child_entries);
+ if (r)
+ return r;
+
+ has_left_sibling = i > 0;
+ has_right_sibling = i < (le32_to_cpu(n->header.nr_entries) - 1);
+
+ if (!has_left_sibling)
+ r = rebalance2(s, info, vt, i);
+
+ else if (!has_right_sibling)
+ r = rebalance2(s, info, vt, i - 1);
+
+ else
+ r = rebalance3(s, info, vt, i - 1);
+
+ return r;
+}
+
+static int do_leaf(struct btree_node *n, uint64_t key, unsigned *index)
+{
+ int i = lower_bound(n, key);
+
+ if ((i < 0) ||
+ (i >= le32_to_cpu(n->header.nr_entries)) ||
+ (le64_to_cpu(n->keys[i]) != key))
+ return -ENODATA;
+
+ *index = i;
+
+ return 0;
+}
+
+/*
+ * Prepares for removal from one level of the hierarchy. The caller must
+ * call delete_at() to remove the entry at index.
+ */
+static int remove_raw(struct shadow_spine *s, struct dm_btree_info *info,
+ struct dm_btree_value_type *vt, dm_block_t root,
+ uint64_t key, unsigned *index)
+{
+ int i = *index, r;
+ struct btree_node *n;
+
+ for (;;) {
+ r = shadow_step(s, root, vt);
+ if (r < 0)
+ break;
+
+ /*
+ * We have to patch up the parent node, ugly, but I don't
+ * see a way to do this automatically as part of the spine
+ * op.
+ */
+ if (shadow_has_parent(s)) {
+ __le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));
+ memcpy(value_ptr(dm_block_data(shadow_parent(s)), i),
+ &location, sizeof(__le64));
+ }
+
+ n = dm_block_data(shadow_current(s));
+
+ if (le32_to_cpu(n->header.flags) & LEAF_NODE)
+ return do_leaf(n, key, index);
+
+ r = rebalance_children(s, info, vt, key);
+ if (r)
+ break;
+
+ n = dm_block_data(shadow_current(s));
+ if (le32_to_cpu(n->header.flags) & LEAF_NODE)
+ return do_leaf(n, key, index);
+
+ i = lower_bound(n, key);
+
+ /*
+ * We know the key is present, or else
+ * rebalance_children would have returned
+ * -ENODATA
+ */
+ root = value64(n, i);
+ }
+
+ return r;
+}
+
+static struct dm_btree_value_type le64_type = {
+ .context = NULL,
+ .size = sizeof(__le64),
+ .inc = NULL,
+ .dec = NULL,
+ .equal = NULL
+};
+
+int dm_btree_remove(struct dm_btree_info *info, dm_block_t root,
+ uint64_t *keys, dm_block_t *new_root)
+{
+ unsigned level, last_level = info->levels - 1;
+ int index = 0, r = 0;
+ struct shadow_spine spine;
+ struct btree_node *n;
+
+ init_shadow_spine(&spine, info);
+ for (level = 0; level < info->levels; level++) {
+ r = remove_raw(&spine, info,
+ (level == last_level ?
+ &info->value_type : &le64_type),
+ root, keys[level], (unsigned *)&index);
+ if (r < 0)
+ break;
+
+ n = dm_block_data(shadow_current(&spine));
+ if (level != last_level) {
+ root = value64(n, index);
+ continue;
+ }
+
+ BUG_ON(index < 0 || index >= le32_to_cpu(n->header.nr_entries));
+
+ if (info->value_type.dec)
+ info->value_type.dec(info->value_type.context,
+ value_ptr(n, index));
+
+ delete_at(n, index);
+ }
+
+ *new_root = shadow_root(&spine);
+ exit_shadow_spine(&spine);
+
+ return r;
+}
+EXPORT_SYMBOL_GPL(dm_btree_remove);
diff --git a/drivers/md/persistent-data/dm-btree-spine.c b/drivers/md/persistent-data/dm-btree-spine.c
new file mode 100644
index 000000000..1b5e13ec7
--- /dev/null
+++ b/drivers/md/persistent-data/dm-btree-spine.c
@@ -0,0 +1,251 @@
+/*
+ * Copyright (C) 2011 Red Hat, Inc.
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm-btree-internal.h"
+#include "dm-transaction-manager.h"
+
+#include <linux/device-mapper.h>
+
+#define DM_MSG_PREFIX "btree spine"
+
+/*----------------------------------------------------------------*/
+
+#define BTREE_CSUM_XOR 121107
+
+static int node_check(struct dm_block_validator *v,
+ struct dm_block *b,
+ size_t block_size);
+
+static void node_prepare_for_write(struct dm_block_validator *v,
+ struct dm_block *b,
+ size_t block_size)
+{
+ struct btree_node *n = dm_block_data(b);
+ struct node_header *h = &n->header;
+
+ h->blocknr = cpu_to_le64(dm_block_location(b));
+ h->csum = cpu_to_le32(dm_bm_checksum(&h->flags,
+ block_size - sizeof(__le32),
+ BTREE_CSUM_XOR));
+
+ BUG_ON(node_check(v, b, 4096));
+}
+
+static int node_check(struct dm_block_validator *v,
+ struct dm_block *b,
+ size_t block_size)
+{
+ struct btree_node *n = dm_block_data(b);
+ struct node_header *h = &n->header;
+ size_t value_size;
+ __le32 csum_disk;
+ uint32_t flags;
+
+ if (dm_block_location(b) != le64_to_cpu(h->blocknr)) {
+ DMERR_LIMIT("node_check failed: blocknr %llu != wanted %llu",
+ le64_to_cpu(h->blocknr), dm_block_location(b));
+ return -ENOTBLK;
+ }
+
+ csum_disk = cpu_to_le32(dm_bm_checksum(&h->flags,
+ block_size - sizeof(__le32),
+ BTREE_CSUM_XOR));
+ if (csum_disk != h->csum) {
+ DMERR_LIMIT("node_check failed: csum %u != wanted %u",
+ le32_to_cpu(csum_disk), le32_to_cpu(h->csum));
+ return -EILSEQ;
+ }
+
+ value_size = le32_to_cpu(h->value_size);
+
+ if (sizeof(struct node_header) +
+ (sizeof(__le64) + value_size) * le32_to_cpu(h->max_entries) > block_size) {
+ DMERR_LIMIT("node_check failed: max_entries too large");
+ return -EILSEQ;
+ }
+
+ if (le32_to_cpu(h->nr_entries) > le32_to_cpu(h->max_entries)) {
+ DMERR_LIMIT("node_check failed: too many entries");
+ return -EILSEQ;
+ }
+
+ /*
+ * The node must be either INTERNAL or LEAF.
+ */
+ flags = le32_to_cpu(h->flags);
+ if (!(flags & INTERNAL_NODE) && !(flags & LEAF_NODE)) {
+ DMERR_LIMIT("node_check failed: node is neither INTERNAL or LEAF");
+ return -EILSEQ;
+ }
+
+ return 0;
+}
+
+struct dm_block_validator btree_node_validator = {
+ .name = "btree_node",
+ .prepare_for_write = node_prepare_for_write,
+ .check = node_check
+};
+
+/*----------------------------------------------------------------*/
+
+int bn_read_lock(struct dm_btree_info *info, dm_block_t b,
+ struct dm_block **result)
+{
+ return dm_tm_read_lock(info->tm, b, &btree_node_validator, result);
+}
+
+static int bn_shadow(struct dm_btree_info *info, dm_block_t orig,
+ struct dm_btree_value_type *vt,
+ struct dm_block **result)
+{
+ int r, inc;
+
+ r = dm_tm_shadow_block(info->tm, orig, &btree_node_validator,
+ result, &inc);
+ if (!r && inc)
+ inc_children(info->tm, dm_block_data(*result), vt);
+
+ return r;
+}
+
+int new_block(struct dm_btree_info *info, struct dm_block **result)
+{
+ return dm_tm_new_block(info->tm, &btree_node_validator, result);
+}
+
+int unlock_block(struct dm_btree_info *info, struct dm_block *b)
+{
+ return dm_tm_unlock(info->tm, b);
+}
+
+/*----------------------------------------------------------------*/
+
+void init_ro_spine(struct ro_spine *s, struct dm_btree_info *info)
+{
+ s->info = info;
+ s->count = 0;
+ s->nodes[0] = NULL;
+ s->nodes[1] = NULL;
+}
+
+int exit_ro_spine(struct ro_spine *s)
+{
+ int r = 0, i;
+
+ for (i = 0; i < s->count; i++) {
+ int r2 = unlock_block(s->info, s->nodes[i]);
+ if (r2 < 0)
+ r = r2;
+ }
+
+ return r;
+}
+
+int ro_step(struct ro_spine *s, dm_block_t new_child)
+{
+ int r;
+
+ if (s->count == 2) {
+ r = unlock_block(s->info, s->nodes[0]);
+ if (r < 0)
+ return r;
+ s->nodes[0] = s->nodes[1];
+ s->count--;
+ }
+
+ r = bn_read_lock(s->info, new_child, s->nodes + s->count);
+ if (!r)
+ s->count++;
+
+ return r;
+}
+
+void ro_pop(struct ro_spine *s)
+{
+ BUG_ON(!s->count);
+ --s->count;
+ unlock_block(s->info, s->nodes[s->count]);
+}
+
+struct btree_node *ro_node(struct ro_spine *s)
+{
+ struct dm_block *block;
+
+ BUG_ON(!s->count);
+ block = s->nodes[s->count - 1];
+
+ return dm_block_data(block);
+}
+
+/*----------------------------------------------------------------*/
+
+void init_shadow_spine(struct shadow_spine *s, struct dm_btree_info *info)
+{
+ s->info = info;
+ s->count = 0;
+}
+
+int exit_shadow_spine(struct shadow_spine *s)
+{
+ int r = 0, i;
+
+ for (i = 0; i < s->count; i++) {
+ int r2 = unlock_block(s->info, s->nodes[i]);
+ if (r2 < 0)
+ r = r2;
+ }
+
+ return r;
+}
+
+int shadow_step(struct shadow_spine *s, dm_block_t b,
+ struct dm_btree_value_type *vt)
+{
+ int r;
+
+ if (s->count == 2) {
+ r = unlock_block(s->info, s->nodes[0]);
+ if (r < 0)
+ return r;
+ s->nodes[0] = s->nodes[1];
+ s->count--;
+ }
+
+ r = bn_shadow(s->info, b, vt, s->nodes + s->count);
+ if (!r) {
+ if (!s->count)
+ s->root = dm_block_location(s->nodes[0]);
+
+ s->count++;
+ }
+
+ return r;
+}
+
+struct dm_block *shadow_current(struct shadow_spine *s)
+{
+ BUG_ON(!s->count);
+
+ return s->nodes[s->count - 1];
+}
+
+struct dm_block *shadow_parent(struct shadow_spine *s)
+{
+ BUG_ON(s->count != 2);
+
+ return s->count == 2 ? s->nodes[0] : NULL;
+}
+
+int shadow_has_parent(struct shadow_spine *s)
+{
+ return s->count >= 2;
+}
+
+int shadow_root(struct shadow_spine *s)
+{
+ return s->root;
+}
diff --git a/drivers/md/persistent-data/dm-btree.c b/drivers/md/persistent-data/dm-btree.c
new file mode 100644
index 000000000..fdd3793e2
--- /dev/null
+++ b/drivers/md/persistent-data/dm-btree.c
@@ -0,0 +1,892 @@
+/*
+ * Copyright (C) 2011 Red Hat, Inc.
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm-btree-internal.h"
+#include "dm-space-map.h"
+#include "dm-transaction-manager.h"
+
+#include <linux/export.h>
+#include <linux/device-mapper.h>
+
+#define DM_MSG_PREFIX "btree"
+
+/*----------------------------------------------------------------
+ * Array manipulation
+ *--------------------------------------------------------------*/
+static void memcpy_disk(void *dest, const void *src, size_t len)
+ __dm_written_to_disk(src)
+{
+ memcpy(dest, src, len);
+ __dm_unbless_for_disk(src);
+}
+
+static void array_insert(void *base, size_t elt_size, unsigned nr_elts,
+ unsigned index, void *elt)
+ __dm_written_to_disk(elt)
+{
+ if (index < nr_elts)
+ memmove(base + (elt_size * (index + 1)),
+ base + (elt_size * index),
+ (nr_elts - index) * elt_size);
+
+ memcpy_disk(base + (elt_size * index), elt, elt_size);
+}
+
+/*----------------------------------------------------------------*/
+
+/* makes the assumption that no two keys are the same. */
+static int bsearch(struct btree_node *n, uint64_t key, int want_hi)
+{
+ int lo = -1, hi = le32_to_cpu(n->header.nr_entries);
+
+ while (hi - lo > 1) {
+ int mid = lo + ((hi - lo) / 2);
+ uint64_t mid_key = le64_to_cpu(n->keys[mid]);
+
+ if (mid_key == key)
+ return mid;
+
+ if (mid_key < key)
+ lo = mid;
+ else
+ hi = mid;
+ }
+
+ return want_hi ? hi : lo;
+}
+
+int lower_bound(struct btree_node *n, uint64_t key)
+{
+ return bsearch(n, key, 0);
+}
+
+void inc_children(struct dm_transaction_manager *tm, struct btree_node *n,
+ struct dm_btree_value_type *vt)
+{
+ unsigned i;
+ uint32_t nr_entries = le32_to_cpu(n->header.nr_entries);
+
+ if (le32_to_cpu(n->header.flags) & INTERNAL_NODE)
+ for (i = 0; i < nr_entries; i++)
+ dm_tm_inc(tm, value64(n, i));
+ else if (vt->inc)
+ for (i = 0; i < nr_entries; i++)
+ vt->inc(vt->context, value_ptr(n, i));
+}
+
+static int insert_at(size_t value_size, struct btree_node *node, unsigned index,
+ uint64_t key, void *value)
+ __dm_written_to_disk(value)
+{
+ uint32_t nr_entries = le32_to_cpu(node->header.nr_entries);
+ __le64 key_le = cpu_to_le64(key);
+
+ if (index > nr_entries ||
+ index >= le32_to_cpu(node->header.max_entries)) {
+ DMERR("too many entries in btree node for insert");
+ __dm_unbless_for_disk(value);
+ return -ENOMEM;
+ }
+
+ __dm_bless_for_disk(&key_le);
+
+ array_insert(node->keys, sizeof(*node->keys), nr_entries, index, &key_le);
+ array_insert(value_base(node), value_size, nr_entries, index, value);
+ node->header.nr_entries = cpu_to_le32(nr_entries + 1);
+
+ return 0;
+}
+
+/*----------------------------------------------------------------*/
+
+/*
+ * We want 3n entries (for some n). This works more nicely for repeated
+ * insert remove loops than (2n + 1).
+ */
+static uint32_t calc_max_entries(size_t value_size, size_t block_size)
+{
+ uint32_t total, n;
+ size_t elt_size = sizeof(uint64_t) + value_size; /* key + value */
+
+ block_size -= sizeof(struct node_header);
+ total = block_size / elt_size;
+ n = total / 3; /* rounds down */
+
+ return 3 * n;
+}
+
+int dm_btree_empty(struct dm_btree_info *info, dm_block_t *root)
+{
+ int r;
+ struct dm_block *b;
+ struct btree_node *n;
+ size_t block_size;
+ uint32_t max_entries;
+
+ r = new_block(info, &b);
+ if (r < 0)
+ return r;
+
+ block_size = dm_bm_block_size(dm_tm_get_bm(info->tm));
+ max_entries = calc_max_entries(info->value_type.size, block_size);
+
+ n = dm_block_data(b);
+ memset(n, 0, block_size);
+ n->header.flags = cpu_to_le32(LEAF_NODE);
+ n->header.nr_entries = cpu_to_le32(0);
+ n->header.max_entries = cpu_to_le32(max_entries);
+ n->header.value_size = cpu_to_le32(info->value_type.size);
+
+ *root = dm_block_location(b);
+ return unlock_block(info, b);
+}
+EXPORT_SYMBOL_GPL(dm_btree_empty);
+
+/*----------------------------------------------------------------*/
+
+/*
+ * Deletion uses a recursive algorithm, since we have limited stack space
+ * we explicitly manage our own stack on the heap.
+ */
+#define MAX_SPINE_DEPTH 64
+struct frame {
+ struct dm_block *b;
+ struct btree_node *n;
+ unsigned level;
+ unsigned nr_children;
+ unsigned current_child;
+};
+
+struct del_stack {
+ struct dm_btree_info *info;
+ struct dm_transaction_manager *tm;
+ int top;
+ struct frame spine[MAX_SPINE_DEPTH];
+};
+
+static int top_frame(struct del_stack *s, struct frame **f)
+{
+ if (s->top < 0) {
+ DMERR("btree deletion stack empty");
+ return -EINVAL;
+ }
+
+ *f = s->spine + s->top;
+
+ return 0;
+}
+
+static int unprocessed_frames(struct del_stack *s)
+{
+ return s->top >= 0;
+}
+
+static void prefetch_children(struct del_stack *s, struct frame *f)
+{
+ unsigned i;
+ struct dm_block_manager *bm = dm_tm_get_bm(s->tm);
+
+ for (i = 0; i < f->nr_children; i++)
+ dm_bm_prefetch(bm, value64(f->n, i));
+}
+
+static bool is_internal_level(struct dm_btree_info *info, struct frame *f)
+{
+ return f->level < (info->levels - 1);
+}
+
+static int push_frame(struct del_stack *s, dm_block_t b, unsigned level)
+{
+ int r;
+ uint32_t ref_count;
+
+ if (s->top >= MAX_SPINE_DEPTH - 1) {
+ DMERR("btree deletion stack out of memory");
+ return -ENOMEM;
+ }
+
+ r = dm_tm_ref(s->tm, b, &ref_count);
+ if (r)
+ return r;
+
+ if (ref_count > 1)
+ /*
+ * This is a shared node, so we can just decrement it's
+ * reference counter and leave the children.
+ */
+ dm_tm_dec(s->tm, b);
+
+ else {
+ uint32_t flags;
+ struct frame *f = s->spine + ++s->top;
+
+ r = dm_tm_read_lock(s->tm, b, &btree_node_validator, &f->b);
+ if (r) {
+ s->top--;
+ return r;
+ }
+
+ f->n = dm_block_data(f->b);
+ f->level = level;
+ f->nr_children = le32_to_cpu(f->n->header.nr_entries);
+ f->current_child = 0;
+
+ flags = le32_to_cpu(f->n->header.flags);
+ if (flags & INTERNAL_NODE || is_internal_level(s->info, f))
+ prefetch_children(s, f);
+ }
+
+ return 0;
+}
+
+static void pop_frame(struct del_stack *s)
+{
+ struct frame *f = s->spine + s->top--;
+
+ dm_tm_dec(s->tm, dm_block_location(f->b));
+ dm_tm_unlock(s->tm, f->b);
+}
+
+int dm_btree_del(struct dm_btree_info *info, dm_block_t root)
+{
+ int r;
+ struct del_stack *s;
+
+ s = kmalloc(sizeof(*s), GFP_NOIO);
+ if (!s)
+ return -ENOMEM;
+ s->info = info;
+ s->tm = info->tm;
+ s->top = -1;
+
+ r = push_frame(s, root, 0);
+ if (r)
+ goto out;
+
+ while (unprocessed_frames(s)) {
+ uint32_t flags;
+ struct frame *f;
+ dm_block_t b;
+
+ r = top_frame(s, &f);
+ if (r)
+ goto out;
+
+ if (f->current_child >= f->nr_children) {
+ pop_frame(s);
+ continue;
+ }
+
+ flags = le32_to_cpu(f->n->header.flags);
+ if (flags & INTERNAL_NODE) {
+ b = value64(f->n, f->current_child);
+ f->current_child++;
+ r = push_frame(s, b, f->level);
+ if (r)
+ goto out;
+
+ } else if (is_internal_level(info, f)) {
+ b = value64(f->n, f->current_child);
+ f->current_child++;
+ r = push_frame(s, b, f->level + 1);
+ if (r)
+ goto out;
+
+ } else {
+ if (info->value_type.dec) {
+ unsigned i;
+
+ for (i = 0; i < f->nr_children; i++)
+ info->value_type.dec(info->value_type.context,
+ value_ptr(f->n, i));
+ }
+ pop_frame(s);
+ }
+ }
+
+out:
+ kfree(s);
+ return r;
+}
+EXPORT_SYMBOL_GPL(dm_btree_del);
+
+/*----------------------------------------------------------------*/
+
+static int btree_lookup_raw(struct ro_spine *s, dm_block_t block, uint64_t key,
+ int (*search_fn)(struct btree_node *, uint64_t),
+ uint64_t *result_key, void *v, size_t value_size)
+{
+ int i, r;
+ uint32_t flags, nr_entries;
+
+ do {
+ r = ro_step(s, block);
+ if (r < 0)
+ return r;
+
+ i = search_fn(ro_node(s), key);
+
+ flags = le32_to_cpu(ro_node(s)->header.flags);
+ nr_entries = le32_to_cpu(ro_node(s)->header.nr_entries);
+ if (i < 0 || i >= nr_entries)
+ return -ENODATA;
+
+ if (flags & INTERNAL_NODE)
+ block = value64(ro_node(s), i);
+
+ } while (!(flags & LEAF_NODE));
+
+ *result_key = le64_to_cpu(ro_node(s)->keys[i]);
+ memcpy(v, value_ptr(ro_node(s), i), value_size);
+
+ return 0;
+}
+
+int dm_btree_lookup(struct dm_btree_info *info, dm_block_t root,
+ uint64_t *keys, void *value_le)
+{
+ unsigned level, last_level = info->levels - 1;
+ int r = -ENODATA;
+ uint64_t rkey;
+ __le64 internal_value_le;
+ struct ro_spine spine;
+
+ init_ro_spine(&spine, info);
+ for (level = 0; level < info->levels; level++) {
+ size_t size;
+ void *value_p;
+
+ if (level == last_level) {
+ value_p = value_le;
+ size = info->value_type.size;
+
+ } else {
+ value_p = &internal_value_le;
+ size = sizeof(uint64_t);
+ }
+
+ r = btree_lookup_raw(&spine, root, keys[level],
+ lower_bound, &rkey,
+ value_p, size);
+
+ if (!r) {
+ if (rkey != keys[level]) {
+ exit_ro_spine(&spine);
+ return -ENODATA;
+ }
+ } else {
+ exit_ro_spine(&spine);
+ return r;
+ }
+
+ root = le64_to_cpu(internal_value_le);
+ }
+ exit_ro_spine(&spine);
+
+ return r;
+}
+EXPORT_SYMBOL_GPL(dm_btree_lookup);
+
+/*
+ * Splits a node by creating a sibling node and shifting half the nodes
+ * contents across. Assumes there is a parent node, and it has room for
+ * another child.
+ *
+ * Before:
+ * +--------+
+ * | Parent |
+ * +--------+
+ * |
+ * v
+ * +----------+
+ * | A ++++++ |
+ * +----------+
+ *
+ *
+ * After:
+ * +--------+
+ * | Parent |
+ * +--------+
+ * | |
+ * v +------+
+ * +---------+ |
+ * | A* +++ | v
+ * +---------+ +-------+
+ * | B +++ |
+ * +-------+
+ *
+ * Where A* is a shadow of A.
+ */
+static int btree_split_sibling(struct shadow_spine *s, dm_block_t root,
+ unsigned parent_index, uint64_t key)
+{
+ int r;
+ size_t size;
+ unsigned nr_left, nr_right;
+ struct dm_block *left, *right, *parent;
+ struct btree_node *ln, *rn, *pn;
+ __le64 location;
+
+ left = shadow_current(s);
+
+ r = new_block(s->info, &right);
+ if (r < 0)
+ return r;
+
+ ln = dm_block_data(left);
+ rn = dm_block_data(right);
+
+ nr_left = le32_to_cpu(ln->header.nr_entries) / 2;
+ nr_right = le32_to_cpu(ln->header.nr_entries) - nr_left;
+
+ ln->header.nr_entries = cpu_to_le32(nr_left);
+
+ rn->header.flags = ln->header.flags;
+ rn->header.nr_entries = cpu_to_le32(nr_right);
+ rn->header.max_entries = ln->header.max_entries;
+ rn->header.value_size = ln->header.value_size;
+ memcpy(rn->keys, ln->keys + nr_left, nr_right * sizeof(rn->keys[0]));
+
+ size = le32_to_cpu(ln->header.flags) & INTERNAL_NODE ?
+ sizeof(uint64_t) : s->info->value_type.size;
+ memcpy(value_ptr(rn, 0), value_ptr(ln, nr_left),
+ size * nr_right);
+
+ /*
+ * Patch up the parent
+ */
+ parent = shadow_parent(s);
+
+ pn = dm_block_data(parent);
+ location = cpu_to_le64(dm_block_location(left));
+ __dm_bless_for_disk(&location);
+ memcpy_disk(value_ptr(pn, parent_index),
+ &location, sizeof(__le64));
+
+ location = cpu_to_le64(dm_block_location(right));
+ __dm_bless_for_disk(&location);
+
+ r = insert_at(sizeof(__le64), pn, parent_index + 1,
+ le64_to_cpu(rn->keys[0]), &location);
+ if (r)
+ return r;
+
+ if (key < le64_to_cpu(rn->keys[0])) {
+ unlock_block(s->info, right);
+ s->nodes[1] = left;
+ } else {
+ unlock_block(s->info, left);
+ s->nodes[1] = right;
+ }
+
+ return 0;
+}
+
+/*
+ * Splits a node by creating two new children beneath the given node.
+ *
+ * Before:
+ * +----------+
+ * | A ++++++ |
+ * +----------+
+ *
+ *
+ * After:
+ * +------------+
+ * | A (shadow) |
+ * +------------+
+ * | |
+ * +------+ +----+
+ * | |
+ * v v
+ * +-------+ +-------+
+ * | B +++ | | C +++ |
+ * +-------+ +-------+
+ */
+static int btree_split_beneath(struct shadow_spine *s, uint64_t key)
+{
+ int r;
+ size_t size;
+ unsigned nr_left, nr_right;
+ struct dm_block *left, *right, *new_parent;
+ struct btree_node *pn, *ln, *rn;
+ __le64 val;
+
+ new_parent = shadow_current(s);
+
+ r = new_block(s->info, &left);
+ if (r < 0)
+ return r;
+
+ r = new_block(s->info, &right);
+ if (r < 0) {
+ /* FIXME: put left */
+ return r;
+ }
+
+ pn = dm_block_data(new_parent);
+ ln = dm_block_data(left);
+ rn = dm_block_data(right);
+
+ nr_left = le32_to_cpu(pn->header.nr_entries) / 2;
+ nr_right = le32_to_cpu(pn->header.nr_entries) - nr_left;
+
+ ln->header.flags = pn->header.flags;
+ ln->header.nr_entries = cpu_to_le32(nr_left);
+ ln->header.max_entries = pn->header.max_entries;
+ ln->header.value_size = pn->header.value_size;
+
+ rn->header.flags = pn->header.flags;
+ rn->header.nr_entries = cpu_to_le32(nr_right);
+ rn->header.max_entries = pn->header.max_entries;
+ rn->header.value_size = pn->header.value_size;
+
+ memcpy(ln->keys, pn->keys, nr_left * sizeof(pn->keys[0]));
+ memcpy(rn->keys, pn->keys + nr_left, nr_right * sizeof(pn->keys[0]));
+
+ size = le32_to_cpu(pn->header.flags) & INTERNAL_NODE ?
+ sizeof(__le64) : s->info->value_type.size;
+ memcpy(value_ptr(ln, 0), value_ptr(pn, 0), nr_left * size);
+ memcpy(value_ptr(rn, 0), value_ptr(pn, nr_left),
+ nr_right * size);
+
+ /* new_parent should just point to l and r now */
+ pn->header.flags = cpu_to_le32(INTERNAL_NODE);
+ pn->header.nr_entries = cpu_to_le32(2);
+ pn->header.max_entries = cpu_to_le32(
+ calc_max_entries(sizeof(__le64),
+ dm_bm_block_size(
+ dm_tm_get_bm(s->info->tm))));
+ pn->header.value_size = cpu_to_le32(sizeof(__le64));
+
+ val = cpu_to_le64(dm_block_location(left));
+ __dm_bless_for_disk(&val);
+ pn->keys[0] = ln->keys[0];
+ memcpy_disk(value_ptr(pn, 0), &val, sizeof(__le64));
+
+ val = cpu_to_le64(dm_block_location(right));
+ __dm_bless_for_disk(&val);
+ pn->keys[1] = rn->keys[0];
+ memcpy_disk(value_ptr(pn, 1), &val, sizeof(__le64));
+
+ /*
+ * rejig the spine. This is ugly, since it knows too
+ * much about the spine
+ */
+ if (s->nodes[0] != new_parent) {
+ unlock_block(s->info, s->nodes[0]);
+ s->nodes[0] = new_parent;
+ }
+ if (key < le64_to_cpu(rn->keys[0])) {
+ unlock_block(s->info, right);
+ s->nodes[1] = left;
+ } else {
+ unlock_block(s->info, left);
+ s->nodes[1] = right;
+ }
+ s->count = 2;
+
+ return 0;
+}
+
+static int btree_insert_raw(struct shadow_spine *s, dm_block_t root,
+ struct dm_btree_value_type *vt,
+ uint64_t key, unsigned *index)
+{
+ int r, i = *index, top = 1;
+ struct btree_node *node;
+
+ for (;;) {
+ r = shadow_step(s, root, vt);
+ if (r < 0)
+ return r;
+
+ node = dm_block_data(shadow_current(s));
+
+ /*
+ * We have to patch up the parent node, ugly, but I don't
+ * see a way to do this automatically as part of the spine
+ * op.
+ */
+ if (shadow_has_parent(s) && i >= 0) { /* FIXME: second clause unness. */
+ __le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));
+
+ __dm_bless_for_disk(&location);
+ memcpy_disk(value_ptr(dm_block_data(shadow_parent(s)), i),
+ &location, sizeof(__le64));
+ }
+
+ node = dm_block_data(shadow_current(s));
+
+ if (node->header.nr_entries == node->header.max_entries) {
+ if (top)
+ r = btree_split_beneath(s, key);
+ else
+ r = btree_split_sibling(s, root, i, key);
+
+ if (r < 0)
+ return r;
+ }
+
+ node = dm_block_data(shadow_current(s));
+
+ i = lower_bound(node, key);
+
+ if (le32_to_cpu(node->header.flags) & LEAF_NODE)
+ break;
+
+ if (i < 0) {
+ /* change the bounds on the lowest key */
+ node->keys[0] = cpu_to_le64(key);
+ i = 0;
+ }
+
+ root = value64(node, i);
+ top = 0;
+ }
+
+ if (i < 0 || le64_to_cpu(node->keys[i]) != key)
+ i++;
+
+ *index = i;
+ return 0;
+}
+
+static int insert(struct dm_btree_info *info, dm_block_t root,
+ uint64_t *keys, void *value, dm_block_t *new_root,
+ int *inserted)
+ __dm_written_to_disk(value)
+{
+ int r, need_insert;
+ unsigned level, index = -1, last_level = info->levels - 1;
+ dm_block_t block = root;
+ struct shadow_spine spine;
+ struct btree_node *n;
+ struct dm_btree_value_type le64_type;
+
+ le64_type.context = NULL;
+ le64_type.size = sizeof(__le64);
+ le64_type.inc = NULL;
+ le64_type.dec = NULL;
+ le64_type.equal = NULL;
+
+ init_shadow_spine(&spine, info);
+
+ for (level = 0; level < (info->levels - 1); level++) {
+ r = btree_insert_raw(&spine, block, &le64_type, keys[level], &index);
+ if (r < 0)
+ goto bad;
+
+ n = dm_block_data(shadow_current(&spine));
+ need_insert = ((index >= le32_to_cpu(n->header.nr_entries)) ||
+ (le64_to_cpu(n->keys[index]) != keys[level]));
+
+ if (need_insert) {
+ dm_block_t new_tree;
+ __le64 new_le;
+
+ r = dm_btree_empty(info, &new_tree);
+ if (r < 0)
+ goto bad;
+
+ new_le = cpu_to_le64(new_tree);
+ __dm_bless_for_disk(&new_le);
+
+ r = insert_at(sizeof(uint64_t), n, index,
+ keys[level], &new_le);
+ if (r)
+ goto bad;
+ }
+
+ if (level < last_level)
+ block = value64(n, index);
+ }
+
+ r = btree_insert_raw(&spine, block, &info->value_type,
+ keys[level], &index);
+ if (r < 0)
+ goto bad;
+
+ n = dm_block_data(shadow_current(&spine));
+ need_insert = ((index >= le32_to_cpu(n->header.nr_entries)) ||
+ (le64_to_cpu(n->keys[index]) != keys[level]));
+
+ if (need_insert) {
+ if (inserted)
+ *inserted = 1;
+
+ r = insert_at(info->value_type.size, n, index,
+ keys[level], value);
+ if (r)
+ goto bad_unblessed;
+ } else {
+ if (inserted)
+ *inserted = 0;
+
+ if (info->value_type.dec &&
+ (!info->value_type.equal ||
+ !info->value_type.equal(
+ info->value_type.context,
+ value_ptr(n, index),
+ value))) {
+ info->value_type.dec(info->value_type.context,
+ value_ptr(n, index));
+ }
+ memcpy_disk(value_ptr(n, index),
+ value, info->value_type.size);
+ }
+
+ *new_root = shadow_root(&spine);
+ exit_shadow_spine(&spine);
+
+ return 0;
+
+bad:
+ __dm_unbless_for_disk(value);
+bad_unblessed:
+ exit_shadow_spine(&spine);
+ return r;
+}
+
+int dm_btree_insert(struct dm_btree_info *info, dm_block_t root,
+ uint64_t *keys, void *value, dm_block_t *new_root)
+ __dm_written_to_disk(value)
+{
+ return insert(info, root, keys, value, new_root, NULL);
+}
+EXPORT_SYMBOL_GPL(dm_btree_insert);
+
+int dm_btree_insert_notify(struct dm_btree_info *info, dm_block_t root,
+ uint64_t *keys, void *value, dm_block_t *new_root,
+ int *inserted)
+ __dm_written_to_disk(value)
+{
+ return insert(info, root, keys, value, new_root, inserted);
+}
+EXPORT_SYMBOL_GPL(dm_btree_insert_notify);
+
+/*----------------------------------------------------------------*/
+
+static int find_key(struct ro_spine *s, dm_block_t block, bool find_highest,
+ uint64_t *result_key, dm_block_t *next_block)
+{
+ int i, r;
+ uint32_t flags;
+
+ do {
+ r = ro_step(s, block);
+ if (r < 0)
+ return r;
+
+ flags = le32_to_cpu(ro_node(s)->header.flags);
+ i = le32_to_cpu(ro_node(s)->header.nr_entries);
+ if (!i)
+ return -ENODATA;
+ else
+ i--;
+
+ if (find_highest)
+ *result_key = le64_to_cpu(ro_node(s)->keys[i]);
+ else
+ *result_key = le64_to_cpu(ro_node(s)->keys[0]);
+
+ if (next_block || flags & INTERNAL_NODE)
+ block = value64(ro_node(s), i);
+
+ } while (flags & INTERNAL_NODE);
+
+ if (next_block)
+ *next_block = block;
+ return 0;
+}
+
+static int dm_btree_find_key(struct dm_btree_info *info, dm_block_t root,
+ bool find_highest, uint64_t *result_keys)
+{
+ int r = 0, count = 0, level;
+ struct ro_spine spine;
+
+ init_ro_spine(&spine, info);
+ for (level = 0; level < info->levels; level++) {
+ r = find_key(&spine, root, find_highest, result_keys + level,
+ level == info->levels - 1 ? NULL : &root);
+ if (r == -ENODATA) {
+ r = 0;
+ break;
+
+ } else if (r)
+ break;
+
+ count++;
+ }
+ exit_ro_spine(&spine);
+
+ return r ? r : count;
+}
+
+int dm_btree_find_highest_key(struct dm_btree_info *info, dm_block_t root,
+ uint64_t *result_keys)
+{
+ return dm_btree_find_key(info, root, true, result_keys);
+}
+EXPORT_SYMBOL_GPL(dm_btree_find_highest_key);
+
+int dm_btree_find_lowest_key(struct dm_btree_info *info, dm_block_t root,
+ uint64_t *result_keys)
+{
+ return dm_btree_find_key(info, root, false, result_keys);
+}
+EXPORT_SYMBOL_GPL(dm_btree_find_lowest_key);
+
+/*----------------------------------------------------------------*/
+
+/*
+ * FIXME: We shouldn't use a recursive algorithm when we have limited stack
+ * space. Also this only works for single level trees.
+ */
+static int walk_node(struct dm_btree_info *info, dm_block_t block,
+ int (*fn)(void *context, uint64_t *keys, void *leaf),
+ void *context)
+{
+ int r;
+ unsigned i, nr;
+ struct dm_block *node;
+ struct btree_node *n;
+ uint64_t keys;
+
+ r = bn_read_lock(info, block, &node);
+ if (r)
+ return r;
+
+ n = dm_block_data(node);
+
+ nr = le32_to_cpu(n->header.nr_entries);
+ for (i = 0; i < nr; i++) {
+ if (le32_to_cpu(n->header.flags) & INTERNAL_NODE) {
+ r = walk_node(info, value64(n, i), fn, context);
+ if (r)
+ goto out;
+ } else {
+ keys = le64_to_cpu(*key_ptr(n, i));
+ r = fn(context, &keys, value_ptr(n, i));
+ if (r)
+ goto out;
+ }
+ }
+
+out:
+ dm_tm_unlock(info->tm, node);
+ return r;
+}
+
+int dm_btree_walk(struct dm_btree_info *info, dm_block_t root,
+ int (*fn)(void *context, uint64_t *keys, void *leaf),
+ void *context)
+{
+ BUG_ON(info->levels > 1);
+ return walk_node(info, root, fn, context);
+}
+EXPORT_SYMBOL_GPL(dm_btree_walk);
diff --git a/drivers/md/persistent-data/dm-btree.h b/drivers/md/persistent-data/dm-btree.h
new file mode 100644
index 000000000..dacfc3418
--- /dev/null
+++ b/drivers/md/persistent-data/dm-btree.h
@@ -0,0 +1,162 @@
+/*
+ * Copyright (C) 2011 Red Hat, Inc.
+ *
+ * This file is released under the GPL.
+ */
+#ifndef _LINUX_DM_BTREE_H
+#define _LINUX_DM_BTREE_H
+
+#include "dm-block-manager.h"
+
+struct dm_transaction_manager;
+
+/*----------------------------------------------------------------*/
+
+/*
+ * Annotations used to check on-disk metadata is handled as little-endian.
+ */
+#ifdef __CHECKER__
+# define __dm_written_to_disk(x) __releases(x)
+# define __dm_reads_from_disk(x) __acquires(x)
+# define __dm_bless_for_disk(x) __acquire(x)
+# define __dm_unbless_for_disk(x) __release(x)
+#else
+# define __dm_written_to_disk(x)
+# define __dm_reads_from_disk(x)
+# define __dm_bless_for_disk(x)
+# define __dm_unbless_for_disk(x)
+#endif
+
+/*----------------------------------------------------------------*/
+
+/*
+ * Manipulates hierarchical B+ trees with 64-bit keys and arbitrary-sized
+ * values.
+ */
+
+/*
+ * Information about the values stored within the btree.
+ */
+struct dm_btree_value_type {
+ void *context;
+
+ /*
+ * The size in bytes of each value.
+ */
+ uint32_t size;
+
+ /*
+ * Any of these methods can be safely set to NULL if you do not
+ * need the corresponding feature.
+ */
+
+ /*
+ * The btree is making a duplicate of the value, for instance
+ * because previously-shared btree nodes have now diverged.
+ * @value argument is the new copy that the copy function may modify.
+ * (Probably it just wants to increment a reference count
+ * somewhere.) This method is _not_ called for insertion of a new
+ * value: It is assumed the ref count is already 1.
+ */
+ void (*inc)(void *context, const void *value);
+
+ /*
+ * This value is being deleted. The btree takes care of freeing
+ * the memory pointed to by @value. Often the del function just
+ * needs to decrement a reference count somewhere.
+ */
+ void (*dec)(void *context, const void *value);
+
+ /*
+ * A test for equality between two values. When a value is
+ * overwritten with a new one, the old one has the dec method
+ * called _unless_ the new and old value are deemed equal.
+ */
+ int (*equal)(void *context, const void *value1, const void *value2);
+};
+
+/*
+ * The shape and contents of a btree.
+ */
+struct dm_btree_info {
+ struct dm_transaction_manager *tm;
+
+ /*
+ * Number of nested btrees. (Not the depth of a single tree.)
+ */
+ unsigned levels;
+ struct dm_btree_value_type value_type;
+};
+
+/*
+ * Set up an empty tree. O(1).
+ */
+int dm_btree_empty(struct dm_btree_info *info, dm_block_t *root);
+
+/*
+ * Delete a tree. O(n) - this is the slow one! It can also block, so
+ * please don't call it on an IO path.
+ */
+int dm_btree_del(struct dm_btree_info *info, dm_block_t root);
+
+/*
+ * All the lookup functions return -ENODATA if the key cannot be found.
+ */
+
+/*
+ * Tries to find a key that matches exactly. O(ln(n))
+ */
+int dm_btree_lookup(struct dm_btree_info *info, dm_block_t root,
+ uint64_t *keys, void *value_le);
+
+/*
+ * Insertion (or overwrite an existing value). O(ln(n))
+ */
+int dm_btree_insert(struct dm_btree_info *info, dm_block_t root,
+ uint64_t *keys, void *value, dm_block_t *new_root)
+ __dm_written_to_disk(value);
+
+/*
+ * A variant of insert that indicates whether it actually inserted or just
+ * overwrote. Useful if you're keeping track of the number of entries in a
+ * tree.
+ */
+int dm_btree_insert_notify(struct dm_btree_info *info, dm_block_t root,
+ uint64_t *keys, void *value, dm_block_t *new_root,
+ int *inserted)
+ __dm_written_to_disk(value);
+
+/*
+ * Remove a key if present. This doesn't remove empty sub trees. Normally
+ * subtrees represent a separate entity, like a snapshot map, so this is
+ * correct behaviour. O(ln(n)).
+ */
+int dm_btree_remove(struct dm_btree_info *info, dm_block_t root,
+ uint64_t *keys, dm_block_t *new_root);
+
+/*
+ * Returns < 0 on failure. Otherwise the number of key entries that have
+ * been filled out. Remember trees can have zero entries, and as such have
+ * no lowest key.
+ */
+int dm_btree_find_lowest_key(struct dm_btree_info *info, dm_block_t root,
+ uint64_t *result_keys);
+
+/*
+ * Returns < 0 on failure. Otherwise the number of key entries that have
+ * been filled out. Remember trees can have zero entries, and as such have
+ * no highest key.
+ */
+int dm_btree_find_highest_key(struct dm_btree_info *info, dm_block_t root,
+ uint64_t *result_keys);
+
+/*
+ * Iterate through the a btree, calling fn() on each entry.
+ * It only works for single level trees and is internally recursive, so
+ * monitor stack usage carefully.
+ */
+int dm_btree_walk(struct dm_btree_info *info, dm_block_t root,
+ int (*fn)(void *context, uint64_t *keys, void *leaf),
+ void *context);
+
+#endif /* _LINUX_DM_BTREE_H */
diff --git a/drivers/md/persistent-data/dm-persistent-data-internal.h b/drivers/md/persistent-data/dm-persistent-data-internal.h
new file mode 100644
index 000000000..c49e26fff
--- /dev/null
+++ b/drivers/md/persistent-data/dm-persistent-data-internal.h
@@ -0,0 +1,19 @@
+/*
+ * Copyright (C) 2011 Red Hat, Inc.
+ *
+ * This file is released under the GPL.
+ */
+
+#ifndef _DM_PERSISTENT_DATA_INTERNAL_H
+#define _DM_PERSISTENT_DATA_INTERNAL_H
+
+#include "dm-block-manager.h"
+
+static inline unsigned dm_hash_block(dm_block_t b, unsigned hash_mask)
+{
+ const unsigned BIG_PRIME = 4294967291UL;
+
+ return (((unsigned) b) * BIG_PRIME) & hash_mask;
+}
+
+#endif /* _PERSISTENT_DATA_INTERNAL_H */
diff --git a/drivers/md/persistent-data/dm-space-map-common.c b/drivers/md/persistent-data/dm-space-map-common.c
new file mode 100644
index 000000000..aacbe70c2
--- /dev/null
+++ b/drivers/md/persistent-data/dm-space-map-common.c
@@ -0,0 +1,751 @@
+/*
+ * Copyright (C) 2011 Red Hat, Inc.
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm-space-map-common.h"
+#include "dm-transaction-manager.h"
+
+#include <linux/bitops.h>
+#include <linux/device-mapper.h>
+
+#define DM_MSG_PREFIX "space map common"
+
+/*----------------------------------------------------------------*/
+
+/*
+ * Index validator.
+ */
+#define INDEX_CSUM_XOR 160478
+
+static void index_prepare_for_write(struct dm_block_validator *v,
+ struct dm_block *b,
+ size_t block_size)
+{
+ struct disk_metadata_index *mi_le = dm_block_data(b);
+
+ mi_le->blocknr = cpu_to_le64(dm_block_location(b));
+ mi_le->csum = cpu_to_le32(dm_bm_checksum(&mi_le->padding,
+ block_size - sizeof(__le32),
+ INDEX_CSUM_XOR));
+}
+
+static int index_check(struct dm_block_validator *v,
+ struct dm_block *b,
+ size_t block_size)
+{
+ struct disk_metadata_index *mi_le = dm_block_data(b);
+ __le32 csum_disk;
+
+ if (dm_block_location(b) != le64_to_cpu(mi_le->blocknr)) {
+ DMERR_LIMIT("index_check failed: blocknr %llu != wanted %llu",
+ le64_to_cpu(mi_le->blocknr), dm_block_location(b));
+ return -ENOTBLK;
+ }
+
+ csum_disk = cpu_to_le32(dm_bm_checksum(&mi_le->padding,
+ block_size - sizeof(__le32),
+ INDEX_CSUM_XOR));
+ if (csum_disk != mi_le->csum) {
+ DMERR_LIMIT("index_check failed: csum %u != wanted %u",
+ le32_to_cpu(csum_disk), le32_to_cpu(mi_le->csum));
+ return -EILSEQ;
+ }
+
+ return 0;
+}
+
+static struct dm_block_validator index_validator = {
+ .name = "index",
+ .prepare_for_write = index_prepare_for_write,
+ .check = index_check
+};
+
+/*----------------------------------------------------------------*/
+
+/*
+ * Bitmap validator
+ */
+#define BITMAP_CSUM_XOR 240779
+
+static void bitmap_prepare_for_write(struct dm_block_validator *v,
+ struct dm_block *b,
+ size_t block_size)
+{
+ struct disk_bitmap_header *disk_header = dm_block_data(b);
+
+ disk_header->blocknr = cpu_to_le64(dm_block_location(b));
+ disk_header->csum = cpu_to_le32(dm_bm_checksum(&disk_header->not_used,
+ block_size - sizeof(__le32),
+ BITMAP_CSUM_XOR));
+}
+
+static int bitmap_check(struct dm_block_validator *v,
+ struct dm_block *b,
+ size_t block_size)
+{
+ struct disk_bitmap_header *disk_header = dm_block_data(b);
+ __le32 csum_disk;
+
+ if (dm_block_location(b) != le64_to_cpu(disk_header->blocknr)) {
+ DMERR_LIMIT("bitmap check failed: blocknr %llu != wanted %llu",
+ le64_to_cpu(disk_header->blocknr), dm_block_location(b));
+ return -ENOTBLK;
+ }
+
+ csum_disk = cpu_to_le32(dm_bm_checksum(&disk_header->not_used,
+ block_size - sizeof(__le32),
+ BITMAP_CSUM_XOR));
+ if (csum_disk != disk_header->csum) {
+ DMERR_LIMIT("bitmap check failed: csum %u != wanted %u",
+ le32_to_cpu(csum_disk), le32_to_cpu(disk_header->csum));
+ return -EILSEQ;
+ }
+
+ return 0;
+}
+
+static struct dm_block_validator dm_sm_bitmap_validator = {
+ .name = "sm_bitmap",
+ .prepare_for_write = bitmap_prepare_for_write,
+ .check = bitmap_check
+};
+
+/*----------------------------------------------------------------*/
+
+#define ENTRIES_PER_WORD 32
+#define ENTRIES_SHIFT 5
+
+static void *dm_bitmap_data(struct dm_block *b)
+{
+ return dm_block_data(b) + sizeof(struct disk_bitmap_header);
+}
+
+#define WORD_MASK_HIGH 0xAAAAAAAAAAAAAAAAULL
+
+static unsigned bitmap_word_used(void *addr, unsigned b)
+{
+ __le64 *words_le = addr;
+ __le64 *w_le = words_le + (b >> ENTRIES_SHIFT);
+
+ uint64_t bits = le64_to_cpu(*w_le);
+ uint64_t mask = (bits + WORD_MASK_HIGH + 1) & WORD_MASK_HIGH;
+
+ return !(~bits & mask);
+}
+
+static unsigned sm_lookup_bitmap(void *addr, unsigned b)
+{
+ __le64 *words_le = addr;
+ __le64 *w_le = words_le + (b >> ENTRIES_SHIFT);
+ unsigned hi, lo;
+
+ b = (b & (ENTRIES_PER_WORD - 1)) << 1;
+ hi = !!test_bit_le(b, (void *) w_le);
+ lo = !!test_bit_le(b + 1, (void *) w_le);
+ return (hi << 1) | lo;
+}
+
+static void sm_set_bitmap(void *addr, unsigned b, unsigned val)
+{
+ __le64 *words_le = addr;
+ __le64 *w_le = words_le + (b >> ENTRIES_SHIFT);
+
+ b = (b & (ENTRIES_PER_WORD - 1)) << 1;
+
+ if (val & 2)
+ __set_bit_le(b, (void *) w_le);
+ else
+ __clear_bit_le(b, (void *) w_le);
+
+ if (val & 1)
+ __set_bit_le(b + 1, (void *) w_le);
+ else
+ __clear_bit_le(b + 1, (void *) w_le);
+}
+
+static int sm_find_free(void *addr, unsigned begin, unsigned end,
+ unsigned *result)
+{
+ while (begin < end) {
+ if (!(begin & (ENTRIES_PER_WORD - 1)) &&
+ bitmap_word_used(addr, begin)) {
+ begin += ENTRIES_PER_WORD;
+ continue;
+ }
+
+ if (!sm_lookup_bitmap(addr, begin)) {
+ *result = begin;
+ return 0;
+ }
+
+ begin++;
+ }
+
+ return -ENOSPC;
+}
+
+/*----------------------------------------------------------------*/
+
+static int sm_ll_init(struct ll_disk *ll, struct dm_transaction_manager *tm)
+{
+ ll->tm = tm;
+
+ ll->bitmap_info.tm = tm;
+ ll->bitmap_info.levels = 1;
+
+ /*
+ * Because the new bitmap blocks are created via a shadow
+ * operation, the old entry has already had its reference count
+ * decremented and we don't need the btree to do any bookkeeping.
+ */
+ ll->bitmap_info.value_type.size = sizeof(struct disk_index_entry);
+ ll->bitmap_info.value_type.inc = NULL;
+ ll->bitmap_info.value_type.dec = NULL;
+ ll->bitmap_info.value_type.equal = NULL;
+
+ ll->ref_count_info.tm = tm;
+ ll->ref_count_info.levels = 1;
+ ll->ref_count_info.value_type.size = sizeof(uint32_t);
+ ll->ref_count_info.value_type.inc = NULL;
+ ll->ref_count_info.value_type.dec = NULL;
+ ll->ref_count_info.value_type.equal = NULL;
+
+ ll->block_size = dm_bm_block_size(dm_tm_get_bm(tm));
+
+ if (ll->block_size > (1 << 30)) {
+ DMERR("block size too big to hold bitmaps");
+ return -EINVAL;
+ }
+
+ ll->entries_per_block = (ll->block_size - sizeof(struct disk_bitmap_header)) *
+ ENTRIES_PER_BYTE;
+ ll->nr_blocks = 0;
+ ll->bitmap_root = 0;
+ ll->ref_count_root = 0;
+ ll->bitmap_index_changed = false;
+
+ return 0;
+}
+
+int sm_ll_extend(struct ll_disk *ll, dm_block_t extra_blocks)
+{
+ int r;
+ dm_block_t i, nr_blocks, nr_indexes;
+ unsigned old_blocks, blocks;
+
+ nr_blocks = ll->nr_blocks + extra_blocks;
+ old_blocks = dm_sector_div_up(ll->nr_blocks, ll->entries_per_block);
+ blocks = dm_sector_div_up(nr_blocks, ll->entries_per_block);
+
+ nr_indexes = dm_sector_div_up(nr_blocks, ll->entries_per_block);
+ if (nr_indexes > ll->max_entries(ll)) {
+ DMERR("space map too large");
+ return -EINVAL;
+ }
+
+ /*
+ * We need to set this before the dm_tm_new_block() call below.
+ */
+ ll->nr_blocks = nr_blocks;
+ for (i = old_blocks; i < blocks; i++) {
+ struct dm_block *b;
+ struct disk_index_entry idx;
+
+ r = dm_tm_new_block(ll->tm, &dm_sm_bitmap_validator, &b);
+ if (r < 0)
+ return r;
+
+ idx.blocknr = cpu_to_le64(dm_block_location(b));
+
+ r = dm_tm_unlock(ll->tm, b);
+ if (r < 0)
+ return r;
+
+ idx.nr_free = cpu_to_le32(ll->entries_per_block);
+ idx.none_free_before = 0;
+
+ r = ll->save_ie(ll, i, &idx);
+ if (r < 0)
+ return r;
+ }
+
+ return 0;
+}
+
+int sm_ll_lookup_bitmap(struct ll_disk *ll, dm_block_t b, uint32_t *result)
+{
+ int r;
+ dm_block_t index = b;
+ struct disk_index_entry ie_disk;
+ struct dm_block *blk;
+
+ b = do_div(index, ll->entries_per_block);
+ r = ll->load_ie(ll, index, &ie_disk);
+ if (r < 0)
+ return r;
+
+ r = dm_tm_read_lock(ll->tm, le64_to_cpu(ie_disk.blocknr),
+ &dm_sm_bitmap_validator, &blk);
+ if (r < 0)
+ return r;
+
+ *result = sm_lookup_bitmap(dm_bitmap_data(blk), b);
+
+ return dm_tm_unlock(ll->tm, blk);
+}
+
+static int sm_ll_lookup_big_ref_count(struct ll_disk *ll, dm_block_t b,
+ uint32_t *result)
+{
+ __le32 le_rc;
+ int r;
+
+ r = dm_btree_lookup(&ll->ref_count_info, ll->ref_count_root, &b, &le_rc);
+ if (r < 0)
+ return r;
+
+ *result = le32_to_cpu(le_rc);
+
+ return r;
+}
+
+int sm_ll_lookup(struct ll_disk *ll, dm_block_t b, uint32_t *result)
+{
+ int r = sm_ll_lookup_bitmap(ll, b, result);
+
+ if (r)
+ return r;
+
+ if (*result != 3)
+ return r;
+
+ return sm_ll_lookup_big_ref_count(ll, b, result);
+}
+
+int sm_ll_find_free_block(struct ll_disk *ll, dm_block_t begin,
+ dm_block_t end, dm_block_t *result)
+{
+ int r;
+ struct disk_index_entry ie_disk;
+ dm_block_t i, index_begin = begin;
+ dm_block_t index_end = dm_sector_div_up(end, ll->entries_per_block);
+
+ /*
+ * FIXME: Use shifts
+ */
+ begin = do_div(index_begin, ll->entries_per_block);
+ end = do_div(end, ll->entries_per_block);
+
+ for (i = index_begin; i < index_end; i++, begin = 0) {
+ struct dm_block *blk;
+ unsigned position;
+ uint32_t bit_end;
+
+ r = ll->load_ie(ll, i, &ie_disk);
+ if (r < 0)
+ return r;
+
+ if (le32_to_cpu(ie_disk.nr_free) == 0)
+ continue;
+
+ r = dm_tm_read_lock(ll->tm, le64_to_cpu(ie_disk.blocknr),
+ &dm_sm_bitmap_validator, &blk);
+ if (r < 0)
+ return r;
+
+ bit_end = (i == index_end - 1) ? end : ll->entries_per_block;
+
+ r = sm_find_free(dm_bitmap_data(blk),
+ max_t(unsigned, begin, le32_to_cpu(ie_disk.none_free_before)),
+ bit_end, &position);
+ if (r == -ENOSPC) {
+ /*
+ * This might happen because we started searching
+ * part way through the bitmap.
+ */
+ dm_tm_unlock(ll->tm, blk);
+ continue;
+
+ } else if (r < 0) {
+ dm_tm_unlock(ll->tm, blk);
+ return r;
+ }
+
+ r = dm_tm_unlock(ll->tm, blk);
+ if (r < 0)
+ return r;
+
+ *result = i * ll->entries_per_block + (dm_block_t) position;
+ return 0;
+ }
+
+ return -ENOSPC;
+}
+
+static int sm_ll_mutate(struct ll_disk *ll, dm_block_t b,
+ int (*mutator)(void *context, uint32_t old, uint32_t *new),
+ void *context, enum allocation_event *ev)
+{
+ int r;
+ uint32_t bit, old, ref_count;
+ struct dm_block *nb;
+ dm_block_t index = b;
+ struct disk_index_entry ie_disk;
+ void *bm_le;
+ int inc;
+
+ bit = do_div(index, ll->entries_per_block);
+ r = ll->load_ie(ll, index, &ie_disk);
+ if (r < 0)
+ return r;
+
+ r = dm_tm_shadow_block(ll->tm, le64_to_cpu(ie_disk.blocknr),
+ &dm_sm_bitmap_validator, &nb, &inc);
+ if (r < 0) {
+ DMERR("dm_tm_shadow_block() failed");
+ return r;
+ }
+ ie_disk.blocknr = cpu_to_le64(dm_block_location(nb));
+
+ bm_le = dm_bitmap_data(nb);
+ old = sm_lookup_bitmap(bm_le, bit);
+
+ if (old > 2) {
+ r = sm_ll_lookup_big_ref_count(ll, b, &old);
+ if (r < 0) {
+ dm_tm_unlock(ll->tm, nb);
+ return r;
+ }
+ }
+
+ r = mutator(context, old, &ref_count);
+ if (r) {
+ dm_tm_unlock(ll->tm, nb);
+ return r;
+ }
+
+ if (ref_count <= 2) {
+ sm_set_bitmap(bm_le, bit, ref_count);
+
+ r = dm_tm_unlock(ll->tm, nb);
+ if (r < 0)
+ return r;
+
+ if (old > 2) {
+ r = dm_btree_remove(&ll->ref_count_info,
+ ll->ref_count_root,
+ &b, &ll->ref_count_root);
+ if (r)
+ return r;
+ }
+
+ } else {
+ __le32 le_rc = cpu_to_le32(ref_count);
+
+ sm_set_bitmap(bm_le, bit, 3);
+ r = dm_tm_unlock(ll->tm, nb);
+ if (r < 0)
+ return r;
+
+ __dm_bless_for_disk(&le_rc);
+ r = dm_btree_insert(&ll->ref_count_info, ll->ref_count_root,
+ &b, &le_rc, &ll->ref_count_root);
+ if (r < 0) {
+ DMERR("ref count insert failed");
+ return r;
+ }
+ }
+
+ if (ref_count && !old) {
+ *ev = SM_ALLOC;
+ ll->nr_allocated++;
+ le32_add_cpu(&ie_disk.nr_free, -1);
+ if (le32_to_cpu(ie_disk.none_free_before) == bit)
+ ie_disk.none_free_before = cpu_to_le32(bit + 1);
+
+ } else if (old && !ref_count) {
+ *ev = SM_FREE;
+ ll->nr_allocated--;
+ le32_add_cpu(&ie_disk.nr_free, 1);
+ ie_disk.none_free_before = cpu_to_le32(min(le32_to_cpu(ie_disk.none_free_before), bit));
+ }
+
+ return ll->save_ie(ll, index, &ie_disk);
+}
+
+static int set_ref_count(void *context, uint32_t old, uint32_t *new)
+{
+ *new = *((uint32_t *) context);
+ return 0;
+}
+
+int sm_ll_insert(struct ll_disk *ll, dm_block_t b,
+ uint32_t ref_count, enum allocation_event *ev)
+{
+ return sm_ll_mutate(ll, b, set_ref_count, &ref_count, ev);
+}
+
+static int inc_ref_count(void *context, uint32_t old, uint32_t *new)
+{
+ *new = old + 1;
+ return 0;
+}
+
+int sm_ll_inc(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev)
+{
+ return sm_ll_mutate(ll, b, inc_ref_count, NULL, ev);
+}
+
+static int dec_ref_count(void *context, uint32_t old, uint32_t *new)
+{
+ if (!old) {
+ DMERR_LIMIT("unable to decrement a reference count below 0");
+ return -EINVAL;
+ }
+
+ *new = old - 1;
+ return 0;
+}
+
+int sm_ll_dec(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev)
+{
+ return sm_ll_mutate(ll, b, dec_ref_count, NULL, ev);
+}
+
+int sm_ll_commit(struct ll_disk *ll)
+{
+ int r = 0;
+
+ if (ll->bitmap_index_changed) {
+ r = ll->commit(ll);
+ if (!r)
+ ll->bitmap_index_changed = false;
+ }
+
+ return r;
+}
+
+/*----------------------------------------------------------------*/
+
+static int metadata_ll_load_ie(struct ll_disk *ll, dm_block_t index,
+ struct disk_index_entry *ie)
+{
+ memcpy(ie, ll->mi_le.index + index, sizeof(*ie));
+ return 0;
+}
+
+static int metadata_ll_save_ie(struct ll_disk *ll, dm_block_t index,
+ struct disk_index_entry *ie)
+{
+ ll->bitmap_index_changed = true;
+ memcpy(ll->mi_le.index + index, ie, sizeof(*ie));
+ return 0;
+}
+
+static int metadata_ll_init_index(struct ll_disk *ll)
+{
+ int r;
+ struct dm_block *b;
+
+ r = dm_tm_new_block(ll->tm, &index_validator, &b);
+ if (r < 0)
+ return r;
+
+ memcpy(dm_block_data(b), &ll->mi_le, sizeof(ll->mi_le));
+ ll->bitmap_root = dm_block_location(b);
+
+ return dm_tm_unlock(ll->tm, b);
+}
+
+static int metadata_ll_open(struct ll_disk *ll)
+{
+ int r;
+ struct dm_block *block;
+
+ r = dm_tm_read_lock(ll->tm, ll->bitmap_root,
+ &index_validator, &block);
+ if (r)
+ return r;
+
+ memcpy(&ll->mi_le, dm_block_data(block), sizeof(ll->mi_le));
+ return dm_tm_unlock(ll->tm, block);
+}
+
+static dm_block_t metadata_ll_max_entries(struct ll_disk *ll)
+{
+ return MAX_METADATA_BITMAPS;
+}
+
+static int metadata_ll_commit(struct ll_disk *ll)
+{
+ int r, inc;
+ struct dm_block *b;
+
+ r = dm_tm_shadow_block(ll->tm, ll->bitmap_root, &index_validator, &b, &inc);
+ if (r)
+ return r;
+
+ memcpy(dm_block_data(b), &ll->mi_le, sizeof(ll->mi_le));
+ ll->bitmap_root = dm_block_location(b);
+
+ return dm_tm_unlock(ll->tm, b);
+}
+
+int sm_ll_new_metadata(struct ll_disk *ll, struct dm_transaction_manager *tm)
+{
+ int r;
+
+ r = sm_ll_init(ll, tm);
+ if (r < 0)
+ return r;
+
+ ll->load_ie = metadata_ll_load_ie;
+ ll->save_ie = metadata_ll_save_ie;
+ ll->init_index = metadata_ll_init_index;
+ ll->open_index = metadata_ll_open;
+ ll->max_entries = metadata_ll_max_entries;
+ ll->commit = metadata_ll_commit;
+
+ ll->nr_blocks = 0;
+ ll->nr_allocated = 0;
+
+ r = ll->init_index(ll);
+ if (r < 0)
+ return r;
+
+ r = dm_btree_empty(&ll->ref_count_info, &ll->ref_count_root);
+ if (r < 0)
+ return r;
+
+ return 0;
+}
+
+int sm_ll_open_metadata(struct ll_disk *ll, struct dm_transaction_manager *tm,
+ void *root_le, size_t len)
+{
+ int r;
+ struct disk_sm_root *smr = root_le;
+
+ if (len < sizeof(struct disk_sm_root)) {
+ DMERR("sm_metadata root too small");
+ return -ENOMEM;
+ }
+
+ r = sm_ll_init(ll, tm);
+ if (r < 0)
+ return r;
+
+ ll->load_ie = metadata_ll_load_ie;
+ ll->save_ie = metadata_ll_save_ie;
+ ll->init_index = metadata_ll_init_index;
+ ll->open_index = metadata_ll_open;
+ ll->max_entries = metadata_ll_max_entries;
+ ll->commit = metadata_ll_commit;
+
+ ll->nr_blocks = le64_to_cpu(smr->nr_blocks);
+ ll->nr_allocated = le64_to_cpu(smr->nr_allocated);
+ ll->bitmap_root = le64_to_cpu(smr->bitmap_root);
+ ll->ref_count_root = le64_to_cpu(smr->ref_count_root);
+
+ return ll->open_index(ll);
+}
+
+/*----------------------------------------------------------------*/
+
+static int disk_ll_load_ie(struct ll_disk *ll, dm_block_t index,
+ struct disk_index_entry *ie)
+{
+ return dm_btree_lookup(&ll->bitmap_info, ll->bitmap_root, &index, ie);
+}
+
+static int disk_ll_save_ie(struct ll_disk *ll, dm_block_t index,
+ struct disk_index_entry *ie)
+{
+ __dm_bless_for_disk(ie);
+ return dm_btree_insert(&ll->bitmap_info, ll->bitmap_root,
+ &index, ie, &ll->bitmap_root);
+}
+
+static int disk_ll_init_index(struct ll_disk *ll)
+{
+ return dm_btree_empty(&ll->bitmap_info, &ll->bitmap_root);
+}
+
+static int disk_ll_open(struct ll_disk *ll)
+{
+ /* nothing to do */
+ return 0;
+}
+
+static dm_block_t disk_ll_max_entries(struct ll_disk *ll)
+{
+ return -1ULL;
+}
+
+static int disk_ll_commit(struct ll_disk *ll)
+{
+ return 0;
+}
+
+int sm_ll_new_disk(struct ll_disk *ll, struct dm_transaction_manager *tm)
+{
+ int r;
+
+ r = sm_ll_init(ll, tm);
+ if (r < 0)
+ return r;
+
+ ll->load_ie = disk_ll_load_ie;
+ ll->save_ie = disk_ll_save_ie;
+ ll->init_index = disk_ll_init_index;
+ ll->open_index = disk_ll_open;
+ ll->max_entries = disk_ll_max_entries;
+ ll->commit = disk_ll_commit;
+
+ ll->nr_blocks = 0;
+ ll->nr_allocated = 0;
+
+ r = ll->init_index(ll);
+ if (r < 0)
+ return r;
+
+ r = dm_btree_empty(&ll->ref_count_info, &ll->ref_count_root);
+ if (r < 0)
+ return r;
+
+ return 0;
+}
+
+int sm_ll_open_disk(struct ll_disk *ll, struct dm_transaction_manager *tm,
+ void *root_le, size_t len)
+{
+ int r;
+ struct disk_sm_root *smr = root_le;
+
+ if (len < sizeof(struct disk_sm_root)) {
+ DMERR("sm_metadata root too small");
+ return -ENOMEM;
+ }
+
+ r = sm_ll_init(ll, tm);
+ if (r < 0)
+ return r;
+
+ ll->load_ie = disk_ll_load_ie;
+ ll->save_ie = disk_ll_save_ie;
+ ll->init_index = disk_ll_init_index;
+ ll->open_index = disk_ll_open;
+ ll->max_entries = disk_ll_max_entries;
+ ll->commit = disk_ll_commit;
+
+ ll->nr_blocks = le64_to_cpu(smr->nr_blocks);
+ ll->nr_allocated = le64_to_cpu(smr->nr_allocated);
+ ll->bitmap_root = le64_to_cpu(smr->bitmap_root);
+ ll->ref_count_root = le64_to_cpu(smr->ref_count_root);
+
+ return ll->open_index(ll);
+}
+
+/*----------------------------------------------------------------*/
diff --git a/drivers/md/persistent-data/dm-space-map-common.h b/drivers/md/persistent-data/dm-space-map-common.h
new file mode 100644
index 000000000..b3078d5ed
--- /dev/null
+++ b/drivers/md/persistent-data/dm-space-map-common.h
@@ -0,0 +1,127 @@
+/*
+ * Copyright (C) 2011 Red Hat, Inc.
+ *
+ * This file is released under the GPL.
+ */
+
+#ifndef DM_SPACE_MAP_COMMON_H
+#define DM_SPACE_MAP_COMMON_H
+
+#include "dm-btree.h"
+
+/*----------------------------------------------------------------*/
+
+/*
+ * Low level disk format
+ *
+ * Bitmap btree
+ * ------------
+ *
+ * Each value stored in the btree is an index_entry. This points to a
+ * block that is used as a bitmap. Within the bitmap hold 2 bits per
+ * entry, which represent UNUSED = 0, REF_COUNT = 1, REF_COUNT = 2 and
+ * REF_COUNT = many.
+ *
+ * Refcount btree
+ * --------------
+ *
+ * Any entry that has a ref count higher than 2 gets entered in the ref
+ * count tree. The leaf values for this tree is the 32-bit ref count.
+ */
+
+struct disk_index_entry {
+ __le64 blocknr;
+ __le32 nr_free;
+ __le32 none_free_before;
+} __packed;
+
+
+#define MAX_METADATA_BITMAPS 255
+struct disk_metadata_index {
+ __le32 csum;
+ __le32 padding;
+ __le64 blocknr;
+
+ struct disk_index_entry index[MAX_METADATA_BITMAPS];
+} __packed;
+
+struct ll_disk;
+
+typedef int (*load_ie_fn)(struct ll_disk *ll, dm_block_t index, struct disk_index_entry *result);
+typedef int (*save_ie_fn)(struct ll_disk *ll, dm_block_t index, struct disk_index_entry *ie);
+typedef int (*init_index_fn)(struct ll_disk *ll);
+typedef int (*open_index_fn)(struct ll_disk *ll);
+typedef dm_block_t (*max_index_entries_fn)(struct ll_disk *ll);
+typedef int (*commit_fn)(struct ll_disk *ll);
+
+struct ll_disk {
+ struct dm_transaction_manager *tm;
+ struct dm_btree_info bitmap_info;
+ struct dm_btree_info ref_count_info;
+
+ uint32_t block_size;
+ uint32_t entries_per_block;
+ dm_block_t nr_blocks;
+ dm_block_t nr_allocated;
+
+ /*
+ * bitmap_root may be a btree root or a simple index.
+ */
+ dm_block_t bitmap_root;
+
+ dm_block_t ref_count_root;
+
+ struct disk_metadata_index mi_le;
+ load_ie_fn load_ie;
+ save_ie_fn save_ie;
+ init_index_fn init_index;
+ open_index_fn open_index;
+ max_index_entries_fn max_entries;
+ commit_fn commit;
+ bool bitmap_index_changed:1;
+};
+
+struct disk_sm_root {
+ __le64 nr_blocks;
+ __le64 nr_allocated;
+ __le64 bitmap_root;
+ __le64 ref_count_root;
+} __packed;
+
+#define ENTRIES_PER_BYTE 4
+
+struct disk_bitmap_header {
+ __le32 csum;
+ __le32 not_used;
+ __le64 blocknr;
+} __packed;
+
+enum allocation_event {
+ SM_NONE,
+ SM_ALLOC,
+ SM_FREE,
+};
+
+/*----------------------------------------------------------------*/
+
+int sm_ll_extend(struct ll_disk *ll, dm_block_t extra_blocks);
+int sm_ll_lookup_bitmap(struct ll_disk *ll, dm_block_t b, uint32_t *result);
+int sm_ll_lookup(struct ll_disk *ll, dm_block_t b, uint32_t *result);
+int sm_ll_find_free_block(struct ll_disk *ll, dm_block_t begin,
+ dm_block_t end, dm_block_t *result);
+int sm_ll_insert(struct ll_disk *ll, dm_block_t b, uint32_t ref_count, enum allocation_event *ev);
+int sm_ll_inc(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev);
+int sm_ll_dec(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev);
+int sm_ll_commit(struct ll_disk *ll);
+
+int sm_ll_new_metadata(struct ll_disk *ll, struct dm_transaction_manager *tm);
+int sm_ll_open_metadata(struct ll_disk *ll, struct dm_transaction_manager *tm,
+ void *root_le, size_t len);
+
+int sm_ll_new_disk(struct ll_disk *ll, struct dm_transaction_manager *tm);
+int sm_ll_open_disk(struct ll_disk *ll, struct dm_transaction_manager *tm,
+ void *root_le, size_t len);
+
+/*----------------------------------------------------------------*/
+
+#endif /* DM_SPACE_MAP_COMMON_H */
diff --git a/drivers/md/persistent-data/dm-space-map-disk.c b/drivers/md/persistent-data/dm-space-map-disk.c
new file mode 100644
index 000000000..ebb280a14
--- /dev/null
+++ b/drivers/md/persistent-data/dm-space-map-disk.c
@@ -0,0 +1,305 @@
+/*
+ * Copyright (C) 2011 Red Hat, Inc.
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm-space-map-common.h"
+#include "dm-space-map-disk.h"
+#include "dm-space-map.h"
+#include "dm-transaction-manager.h"
+
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+#include <linux/device-mapper.h>
+
+#define DM_MSG_PREFIX "space map disk"
+
+/*----------------------------------------------------------------*/
+
+/*
+ * Space map interface.
+ */
+struct sm_disk {
+ struct dm_space_map sm;
+
+ struct ll_disk ll;
+ struct ll_disk old_ll;
+
+ dm_block_t begin;
+ dm_block_t nr_allocated_this_transaction;
+};
+
+static void sm_disk_destroy(struct dm_space_map *sm)
+{
+ struct sm_disk *smd = container_of(sm, struct sm_disk, sm);
+
+ kfree(smd);
+}
+
+static int sm_disk_extend(struct dm_space_map *sm, dm_block_t extra_blocks)
+{
+ struct sm_disk *smd = container_of(sm, struct sm_disk, sm);
+
+ return sm_ll_extend(&smd->ll, extra_blocks);
+}
+
+static int sm_disk_get_nr_blocks(struct dm_space_map *sm, dm_block_t *count)
+{
+ struct sm_disk *smd = container_of(sm, struct sm_disk, sm);
+ *count = smd->old_ll.nr_blocks;
+
+ return 0;
+}
+
+static int sm_disk_get_nr_free(struct dm_space_map *sm, dm_block_t *count)
+{
+ struct sm_disk *smd = container_of(sm, struct sm_disk, sm);
+ *count = (smd->old_ll.nr_blocks - smd->old_ll.nr_allocated) - smd->nr_allocated_this_transaction;
+
+ return 0;
+}
+
+static int sm_disk_get_count(struct dm_space_map *sm, dm_block_t b,
+ uint32_t *result)
+{
+ struct sm_disk *smd = container_of(sm, struct sm_disk, sm);
+ return sm_ll_lookup(&smd->ll, b, result);
+}
+
+static int sm_disk_count_is_more_than_one(struct dm_space_map *sm, dm_block_t b,
+ int *result)
+{
+ int r;
+ uint32_t count;
+
+ r = sm_disk_get_count(sm, b, &count);
+ if (r)
+ return r;
+
+ *result = count > 1;
+
+ return 0;
+}
+
+static int sm_disk_set_count(struct dm_space_map *sm, dm_block_t b,
+ uint32_t count)
+{
+ int r;
+ uint32_t old_count;
+ enum allocation_event ev;
+ struct sm_disk *smd = container_of(sm, struct sm_disk, sm);
+
+ r = sm_ll_insert(&smd->ll, b, count, &ev);
+ if (!r) {
+ switch (ev) {
+ case SM_NONE:
+ break;
+
+ case SM_ALLOC:
+ /*
+ * This _must_ be free in the prior transaction
+ * otherwise we've lost atomicity.
+ */
+ smd->nr_allocated_this_transaction++;
+ break;
+
+ case SM_FREE:
+ /*
+ * It's only free if it's also free in the last
+ * transaction.
+ */
+ r = sm_ll_lookup(&smd->old_ll, b, &old_count);
+ if (r)
+ return r;
+
+ if (!old_count)
+ smd->nr_allocated_this_transaction--;
+ break;
+ }
+ }
+
+ return r;
+}
+
+static int sm_disk_inc_block(struct dm_space_map *sm, dm_block_t b)
+{
+ int r;
+ enum allocation_event ev;
+ struct sm_disk *smd = container_of(sm, struct sm_disk, sm);
+
+ r = sm_ll_inc(&smd->ll, b, &ev);
+ if (!r && (ev == SM_ALLOC))
+ /*
+ * This _must_ be free in the prior transaction
+ * otherwise we've lost atomicity.
+ */
+ smd->nr_allocated_this_transaction++;
+
+ return r;
+}
+
+static int sm_disk_dec_block(struct dm_space_map *sm, dm_block_t b)
+{
+ enum allocation_event ev;
+ struct sm_disk *smd = container_of(sm, struct sm_disk, sm);
+
+ return sm_ll_dec(&smd->ll, b, &ev);
+}
+
+static int sm_disk_new_block(struct dm_space_map *sm, dm_block_t *b)
+{
+ int r;
+ enum allocation_event ev;
+ struct sm_disk *smd = container_of(sm, struct sm_disk, sm);
+
+ /* FIXME: we should loop round a couple of times */
+ r = sm_ll_find_free_block(&smd->old_ll, smd->begin, smd->old_ll.nr_blocks, b);
+ if (r)
+ return r;
+
+ smd->begin = *b + 1;
+ r = sm_ll_inc(&smd->ll, *b, &ev);
+ if (!r) {
+ BUG_ON(ev != SM_ALLOC);
+ smd->nr_allocated_this_transaction++;
+ }
+
+ return r;
+}
+
+static int sm_disk_commit(struct dm_space_map *sm)
+{
+ int r;
+ dm_block_t nr_free;
+ struct sm_disk *smd = container_of(sm, struct sm_disk, sm);
+
+ r = sm_disk_get_nr_free(sm, &nr_free);
+ if (r)
+ return r;
+
+ r = sm_ll_commit(&smd->ll);
+ if (r)
+ return r;
+
+ memcpy(&smd->old_ll, &smd->ll, sizeof(smd->old_ll));
+ smd->begin = 0;
+ smd->nr_allocated_this_transaction = 0;
+
+ r = sm_disk_get_nr_free(sm, &nr_free);
+ if (r)
+ return r;
+
+ return 0;
+}
+
+static int sm_disk_root_size(struct dm_space_map *sm, size_t *result)
+{
+ *result = sizeof(struct disk_sm_root);
+
+ return 0;
+}
+
+static int sm_disk_copy_root(struct dm_space_map *sm, void *where_le, size_t max)
+{
+ struct sm_disk *smd = container_of(sm, struct sm_disk, sm);
+ struct disk_sm_root root_le;
+
+ root_le.nr_blocks = cpu_to_le64(smd->ll.nr_blocks);
+ root_le.nr_allocated = cpu_to_le64(smd->ll.nr_allocated);
+ root_le.bitmap_root = cpu_to_le64(smd->ll.bitmap_root);
+ root_le.ref_count_root = cpu_to_le64(smd->ll.ref_count_root);
+
+ if (max < sizeof(root_le))
+ return -ENOSPC;
+
+ memcpy(where_le, &root_le, sizeof(root_le));
+
+ return 0;
+}
+
+/*----------------------------------------------------------------*/
+
+static struct dm_space_map ops = {
+ .destroy = sm_disk_destroy,
+ .extend = sm_disk_extend,
+ .get_nr_blocks = sm_disk_get_nr_blocks,
+ .get_nr_free = sm_disk_get_nr_free,
+ .get_count = sm_disk_get_count,
+ .count_is_more_than_one = sm_disk_count_is_more_than_one,
+ .set_count = sm_disk_set_count,
+ .inc_block = sm_disk_inc_block,
+ .dec_block = sm_disk_dec_block,
+ .new_block = sm_disk_new_block,
+ .commit = sm_disk_commit,
+ .root_size = sm_disk_root_size,
+ .copy_root = sm_disk_copy_root,
+ .register_threshold_callback = NULL
+};
+
+struct dm_space_map *dm_sm_disk_create(struct dm_transaction_manager *tm,
+ dm_block_t nr_blocks)
+{
+ int r;
+ struct sm_disk *smd;
+
+ smd = kmalloc(sizeof(*smd), GFP_KERNEL);
+ if (!smd)
+ return ERR_PTR(-ENOMEM);
+
+ smd->begin = 0;
+ smd->nr_allocated_this_transaction = 0;
+ memcpy(&smd->sm, &ops, sizeof(smd->sm));
+
+ r = sm_ll_new_disk(&smd->ll, tm);
+ if (r)
+ goto bad;
+
+ r = sm_ll_extend(&smd->ll, nr_blocks);
+ if (r)
+ goto bad;
+
+ r = sm_disk_commit(&smd->sm);
+ if (r)
+ goto bad;
+
+ return &smd->sm;
+
+bad:
+ kfree(smd);
+ return ERR_PTR(r);
+}
+EXPORT_SYMBOL_GPL(dm_sm_disk_create);
+
+struct dm_space_map *dm_sm_disk_open(struct dm_transaction_manager *tm,
+ void *root_le, size_t len)
+{
+ int r;
+ struct sm_disk *smd;
+
+ smd = kmalloc(sizeof(*smd), GFP_KERNEL);
+ if (!smd)
+ return ERR_PTR(-ENOMEM);
+
+ smd->begin = 0;
+ smd->nr_allocated_this_transaction = 0;
+ memcpy(&smd->sm, &ops, sizeof(smd->sm));
+
+ r = sm_ll_open_disk(&smd->ll, tm, root_le, len);
+ if (r)
+ goto bad;
+
+ r = sm_disk_commit(&smd->sm);
+ if (r)
+ goto bad;
+
+ return &smd->sm;
+
+bad:
+ kfree(smd);
+ return ERR_PTR(r);
+}
+EXPORT_SYMBOL_GPL(dm_sm_disk_open);
+
+/*----------------------------------------------------------------*/
diff --git a/drivers/md/persistent-data/dm-space-map-disk.h b/drivers/md/persistent-data/dm-space-map-disk.h
new file mode 100644
index 000000000..447a0a9a2
--- /dev/null
+++ b/drivers/md/persistent-data/dm-space-map-disk.h
@@ -0,0 +1,25 @@
+/*
+ * Copyright (C) 2011 Red Hat, Inc.
+ *
+ * This file is released under the GPL.
+ */
+
+#ifndef _LINUX_DM_SPACE_MAP_DISK_H
+#define _LINUX_DM_SPACE_MAP_DISK_H
+
+#include "dm-block-manager.h"
+
+struct dm_space_map;
+struct dm_transaction_manager;
+
+/*
+ * Unfortunately we have to use two-phase construction due to the cycle
+ * between the tm and sm.
+ */
+struct dm_space_map *dm_sm_disk_create(struct dm_transaction_manager *tm,
+ dm_block_t nr_blocks);
+
+struct dm_space_map *dm_sm_disk_open(struct dm_transaction_manager *tm,
+ void *root, size_t len);
+
+#endif /* _LINUX_DM_SPACE_MAP_DISK_H */
diff --git a/drivers/md/persistent-data/dm-space-map-metadata.c b/drivers/md/persistent-data/dm-space-map-metadata.c
new file mode 100644
index 000000000..53091295f
--- /dev/null
+++ b/drivers/md/persistent-data/dm-space-map-metadata.c
@@ -0,0 +1,818 @@
+/*
+ * Copyright (C) 2011 Red Hat, Inc.
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm-space-map.h"
+#include "dm-space-map-common.h"
+#include "dm-space-map-metadata.h"
+
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/device-mapper.h>
+
+#define DM_MSG_PREFIX "space map metadata"
+
+/*----------------------------------------------------------------*/
+
+/*
+ * An edge triggered threshold.
+ */
+struct threshold {
+ bool threshold_set;
+ bool value_set;
+ dm_block_t threshold;
+ dm_block_t current_value;
+ dm_sm_threshold_fn fn;
+ void *context;
+};
+
+static void threshold_init(struct threshold *t)
+{
+ t->threshold_set = false;
+ t->value_set = false;
+}
+
+static void set_threshold(struct threshold *t, dm_block_t value,
+ dm_sm_threshold_fn fn, void *context)
+{
+ t->threshold_set = true;
+ t->threshold = value;
+ t->fn = fn;
+ t->context = context;
+}
+
+static bool below_threshold(struct threshold *t, dm_block_t value)
+{
+ return t->threshold_set && value <= t->threshold;
+}
+
+static bool threshold_already_triggered(struct threshold *t)
+{
+ return t->value_set && below_threshold(t, t->current_value);
+}
+
+static void check_threshold(struct threshold *t, dm_block_t value)
+{
+ if (below_threshold(t, value) &&
+ !threshold_already_triggered(t))
+ t->fn(t->context);
+
+ t->value_set = true;
+ t->current_value = value;
+}
+
+/*----------------------------------------------------------------*/
+
+/*
+ * Space map interface.
+ *
+ * The low level disk format is written using the standard btree and
+ * transaction manager. This means that performing disk operations may
+ * cause us to recurse into the space map in order to allocate new blocks.
+ * For this reason we have a pool of pre-allocated blocks large enough to
+ * service any metadata_ll_disk operation.
+ */
+
+/*
+ * FIXME: we should calculate this based on the size of the device.
+ * Only the metadata space map needs this functionality.
+ */
+#define MAX_RECURSIVE_ALLOCATIONS 1024
+
+enum block_op_type {
+ BOP_INC,
+ BOP_DEC
+};
+
+struct block_op {
+ enum block_op_type type;
+ dm_block_t block;
+};
+
+struct bop_ring_buffer {
+ unsigned begin;
+ unsigned end;
+ struct block_op bops[MAX_RECURSIVE_ALLOCATIONS + 1];
+};
+
+static void brb_init(struct bop_ring_buffer *brb)
+{
+ brb->begin = 0;
+ brb->end = 0;
+}
+
+static bool brb_empty(struct bop_ring_buffer *brb)
+{
+ return brb->begin == brb->end;
+}
+
+static unsigned brb_next(struct bop_ring_buffer *brb, unsigned old)
+{
+ unsigned r = old + 1;
+ return (r >= (sizeof(brb->bops) / sizeof(*brb->bops))) ? 0 : r;
+}
+
+static int brb_push(struct bop_ring_buffer *brb,
+ enum block_op_type type, dm_block_t b)
+{
+ struct block_op *bop;
+ unsigned next = brb_next(brb, brb->end);
+
+ /*
+ * We don't allow the last bop to be filled, this way we can
+ * differentiate between full and empty.
+ */
+ if (next == brb->begin)
+ return -ENOMEM;
+
+ bop = brb->bops + brb->end;
+ bop->type = type;
+ bop->block = b;
+
+ brb->end = next;
+
+ return 0;
+}
+
+static int brb_pop(struct bop_ring_buffer *brb, struct block_op *result)
+{
+ struct block_op *bop;
+
+ if (brb_empty(brb))
+ return -ENODATA;
+
+ bop = brb->bops + brb->begin;
+ result->type = bop->type;
+ result->block = bop->block;
+
+ brb->begin = brb_next(brb, brb->begin);
+
+ return 0;
+}
+
+/*----------------------------------------------------------------*/
+
+struct sm_metadata {
+ struct dm_space_map sm;
+
+ struct ll_disk ll;
+ struct ll_disk old_ll;
+
+ dm_block_t begin;
+
+ unsigned recursion_count;
+ unsigned allocated_this_transaction;
+ struct bop_ring_buffer uncommitted;
+
+ struct threshold threshold;
+};
+
+static int add_bop(struct sm_metadata *smm, enum block_op_type type, dm_block_t b)
+{
+ int r = brb_push(&smm->uncommitted, type, b);
+
+ if (r) {
+ DMERR("too many recursive allocations");
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static int commit_bop(struct sm_metadata *smm, struct block_op *op)
+{
+ int r = 0;
+ enum allocation_event ev;
+
+ switch (op->type) {
+ case BOP_INC:
+ r = sm_ll_inc(&smm->ll, op->block, &ev);
+ break;
+
+ case BOP_DEC:
+ r = sm_ll_dec(&smm->ll, op->block, &ev);
+ break;
+ }
+
+ return r;
+}
+
+static void in(struct sm_metadata *smm)
+{
+ smm->recursion_count++;
+}
+
+static int apply_bops(struct sm_metadata *smm)
+{
+ int r = 0;
+
+ while (!brb_empty(&smm->uncommitted)) {
+ struct block_op bop;
+
+ r = brb_pop(&smm->uncommitted, &bop);
+ if (r) {
+ DMERR("bug in bop ring buffer");
+ break;
+ }
+
+ r = commit_bop(smm, &bop);
+ if (r)
+ break;
+ }
+
+ return r;
+}
+
+static int out(struct sm_metadata *smm)
+{
+ int r = 0;
+
+ /*
+ * If we're not recursing then very bad things are happening.
+ */
+ if (!smm->recursion_count) {
+ DMERR("lost track of recursion depth");
+ return -ENOMEM;
+ }
+
+ if (smm->recursion_count == 1)
+ apply_bops(smm);
+
+ smm->recursion_count--;
+
+ return r;
+}
+
+/*
+ * When using the out() function above, we often want to combine an error
+ * code for the operation run in the recursive context with that from
+ * out().
+ */
+static int combine_errors(int r1, int r2)
+{
+ return r1 ? r1 : r2;
+}
+
+static int recursing(struct sm_metadata *smm)
+{
+ return smm->recursion_count;
+}
+
+static void sm_metadata_destroy(struct dm_space_map *sm)
+{
+ struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
+
+ kfree(smm);
+}
+
+static int sm_metadata_get_nr_blocks(struct dm_space_map *sm, dm_block_t *count)
+{
+ struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
+
+ *count = smm->ll.nr_blocks;
+
+ return 0;
+}
+
+static int sm_metadata_get_nr_free(struct dm_space_map *sm, dm_block_t *count)
+{
+ struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
+
+ *count = smm->old_ll.nr_blocks - smm->old_ll.nr_allocated -
+ smm->allocated_this_transaction;
+
+ return 0;
+}
+
+static int sm_metadata_get_count(struct dm_space_map *sm, dm_block_t b,
+ uint32_t *result)
+{
+ int r;
+ unsigned i;
+ struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
+ unsigned adjustment = 0;
+
+ /*
+ * We may have some uncommitted adjustments to add. This list
+ * should always be really short.
+ */
+ for (i = smm->uncommitted.begin;
+ i != smm->uncommitted.end;
+ i = brb_next(&smm->uncommitted, i)) {
+ struct block_op *op = smm->uncommitted.bops + i;
+
+ if (op->block != b)
+ continue;
+
+ switch (op->type) {
+ case BOP_INC:
+ adjustment++;
+ break;
+
+ case BOP_DEC:
+ adjustment--;
+ break;
+ }
+ }
+
+ r = sm_ll_lookup(&smm->ll, b, result);
+ if (r)
+ return r;
+
+ *result += adjustment;
+
+ return 0;
+}
+
+static int sm_metadata_count_is_more_than_one(struct dm_space_map *sm,
+ dm_block_t b, int *result)
+{
+ int r, adjustment = 0;
+ unsigned i;
+ struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
+ uint32_t rc;
+
+ /*
+ * We may have some uncommitted adjustments to add. This list
+ * should always be really short.
+ */
+ for (i = smm->uncommitted.begin;
+ i != smm->uncommitted.end;
+ i = brb_next(&smm->uncommitted, i)) {
+
+ struct block_op *op = smm->uncommitted.bops + i;
+
+ if (op->block != b)
+ continue;
+
+ switch (op->type) {
+ case BOP_INC:
+ adjustment++;
+ break;
+
+ case BOP_DEC:
+ adjustment--;
+ break;
+ }
+ }
+
+ if (adjustment > 1) {
+ *result = 1;
+ return 0;
+ }
+
+ r = sm_ll_lookup_bitmap(&smm->ll, b, &rc);
+ if (r)
+ return r;
+
+ if (rc == 3)
+ /*
+ * We err on the side of caution, and always return true.
+ */
+ *result = 1;
+ else
+ *result = rc + adjustment > 1;
+
+ return 0;
+}
+
+static int sm_metadata_set_count(struct dm_space_map *sm, dm_block_t b,
+ uint32_t count)
+{
+ int r, r2;
+ enum allocation_event ev;
+ struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
+
+ if (smm->recursion_count) {
+ DMERR("cannot recurse set_count()");
+ return -EINVAL;
+ }
+
+ in(smm);
+ r = sm_ll_insert(&smm->ll, b, count, &ev);
+ r2 = out(smm);
+
+ return combine_errors(r, r2);
+}
+
+static int sm_metadata_inc_block(struct dm_space_map *sm, dm_block_t b)
+{
+ int r, r2 = 0;
+ enum allocation_event ev;
+ struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
+
+ if (recursing(smm))
+ r = add_bop(smm, BOP_INC, b);
+ else {
+ in(smm);
+ r = sm_ll_inc(&smm->ll, b, &ev);
+ r2 = out(smm);
+ }
+
+ return combine_errors(r, r2);
+}
+
+static int sm_metadata_dec_block(struct dm_space_map *sm, dm_block_t b)
+{
+ int r, r2 = 0;
+ enum allocation_event ev;
+ struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
+
+ if (recursing(smm))
+ r = add_bop(smm, BOP_DEC, b);
+ else {
+ in(smm);
+ r = sm_ll_dec(&smm->ll, b, &ev);
+ r2 = out(smm);
+ }
+
+ return combine_errors(r, r2);
+}
+
+static int sm_metadata_new_block_(struct dm_space_map *sm, dm_block_t *b)
+{
+ int r, r2 = 0;
+ enum allocation_event ev;
+ struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
+
+ r = sm_ll_find_free_block(&smm->old_ll, smm->begin, smm->old_ll.nr_blocks, b);
+ if (r)
+ return r;
+
+ smm->begin = *b + 1;
+
+ if (recursing(smm))
+ r = add_bop(smm, BOP_INC, *b);
+ else {
+ in(smm);
+ r = sm_ll_inc(&smm->ll, *b, &ev);
+ r2 = out(smm);
+ }
+
+ if (!r)
+ smm->allocated_this_transaction++;
+
+ return combine_errors(r, r2);
+}
+
+static int sm_metadata_new_block(struct dm_space_map *sm, dm_block_t *b)
+{
+ dm_block_t count;
+ struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
+
+ int r = sm_metadata_new_block_(sm, b);
+ if (r) {
+ DMERR_LIMIT("unable to allocate new metadata block");
+ return r;
+ }
+
+ r = sm_metadata_get_nr_free(sm, &count);
+ if (r) {
+ DMERR_LIMIT("couldn't get free block count");
+ return r;
+ }
+
+ check_threshold(&smm->threshold, count);
+
+ return r;
+}
+
+static int sm_metadata_commit(struct dm_space_map *sm)
+{
+ int r;
+ struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
+
+ r = sm_ll_commit(&smm->ll);
+ if (r)
+ return r;
+
+ memcpy(&smm->old_ll, &smm->ll, sizeof(smm->old_ll));
+ smm->begin = 0;
+ smm->allocated_this_transaction = 0;
+
+ return 0;
+}
+
+static int sm_metadata_register_threshold_callback(struct dm_space_map *sm,
+ dm_block_t threshold,
+ dm_sm_threshold_fn fn,
+ void *context)
+{
+ struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
+
+ set_threshold(&smm->threshold, threshold, fn, context);
+
+ return 0;
+}
+
+static int sm_metadata_root_size(struct dm_space_map *sm, size_t *result)
+{
+ *result = sizeof(struct disk_sm_root);
+
+ return 0;
+}
+
+static int sm_metadata_copy_root(struct dm_space_map *sm, void *where_le, size_t max)
+{
+ struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
+ struct disk_sm_root root_le;
+
+ root_le.nr_blocks = cpu_to_le64(smm->ll.nr_blocks);
+ root_le.nr_allocated = cpu_to_le64(smm->ll.nr_allocated);
+ root_le.bitmap_root = cpu_to_le64(smm->ll.bitmap_root);
+ root_le.ref_count_root = cpu_to_le64(smm->ll.ref_count_root);
+
+ if (max < sizeof(root_le))
+ return -ENOSPC;
+
+ memcpy(where_le, &root_le, sizeof(root_le));
+
+ return 0;
+}
+
+static int sm_metadata_extend(struct dm_space_map *sm, dm_block_t extra_blocks);
+
+static struct dm_space_map ops = {
+ .destroy = sm_metadata_destroy,
+ .extend = sm_metadata_extend,
+ .get_nr_blocks = sm_metadata_get_nr_blocks,
+ .get_nr_free = sm_metadata_get_nr_free,
+ .get_count = sm_metadata_get_count,
+ .count_is_more_than_one = sm_metadata_count_is_more_than_one,
+ .set_count = sm_metadata_set_count,
+ .inc_block = sm_metadata_inc_block,
+ .dec_block = sm_metadata_dec_block,
+ .new_block = sm_metadata_new_block,
+ .commit = sm_metadata_commit,
+ .root_size = sm_metadata_root_size,
+ .copy_root = sm_metadata_copy_root,
+ .register_threshold_callback = sm_metadata_register_threshold_callback
+};
+
+/*----------------------------------------------------------------*/
+
+/*
+ * When a new space map is created that manages its own space. We use
+ * this tiny bootstrap allocator.
+ */
+static void sm_bootstrap_destroy(struct dm_space_map *sm)
+{
+}
+
+static int sm_bootstrap_extend(struct dm_space_map *sm, dm_block_t extra_blocks)
+{
+ DMERR("bootstrap doesn't support extend");
+
+ return -EINVAL;
+}
+
+static int sm_bootstrap_get_nr_blocks(struct dm_space_map *sm, dm_block_t *count)
+{
+ struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
+
+ *count = smm->ll.nr_blocks;
+
+ return 0;
+}
+
+static int sm_bootstrap_get_nr_free(struct dm_space_map *sm, dm_block_t *count)
+{
+ struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
+
+ *count = smm->ll.nr_blocks - smm->begin;
+
+ return 0;
+}
+
+static int sm_bootstrap_get_count(struct dm_space_map *sm, dm_block_t b,
+ uint32_t *result)
+{
+ struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
+
+ *result = (b < smm->begin) ? 1 : 0;
+
+ return 0;
+}
+
+static int sm_bootstrap_count_is_more_than_one(struct dm_space_map *sm,
+ dm_block_t b, int *result)
+{
+ *result = 0;
+
+ return 0;
+}
+
+static int sm_bootstrap_set_count(struct dm_space_map *sm, dm_block_t b,
+ uint32_t count)
+{
+ DMERR("bootstrap doesn't support set_count");
+
+ return -EINVAL;
+}
+
+static int sm_bootstrap_new_block(struct dm_space_map *sm, dm_block_t *b)
+{
+ struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
+
+ /*
+ * We know the entire device is unused.
+ */
+ if (smm->begin == smm->ll.nr_blocks)
+ return -ENOSPC;
+
+ *b = smm->begin++;
+
+ return 0;
+}
+
+static int sm_bootstrap_inc_block(struct dm_space_map *sm, dm_block_t b)
+{
+ struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
+
+ return add_bop(smm, BOP_INC, b);
+}
+
+static int sm_bootstrap_dec_block(struct dm_space_map *sm, dm_block_t b)
+{
+ struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
+
+ return add_bop(smm, BOP_DEC, b);
+}
+
+static int sm_bootstrap_commit(struct dm_space_map *sm)
+{
+ return 0;
+}
+
+static int sm_bootstrap_root_size(struct dm_space_map *sm, size_t *result)
+{
+ DMERR("bootstrap doesn't support root_size");
+
+ return -EINVAL;
+}
+
+static int sm_bootstrap_copy_root(struct dm_space_map *sm, void *where,
+ size_t max)
+{
+ DMERR("bootstrap doesn't support copy_root");
+
+ return -EINVAL;
+}
+
+static struct dm_space_map bootstrap_ops = {
+ .destroy = sm_bootstrap_destroy,
+ .extend = sm_bootstrap_extend,
+ .get_nr_blocks = sm_bootstrap_get_nr_blocks,
+ .get_nr_free = sm_bootstrap_get_nr_free,
+ .get_count = sm_bootstrap_get_count,
+ .count_is_more_than_one = sm_bootstrap_count_is_more_than_one,
+ .set_count = sm_bootstrap_set_count,
+ .inc_block = sm_bootstrap_inc_block,
+ .dec_block = sm_bootstrap_dec_block,
+ .new_block = sm_bootstrap_new_block,
+ .commit = sm_bootstrap_commit,
+ .root_size = sm_bootstrap_root_size,
+ .copy_root = sm_bootstrap_copy_root,
+ .register_threshold_callback = NULL
+};
+
+/*----------------------------------------------------------------*/
+
+static int sm_metadata_extend(struct dm_space_map *sm, dm_block_t extra_blocks)
+{
+ int r, i;
+ enum allocation_event ev;
+ struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
+ dm_block_t old_len = smm->ll.nr_blocks;
+
+ /*
+ * Flick into a mode where all blocks get allocated in the new area.
+ */
+ smm->begin = old_len;
+ memcpy(sm, &bootstrap_ops, sizeof(*sm));
+
+ /*
+ * Extend.
+ */
+ r = sm_ll_extend(&smm->ll, extra_blocks);
+ if (r)
+ goto out;
+
+ /*
+ * We repeatedly increment then commit until the commit doesn't
+ * allocate any new blocks.
+ */
+ do {
+ for (i = old_len; !r && i < smm->begin; i++) {
+ r = sm_ll_inc(&smm->ll, i, &ev);
+ if (r)
+ goto out;
+ }
+ old_len = smm->begin;
+
+ r = apply_bops(smm);
+ if (r) {
+ DMERR("%s: apply_bops failed", __func__);
+ goto out;
+ }
+
+ r = sm_ll_commit(&smm->ll);
+ if (r)
+ goto out;
+
+ } while (old_len != smm->begin);
+
+out:
+ /*
+ * Switch back to normal behaviour.
+ */
+ memcpy(sm, &ops, sizeof(*sm));
+ return r;
+}
+
+/*----------------------------------------------------------------*/
+
+struct dm_space_map *dm_sm_metadata_init(void)
+{
+ struct sm_metadata *smm;
+
+ smm = kmalloc(sizeof(*smm), GFP_KERNEL);
+ if (!smm)
+ return ERR_PTR(-ENOMEM);
+
+ memcpy(&smm->sm, &ops, sizeof(smm->sm));
+
+ return &smm->sm;
+}
+
+int dm_sm_metadata_create(struct dm_space_map *sm,
+ struct dm_transaction_manager *tm,
+ dm_block_t nr_blocks,
+ dm_block_t superblock)
+{
+ int r;
+ dm_block_t i;
+ enum allocation_event ev;
+ struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
+
+ smm->begin = superblock + 1;
+ smm->recursion_count = 0;
+ smm->allocated_this_transaction = 0;
+ brb_init(&smm->uncommitted);
+ threshold_init(&smm->threshold);
+
+ memcpy(&smm->sm, &bootstrap_ops, sizeof(smm->sm));
+
+ r = sm_ll_new_metadata(&smm->ll, tm);
+ if (r)
+ return r;
+
+ if (nr_blocks > DM_SM_METADATA_MAX_BLOCKS)
+ nr_blocks = DM_SM_METADATA_MAX_BLOCKS;
+ r = sm_ll_extend(&smm->ll, nr_blocks);
+ if (r)
+ return r;
+
+ memcpy(&smm->sm, &ops, sizeof(smm->sm));
+
+ /*
+ * Now we need to update the newly created data structures with the
+ * allocated blocks that they were built from.
+ */
+ for (i = superblock; !r && i < smm->begin; i++)
+ r = sm_ll_inc(&smm->ll, i, &ev);
+
+ if (r)
+ return r;
+
+ r = apply_bops(smm);
+ if (r) {
+ DMERR("%s: apply_bops failed", __func__);
+ return r;
+ }
+
+ return sm_metadata_commit(sm);
+}
+
+int dm_sm_metadata_open(struct dm_space_map *sm,
+ struct dm_transaction_manager *tm,
+ void *root_le, size_t len)
+{
+ int r;
+ struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
+
+ r = sm_ll_open_metadata(&smm->ll, tm, root_le, len);
+ if (r)
+ return r;
+
+ smm->begin = 0;
+ smm->recursion_count = 0;
+ smm->allocated_this_transaction = 0;
+ brb_init(&smm->uncommitted);
+ threshold_init(&smm->threshold);
+
+ memcpy(&smm->old_ll, &smm->ll, sizeof(smm->old_ll));
+ return 0;
+}
diff --git a/drivers/md/persistent-data/dm-space-map-metadata.h b/drivers/md/persistent-data/dm-space-map-metadata.h
new file mode 100644
index 000000000..64df92397
--- /dev/null
+++ b/drivers/md/persistent-data/dm-space-map-metadata.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (C) 2011 Red Hat, Inc.
+ *
+ * This file is released under the GPL.
+ */
+
+#ifndef DM_SPACE_MAP_METADATA_H
+#define DM_SPACE_MAP_METADATA_H
+
+#include "dm-transaction-manager.h"
+
+#define DM_SM_METADATA_BLOCK_SIZE (4096 >> SECTOR_SHIFT)
+
+/*
+ * The metadata device is currently limited in size.
+ *
+ * We have one block of index, which can hold 255 index entries. Each
+ * index entry contains allocation info about ~16k metadata blocks.
+ */
+#define DM_SM_METADATA_MAX_BLOCKS (255 * ((1 << 14) - 64))
+#define DM_SM_METADATA_MAX_SECTORS (DM_SM_METADATA_MAX_BLOCKS * DM_SM_METADATA_BLOCK_SIZE)
+
+/*
+ * Unfortunately we have to use two-phase construction due to the cycle
+ * between the tm and sm.
+ */
+struct dm_space_map *dm_sm_metadata_init(void);
+
+/*
+ * Create a fresh space map.
+ */
+int dm_sm_metadata_create(struct dm_space_map *sm,
+ struct dm_transaction_manager *tm,
+ dm_block_t nr_blocks,
+ dm_block_t superblock);
+
+/*
+ * Open from a previously-recorded root.
+ */
+int dm_sm_metadata_open(struct dm_space_map *sm,
+ struct dm_transaction_manager *tm,
+ void *root_le, size_t len);
+
+#endif /* DM_SPACE_MAP_METADATA_H */
diff --git a/drivers/md/persistent-data/dm-space-map.h b/drivers/md/persistent-data/dm-space-map.h
new file mode 100644
index 000000000..3e6d1153b
--- /dev/null
+++ b/drivers/md/persistent-data/dm-space-map.h
@@ -0,0 +1,157 @@
+/*
+ * Copyright (C) 2011 Red Hat, Inc.
+ *
+ * This file is released under the GPL.
+ */
+
+#ifndef _LINUX_DM_SPACE_MAP_H
+#define _LINUX_DM_SPACE_MAP_H
+
+#include "dm-block-manager.h"
+
+typedef void (*dm_sm_threshold_fn)(void *context);
+
+/*
+ * struct dm_space_map keeps a record of how many times each block in a device
+ * is referenced. It needs to be fixed on disk as part of the transaction.
+ */
+struct dm_space_map {
+ void (*destroy)(struct dm_space_map *sm);
+
+ /*
+ * You must commit before allocating the newly added space.
+ */
+ int (*extend)(struct dm_space_map *sm, dm_block_t extra_blocks);
+
+ /*
+ * Extensions do not appear in this count until after commit has
+ * been called.
+ */
+ int (*get_nr_blocks)(struct dm_space_map *sm, dm_block_t *count);
+
+ /*
+ * Space maps must never allocate a block from the previous
+ * transaction, in case we need to rollback. This complicates the
+ * semantics of get_nr_free(), it should return the number of blocks
+ * that are available for allocation _now_. For instance you may
+ * have blocks with a zero reference count that will not be
+ * available for allocation until after the next commit.
+ */
+ int (*get_nr_free)(struct dm_space_map *sm, dm_block_t *count);
+
+ int (*get_count)(struct dm_space_map *sm, dm_block_t b, uint32_t *result);
+ int (*count_is_more_than_one)(struct dm_space_map *sm, dm_block_t b,
+ int *result);
+ int (*set_count)(struct dm_space_map *sm, dm_block_t b, uint32_t count);
+
+ int (*commit)(struct dm_space_map *sm);
+
+ int (*inc_block)(struct dm_space_map *sm, dm_block_t b);
+ int (*dec_block)(struct dm_space_map *sm, dm_block_t b);
+
+ /*
+ * new_block will increment the returned block.
+ */
+ int (*new_block)(struct dm_space_map *sm, dm_block_t *b);
+
+ /*
+ * The root contains all the information needed to fix the space map.
+ * Generally this info is small, so squirrel it away in a disk block
+ * along with other info.
+ */
+ int (*root_size)(struct dm_space_map *sm, size_t *result);
+ int (*copy_root)(struct dm_space_map *sm, void *copy_to_here_le, size_t len);
+
+ /*
+ * You can register one threshold callback which is edge-triggered
+ * when the free space in the space map drops below the threshold.
+ */
+ int (*register_threshold_callback)(struct dm_space_map *sm,
+ dm_block_t threshold,
+ dm_sm_threshold_fn fn,
+ void *context);
+};
+
+/*----------------------------------------------------------------*/
+
+static inline void dm_sm_destroy(struct dm_space_map *sm)
+{
+ sm->destroy(sm);
+}
+
+static inline int dm_sm_extend(struct dm_space_map *sm, dm_block_t extra_blocks)
+{
+ return sm->extend(sm, extra_blocks);
+}
+
+static inline int dm_sm_get_nr_blocks(struct dm_space_map *sm, dm_block_t *count)
+{
+ return sm->get_nr_blocks(sm, count);
+}
+
+static inline int dm_sm_get_nr_free(struct dm_space_map *sm, dm_block_t *count)
+{
+ return sm->get_nr_free(sm, count);
+}
+
+static inline int dm_sm_get_count(struct dm_space_map *sm, dm_block_t b,
+ uint32_t *result)
+{
+ return sm->get_count(sm, b, result);
+}
+
+static inline int dm_sm_count_is_more_than_one(struct dm_space_map *sm,
+ dm_block_t b, int *result)
+{
+ return sm->count_is_more_than_one(sm, b, result);
+}
+
+static inline int dm_sm_set_count(struct dm_space_map *sm, dm_block_t b,
+ uint32_t count)
+{
+ return sm->set_count(sm, b, count);
+}
+
+static inline int dm_sm_commit(struct dm_space_map *sm)
+{
+ return sm->commit(sm);
+}
+
+static inline int dm_sm_inc_block(struct dm_space_map *sm, dm_block_t b)
+{
+ return sm->inc_block(sm, b);
+}
+
+static inline int dm_sm_dec_block(struct dm_space_map *sm, dm_block_t b)
+{
+ return sm->dec_block(sm, b);
+}
+
+static inline int dm_sm_new_block(struct dm_space_map *sm, dm_block_t *b)
+{
+ return sm->new_block(sm, b);
+}
+
+static inline int dm_sm_root_size(struct dm_space_map *sm, size_t *result)
+{
+ return sm->root_size(sm, result);
+}
+
+static inline int dm_sm_copy_root(struct dm_space_map *sm, void *copy_to_here_le, size_t len)
+{
+ return sm->copy_root(sm, copy_to_here_le, len);
+}
+
+static inline int dm_sm_register_threshold_callback(struct dm_space_map *sm,
+ dm_block_t threshold,
+ dm_sm_threshold_fn fn,
+ void *context)
+{
+ if (sm->register_threshold_callback)
+ return sm->register_threshold_callback(sm, threshold, fn, context);
+
+ return -EINVAL;
+}
+
+
+#endif /* _LINUX_DM_SPACE_MAP_H */
diff --git a/drivers/md/persistent-data/dm-transaction-manager.c b/drivers/md/persistent-data/dm-transaction-manager.c
new file mode 100644
index 000000000..9cb797d80
--- /dev/null
+++ b/drivers/md/persistent-data/dm-transaction-manager.c
@@ -0,0 +1,455 @@
+/*
+ * Copyright (C) 2011 Red Hat, Inc.
+ *
+ * This file is released under the GPL.
+ */
+#include "dm-transaction-manager.h"
+#include "dm-space-map.h"
+#include "dm-space-map-disk.h"
+#include "dm-space-map-metadata.h"
+#include "dm-persistent-data-internal.h"
+
+#include <linux/export.h>
+#include <linux/mutex.h>
+#include <linux/hash.h>
+#include <linux/slab.h>
+#include <linux/device-mapper.h>
+
+#define DM_MSG_PREFIX "transaction manager"
+
+/*----------------------------------------------------------------*/
+
+#define PREFETCH_SIZE 128
+#define PREFETCH_BITS 7
+#define PREFETCH_SENTINEL ((dm_block_t) -1ULL)
+
+struct prefetch_set {
+ struct mutex lock;
+ dm_block_t blocks[PREFETCH_SIZE];
+};
+
+static unsigned prefetch_hash(dm_block_t b)
+{
+ return hash_64(b, PREFETCH_BITS);
+}
+
+static void prefetch_wipe(struct prefetch_set *p)
+{
+ unsigned i;
+ for (i = 0; i < PREFETCH_SIZE; i++)
+ p->blocks[i] = PREFETCH_SENTINEL;
+}
+
+static void prefetch_init(struct prefetch_set *p)
+{
+ mutex_init(&p->lock);
+ prefetch_wipe(p);
+}
+
+static void prefetch_add(struct prefetch_set *p, dm_block_t b)
+{
+ unsigned h = prefetch_hash(b);
+
+ mutex_lock(&p->lock);
+ if (p->blocks[h] == PREFETCH_SENTINEL)
+ p->blocks[h] = b;
+
+ mutex_unlock(&p->lock);
+}
+
+static void prefetch_issue(struct prefetch_set *p, struct dm_block_manager *bm)
+{
+ unsigned i;
+
+ mutex_lock(&p->lock);
+
+ for (i = 0; i < PREFETCH_SIZE; i++)
+ if (p->blocks[i] != PREFETCH_SENTINEL) {
+ dm_bm_prefetch(bm, p->blocks[i]);
+ p->blocks[i] = PREFETCH_SENTINEL;
+ }
+
+ mutex_unlock(&p->lock);
+}
+
+/*----------------------------------------------------------------*/
+
+struct shadow_info {
+ struct hlist_node hlist;
+ dm_block_t where;
+};
+
+/*
+ * It would be nice if we scaled with the size of transaction.
+ */
+#define DM_HASH_SIZE 256
+#define DM_HASH_MASK (DM_HASH_SIZE - 1)
+
+struct dm_transaction_manager {
+ int is_clone;
+ struct dm_transaction_manager *real;
+
+ struct dm_block_manager *bm;
+ struct dm_space_map *sm;
+
+ spinlock_t lock;
+ struct hlist_head buckets[DM_HASH_SIZE];
+
+ struct prefetch_set prefetches;
+};
+
+/*----------------------------------------------------------------*/
+
+static int is_shadow(struct dm_transaction_manager *tm, dm_block_t b)
+{
+ int r = 0;
+ unsigned bucket = dm_hash_block(b, DM_HASH_MASK);
+ struct shadow_info *si;
+
+ spin_lock(&tm->lock);
+ hlist_for_each_entry(si, tm->buckets + bucket, hlist)
+ if (si->where == b) {
+ r = 1;
+ break;
+ }
+ spin_unlock(&tm->lock);
+
+ return r;
+}
+
+/*
+ * This can silently fail if there's no memory. We're ok with this since
+ * creating redundant shadows causes no harm.
+ */
+static void insert_shadow(struct dm_transaction_manager *tm, dm_block_t b)
+{
+ unsigned bucket;
+ struct shadow_info *si;
+
+ si = kmalloc(sizeof(*si), GFP_NOIO);
+ if (si) {
+ si->where = b;
+ bucket = dm_hash_block(b, DM_HASH_MASK);
+ spin_lock(&tm->lock);
+ hlist_add_head(&si->hlist, tm->buckets + bucket);
+ spin_unlock(&tm->lock);
+ }
+}
+
+static void wipe_shadow_table(struct dm_transaction_manager *tm)
+{
+ struct shadow_info *si;
+ struct hlist_node *tmp;
+ struct hlist_head *bucket;
+ int i;
+
+ spin_lock(&tm->lock);
+ for (i = 0; i < DM_HASH_SIZE; i++) {
+ bucket = tm->buckets + i;
+ hlist_for_each_entry_safe(si, tmp, bucket, hlist)
+ kfree(si);
+
+ INIT_HLIST_HEAD(bucket);
+ }
+
+ spin_unlock(&tm->lock);
+}
+
+/*----------------------------------------------------------------*/
+
+static struct dm_transaction_manager *dm_tm_create(struct dm_block_manager *bm,
+ struct dm_space_map *sm)
+{
+ int i;
+ struct dm_transaction_manager *tm;
+
+ tm = kmalloc(sizeof(*tm), GFP_KERNEL);
+ if (!tm)
+ return ERR_PTR(-ENOMEM);
+
+ tm->is_clone = 0;
+ tm->real = NULL;
+ tm->bm = bm;
+ tm->sm = sm;
+
+ spin_lock_init(&tm->lock);
+ for (i = 0; i < DM_HASH_SIZE; i++)
+ INIT_HLIST_HEAD(tm->buckets + i);
+
+ prefetch_init(&tm->prefetches);
+
+ return tm;
+}
+
+struct dm_transaction_manager *dm_tm_create_non_blocking_clone(struct dm_transaction_manager *real)
+{
+ struct dm_transaction_manager *tm;
+
+ tm = kmalloc(sizeof(*tm), GFP_KERNEL);
+ if (tm) {
+ tm->is_clone = 1;
+ tm->real = real;
+ }
+
+ return tm;
+}
+EXPORT_SYMBOL_GPL(dm_tm_create_non_blocking_clone);
+
+void dm_tm_destroy(struct dm_transaction_manager *tm)
+{
+ if (!tm->is_clone)
+ wipe_shadow_table(tm);
+
+ kfree(tm);
+}
+EXPORT_SYMBOL_GPL(dm_tm_destroy);
+
+int dm_tm_pre_commit(struct dm_transaction_manager *tm)
+{
+ int r;
+
+ if (tm->is_clone)
+ return -EWOULDBLOCK;
+
+ r = dm_sm_commit(tm->sm);
+ if (r < 0)
+ return r;
+
+ return dm_bm_flush(tm->bm);
+}
+EXPORT_SYMBOL_GPL(dm_tm_pre_commit);
+
+int dm_tm_commit(struct dm_transaction_manager *tm, struct dm_block *root)
+{
+ if (tm->is_clone)
+ return -EWOULDBLOCK;
+
+ wipe_shadow_table(tm);
+ dm_bm_unlock(root);
+
+ return dm_bm_flush(tm->bm);
+}
+EXPORT_SYMBOL_GPL(dm_tm_commit);
+
+int dm_tm_new_block(struct dm_transaction_manager *tm,
+ struct dm_block_validator *v,
+ struct dm_block **result)
+{
+ int r;
+ dm_block_t new_block;
+
+ if (tm->is_clone)
+ return -EWOULDBLOCK;
+
+ r = dm_sm_new_block(tm->sm, &new_block);
+ if (r < 0)
+ return r;
+
+ r = dm_bm_write_lock_zero(tm->bm, new_block, v, result);
+ if (r < 0) {
+ dm_sm_dec_block(tm->sm, new_block);
+ return r;
+ }
+
+ /*
+ * New blocks count as shadows in that they don't need to be
+ * shadowed again.
+ */
+ insert_shadow(tm, new_block);
+
+ return 0;
+}
+
+static int __shadow_block(struct dm_transaction_manager *tm, dm_block_t orig,
+ struct dm_block_validator *v,
+ struct dm_block **result)
+{
+ int r;
+ dm_block_t new;
+ struct dm_block *orig_block;
+
+ r = dm_sm_new_block(tm->sm, &new);
+ if (r < 0)
+ return r;
+
+ r = dm_sm_dec_block(tm->sm, orig);
+ if (r < 0)
+ return r;
+
+ r = dm_bm_read_lock(tm->bm, orig, v, &orig_block);
+ if (r < 0)
+ return r;
+
+ /*
+ * It would be tempting to use dm_bm_unlock_move here, but some
+ * code, such as the space maps, keeps using the old data structures
+ * secure in the knowledge they won't be changed until the next
+ * transaction. Using unlock_move would force a synchronous read
+ * since the old block would no longer be in the cache.
+ */
+ r = dm_bm_write_lock_zero(tm->bm, new, v, result);
+ if (r) {
+ dm_bm_unlock(orig_block);
+ return r;
+ }
+
+ memcpy(dm_block_data(*result), dm_block_data(orig_block),
+ dm_bm_block_size(tm->bm));
+
+ dm_bm_unlock(orig_block);
+ return r;
+}
+
+int dm_tm_shadow_block(struct dm_transaction_manager *tm, dm_block_t orig,
+ struct dm_block_validator *v, struct dm_block **result,
+ int *inc_children)
+{
+ int r;
+
+ if (tm->is_clone)
+ return -EWOULDBLOCK;
+
+ r = dm_sm_count_is_more_than_one(tm->sm, orig, inc_children);
+ if (r < 0)
+ return r;
+
+ if (is_shadow(tm, orig) && !*inc_children)
+ return dm_bm_write_lock(tm->bm, orig, v, result);
+
+ r = __shadow_block(tm, orig, v, result);
+ if (r < 0)
+ return r;
+ insert_shadow(tm, dm_block_location(*result));
+
+ return r;
+}
+EXPORT_SYMBOL_GPL(dm_tm_shadow_block);
+
+int dm_tm_read_lock(struct dm_transaction_manager *tm, dm_block_t b,
+ struct dm_block_validator *v,
+ struct dm_block **blk)
+{
+ if (tm->is_clone) {
+ int r = dm_bm_read_try_lock(tm->real->bm, b, v, blk);
+
+ if (r == -EWOULDBLOCK)
+ prefetch_add(&tm->real->prefetches, b);
+
+ return r;
+ }
+
+ return dm_bm_read_lock(tm->bm, b, v, blk);
+}
+EXPORT_SYMBOL_GPL(dm_tm_read_lock);
+
+int dm_tm_unlock(struct dm_transaction_manager *tm, struct dm_block *b)
+{
+ return dm_bm_unlock(b);
+}
+EXPORT_SYMBOL_GPL(dm_tm_unlock);
+
+void dm_tm_inc(struct dm_transaction_manager *tm, dm_block_t b)
+{
+ /*
+ * The non-blocking clone doesn't support this.
+ */
+ BUG_ON(tm->is_clone);
+
+ dm_sm_inc_block(tm->sm, b);
+}
+EXPORT_SYMBOL_GPL(dm_tm_inc);
+
+void dm_tm_dec(struct dm_transaction_manager *tm, dm_block_t b)
+{
+ /*
+ * The non-blocking clone doesn't support this.
+ */
+ BUG_ON(tm->is_clone);
+
+ dm_sm_dec_block(tm->sm, b);
+}
+EXPORT_SYMBOL_GPL(dm_tm_dec);
+
+int dm_tm_ref(struct dm_transaction_manager *tm, dm_block_t b,
+ uint32_t *result)
+{
+ if (tm->is_clone)
+ return -EWOULDBLOCK;
+
+ return dm_sm_get_count(tm->sm, b, result);
+}
+
+struct dm_block_manager *dm_tm_get_bm(struct dm_transaction_manager *tm)
+{
+ return tm->bm;
+}
+
+void dm_tm_issue_prefetches(struct dm_transaction_manager *tm)
+{
+ prefetch_issue(&tm->prefetches, tm->bm);
+}
+EXPORT_SYMBOL_GPL(dm_tm_issue_prefetches);
+
+/*----------------------------------------------------------------*/
+
+static int dm_tm_create_internal(struct dm_block_manager *bm,
+ dm_block_t sb_location,
+ struct dm_transaction_manager **tm,
+ struct dm_space_map **sm,
+ int create,
+ void *sm_root, size_t sm_len)
+{
+ int r;
+
+ *sm = dm_sm_metadata_init();
+ if (IS_ERR(*sm))
+ return PTR_ERR(*sm);
+
+ *tm = dm_tm_create(bm, *sm);
+ if (IS_ERR(*tm)) {
+ dm_sm_destroy(*sm);
+ return PTR_ERR(*tm);
+ }
+
+ if (create) {
+ r = dm_sm_metadata_create(*sm, *tm, dm_bm_nr_blocks(bm),
+ sb_location);
+ if (r) {
+ DMERR("couldn't create metadata space map");
+ goto bad;
+ }
+
+ } else {
+ r = dm_sm_metadata_open(*sm, *tm, sm_root, sm_len);
+ if (r) {
+ DMERR("couldn't open metadata space map");
+ goto bad;
+ }
+ }
+
+ return 0;
+
+bad:
+ dm_tm_destroy(*tm);
+ dm_sm_destroy(*sm);
+ return r;
+}
+
+int dm_tm_create_with_sm(struct dm_block_manager *bm, dm_block_t sb_location,
+ struct dm_transaction_manager **tm,
+ struct dm_space_map **sm)
+{
+ return dm_tm_create_internal(bm, sb_location, tm, sm, 1, NULL, 0);
+}
+EXPORT_SYMBOL_GPL(dm_tm_create_with_sm);
+
+int dm_tm_open_with_sm(struct dm_block_manager *bm, dm_block_t sb_location,
+ void *sm_root, size_t root_len,
+ struct dm_transaction_manager **tm,
+ struct dm_space_map **sm)
+{
+ return dm_tm_create_internal(bm, sb_location, tm, sm, 0, sm_root, root_len);
+}
+EXPORT_SYMBOL_GPL(dm_tm_open_with_sm);
+
+/*----------------------------------------------------------------*/
diff --git a/drivers/md/persistent-data/dm-transaction-manager.h b/drivers/md/persistent-data/dm-transaction-manager.h
new file mode 100644
index 000000000..2e0d4d66f
--- /dev/null
+++ b/drivers/md/persistent-data/dm-transaction-manager.h
@@ -0,0 +1,137 @@
+/*
+ * Copyright (C) 2011 Red Hat, Inc.
+ *
+ * This file is released under the GPL.
+ */
+
+#ifndef _LINUX_DM_TRANSACTION_MANAGER_H
+#define _LINUX_DM_TRANSACTION_MANAGER_H
+
+#include "dm-block-manager.h"
+
+struct dm_transaction_manager;
+struct dm_space_map;
+
+/*----------------------------------------------------------------*/
+
+/*
+ * This manages the scope of a transaction. It also enforces immutability
+ * of the on-disk data structures by limiting access to writeable blocks.
+ *
+ * Clients should not fiddle with the block manager directly.
+ */
+
+void dm_tm_destroy(struct dm_transaction_manager *tm);
+
+/*
+ * The non-blocking version of a transaction manager is intended for use in
+ * fast path code that needs to do lookups e.g. a dm mapping function.
+ * You create the non-blocking variant from a normal tm. The interface is
+ * the same, except that most functions will just return -EWOULDBLOCK.
+ * Methods that return void yet may block should not be called on a clone
+ * viz. dm_tm_inc, dm_tm_dec. Call dm_tm_destroy() as you would with a normal
+ * tm when you've finished with it. You may not destroy the original prior
+ * to clones.
+ */
+struct dm_transaction_manager *dm_tm_create_non_blocking_clone(struct dm_transaction_manager *real);
+
+/*
+ * We use a 2-phase commit here.
+ *
+ * i) Make all changes for the transaction *except* for the superblock.
+ * Then call dm_tm_pre_commit() to flush them to disk.
+ *
+ * ii) Lock your superblock. Update. Then call dm_tm_commit() which will
+ * unlock the superblock and flush it. No other blocks should be updated
+ * during this period. Care should be taken to never unlock a partially
+ * updated superblock; perform any operations that could fail *before* you
+ * take the superblock lock.
+ */
+int dm_tm_pre_commit(struct dm_transaction_manager *tm);
+int dm_tm_commit(struct dm_transaction_manager *tm, struct dm_block *superblock);
+
+/*
+ * These methods are the only way to get hold of a writeable block.
+ */
+
+/*
+ * dm_tm_new_block() is pretty self-explanatory. Make sure you do actually
+ * write to the whole of @data before you unlock, otherwise you could get
+ * a data leak. (The other option is for tm_new_block() to zero new blocks
+ * before handing them out, which will be redundant in most, if not all,
+ * cases).
+ * Zeroes the new block and returns with write lock held.
+ */
+int dm_tm_new_block(struct dm_transaction_manager *tm,
+ struct dm_block_validator *v,
+ struct dm_block **result);
+
+/*
+ * dm_tm_shadow_block() allocates a new block and copies the data from @orig
+ * to it. It then decrements the reference count on original block. Use
+ * this to update the contents of a block in a data structure, don't
+ * confuse this with a clone - you shouldn't access the orig block after
+ * this operation. Because the tm knows the scope of the transaction it
+ * can optimise requests for a shadow of a shadow to a no-op. Don't forget
+ * to unlock when you've finished with the shadow.
+ *
+ * The @inc_children flag is used to tell the caller whether it needs to
+ * adjust reference counts for children. (Data in the block may refer to
+ * other blocks.)
+ *
+ * Shadowing implicitly drops a reference on @orig so you must not have
+ * it locked when you call this.
+ */
+int dm_tm_shadow_block(struct dm_transaction_manager *tm, dm_block_t orig,
+ struct dm_block_validator *v,
+ struct dm_block **result, int *inc_children);
+
+/*
+ * Read access. You can lock any block you want. If there's a write lock
+ * on it outstanding then it'll block.
+ */
+int dm_tm_read_lock(struct dm_transaction_manager *tm, dm_block_t b,
+ struct dm_block_validator *v,
+ struct dm_block **result);
+
+int dm_tm_unlock(struct dm_transaction_manager *tm, struct dm_block *b);
+
+/*
+ * Functions for altering the reference count of a block directly.
+ */
+void dm_tm_inc(struct dm_transaction_manager *tm, dm_block_t b);
+
+void dm_tm_dec(struct dm_transaction_manager *tm, dm_block_t b);
+
+int dm_tm_ref(struct dm_transaction_manager *tm, dm_block_t b,
+ uint32_t *result);
+
+struct dm_block_manager *dm_tm_get_bm(struct dm_transaction_manager *tm);
+
+/*
+ * If you're using a non-blocking clone the tm will build up a list of
+ * requested blocks that weren't in core. This call will request those
+ * blocks to be prefetched.
+ */
+void dm_tm_issue_prefetches(struct dm_transaction_manager *tm);
+
+/*
+ * A little utility that ties the knot by producing a transaction manager
+ * that has a space map managed by the transaction manager...
+ *
+ * Returns a tm that has an open transaction to write the new disk sm.
+ * Caller should store the new sm root and commit.
+ *
+ * The superblock location is passed so the metadata space map knows it
+ * shouldn't be used.
+ */
+int dm_tm_create_with_sm(struct dm_block_manager *bm, dm_block_t sb_location,
+ struct dm_transaction_manager **tm,
+ struct dm_space_map **sm);
+
+int dm_tm_open_with_sm(struct dm_block_manager *bm, dm_block_t sb_location,
+ void *sm_root, size_t root_len,
+ struct dm_transaction_manager **tm,
+ struct dm_space_map **sm);
+
+#endif /* _LINUX_DM_TRANSACTION_MANAGER_H */
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
new file mode 100644
index 000000000..efb654eb5
--- /dev/null
+++ b/drivers/md/raid0.c
@@ -0,0 +1,749 @@
+/*
+ raid0.c : Multiple Devices driver for Linux
+ Copyright (C) 1994-96 Marc ZYNGIER
+ <zyngier@ufr-info-p7.ibp.fr> or
+ <maz@gloups.fdn.fr>
+ Copyright (C) 1999, 2000 Ingo Molnar, Red Hat
+
+ RAID-0 management functions.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2, or (at your option)
+ any later version.
+
+ You should have received a copy of the GNU General Public License
+ (for example /usr/src/linux/COPYING); if not, write to the Free
+ Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+*/
+
+#include <linux/blkdev.h>
+#include <linux/seq_file.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include "md.h"
+#include "raid0.h"
+#include "raid5.h"
+
+static int raid0_congested(struct mddev *mddev, int bits)
+{
+ struct r0conf *conf = mddev->private;
+ struct md_rdev **devlist = conf->devlist;
+ int raid_disks = conf->strip_zone[0].nb_dev;
+ int i, ret = 0;
+
+ for (i = 0; i < raid_disks && !ret ; i++) {
+ struct request_queue *q = bdev_get_queue(devlist[i]->bdev);
+
+ ret |= bdi_congested(&q->backing_dev_info, bits);
+ }
+ return ret;
+}
+
+/*
+ * inform the user of the raid configuration
+*/
+static void dump_zones(struct mddev *mddev)
+{
+ int j, k;
+ sector_t zone_size = 0;
+ sector_t zone_start = 0;
+ char b[BDEVNAME_SIZE];
+ struct r0conf *conf = mddev->private;
+ int raid_disks = conf->strip_zone[0].nb_dev;
+ printk(KERN_INFO "md: RAID0 configuration for %s - %d zone%s\n",
+ mdname(mddev),
+ conf->nr_strip_zones, conf->nr_strip_zones==1?"":"s");
+ for (j = 0; j < conf->nr_strip_zones; j++) {
+ printk(KERN_INFO "md: zone%d=[", j);
+ for (k = 0; k < conf->strip_zone[j].nb_dev; k++)
+ printk(KERN_CONT "%s%s", k?"/":"",
+ bdevname(conf->devlist[j*raid_disks
+ + k]->bdev, b));
+ printk(KERN_CONT "]\n");
+
+ zone_size = conf->strip_zone[j].zone_end - zone_start;
+ printk(KERN_INFO " zone-offset=%10lluKB, "
+ "device-offset=%10lluKB, size=%10lluKB\n",
+ (unsigned long long)zone_start>>1,
+ (unsigned long long)conf->strip_zone[j].dev_start>>1,
+ (unsigned long long)zone_size>>1);
+ zone_start = conf->strip_zone[j].zone_end;
+ }
+ printk(KERN_INFO "\n");
+}
+
+static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
+{
+ int i, c, err;
+ sector_t curr_zone_end, sectors;
+ struct md_rdev *smallest, *rdev1, *rdev2, *rdev, **dev;
+ struct strip_zone *zone;
+ int cnt;
+ char b[BDEVNAME_SIZE];
+ char b2[BDEVNAME_SIZE];
+ struct r0conf *conf = kzalloc(sizeof(*conf), GFP_KERNEL);
+ bool discard_supported = false;
+
+ if (!conf)
+ return -ENOMEM;
+ rdev_for_each(rdev1, mddev) {
+ pr_debug("md/raid0:%s: looking at %s\n",
+ mdname(mddev),
+ bdevname(rdev1->bdev, b));
+ c = 0;
+
+ /* round size to chunk_size */
+ sectors = rdev1->sectors;
+ sector_div(sectors, mddev->chunk_sectors);
+ rdev1->sectors = sectors * mddev->chunk_sectors;
+
+ rdev_for_each(rdev2, mddev) {
+ pr_debug("md/raid0:%s: comparing %s(%llu)"
+ " with %s(%llu)\n",
+ mdname(mddev),
+ bdevname(rdev1->bdev,b),
+ (unsigned long long)rdev1->sectors,
+ bdevname(rdev2->bdev,b2),
+ (unsigned long long)rdev2->sectors);
+ if (rdev2 == rdev1) {
+ pr_debug("md/raid0:%s: END\n",
+ mdname(mddev));
+ break;
+ }
+ if (rdev2->sectors == rdev1->sectors) {
+ /*
+ * Not unique, don't count it as a new
+ * group
+ */
+ pr_debug("md/raid0:%s: EQUAL\n",
+ mdname(mddev));
+ c = 1;
+ break;
+ }
+ pr_debug("md/raid0:%s: NOT EQUAL\n",
+ mdname(mddev));
+ }
+ if (!c) {
+ pr_debug("md/raid0:%s: ==> UNIQUE\n",
+ mdname(mddev));
+ conf->nr_strip_zones++;
+ pr_debug("md/raid0:%s: %d zones\n",
+ mdname(mddev), conf->nr_strip_zones);
+ }
+ }
+ pr_debug("md/raid0:%s: FINAL %d zones\n",
+ mdname(mddev), conf->nr_strip_zones);
+ err = -ENOMEM;
+ conf->strip_zone = kzalloc(sizeof(struct strip_zone)*
+ conf->nr_strip_zones, GFP_KERNEL);
+ if (!conf->strip_zone)
+ goto abort;
+ conf->devlist = kzalloc(sizeof(struct md_rdev*)*
+ conf->nr_strip_zones*mddev->raid_disks,
+ GFP_KERNEL);
+ if (!conf->devlist)
+ goto abort;
+
+ /* The first zone must contain all devices, so here we check that
+ * there is a proper alignment of slots to devices and find them all
+ */
+ zone = &conf->strip_zone[0];
+ cnt = 0;
+ smallest = NULL;
+ dev = conf->devlist;
+ err = -EINVAL;
+ rdev_for_each(rdev1, mddev) {
+ int j = rdev1->raid_disk;
+
+ if (mddev->level == 10) {
+ /* taking over a raid10-n2 array */
+ j /= 2;
+ rdev1->new_raid_disk = j;
+ }
+
+ if (mddev->level == 1) {
+ /* taiking over a raid1 array-
+ * we have only one active disk
+ */
+ j = 0;
+ rdev1->new_raid_disk = j;
+ }
+
+ if (j < 0) {
+ printk(KERN_ERR
+ "md/raid0:%s: remove inactive devices before converting to RAID0\n",
+ mdname(mddev));
+ goto abort;
+ }
+ if (j >= mddev->raid_disks) {
+ printk(KERN_ERR "md/raid0:%s: bad disk number %d - "
+ "aborting!\n", mdname(mddev), j);
+ goto abort;
+ }
+ if (dev[j]) {
+ printk(KERN_ERR "md/raid0:%s: multiple devices for %d - "
+ "aborting!\n", mdname(mddev), j);
+ goto abort;
+ }
+ dev[j] = rdev1;
+
+ if (mddev->queue)
+ disk_stack_limits(mddev->gendisk, rdev1->bdev,
+ rdev1->data_offset << 9);
+
+ if (rdev1->bdev->bd_disk->queue->merge_bvec_fn)
+ conf->has_merge_bvec = 1;
+
+ if (!smallest || (rdev1->sectors < smallest->sectors))
+ smallest = rdev1;
+ cnt++;
+
+ if (blk_queue_discard(bdev_get_queue(rdev1->bdev)))
+ discard_supported = true;
+ }
+ if (cnt != mddev->raid_disks) {
+ printk(KERN_ERR "md/raid0:%s: too few disks (%d of %d) - "
+ "aborting!\n", mdname(mddev), cnt, mddev->raid_disks);
+ goto abort;
+ }
+ zone->nb_dev = cnt;
+ zone->zone_end = smallest->sectors * cnt;
+
+ curr_zone_end = zone->zone_end;
+
+ /* now do the other zones */
+ for (i = 1; i < conf->nr_strip_zones; i++)
+ {
+ int j;
+
+ zone = conf->strip_zone + i;
+ dev = conf->devlist + i * mddev->raid_disks;
+
+ pr_debug("md/raid0:%s: zone %d\n", mdname(mddev), i);
+ zone->dev_start = smallest->sectors;
+ smallest = NULL;
+ c = 0;
+
+ for (j=0; j<cnt; j++) {
+ rdev = conf->devlist[j];
+ if (rdev->sectors <= zone->dev_start) {
+ pr_debug("md/raid0:%s: checking %s ... nope\n",
+ mdname(mddev),
+ bdevname(rdev->bdev, b));
+ continue;
+ }
+ pr_debug("md/raid0:%s: checking %s ..."
+ " contained as device %d\n",
+ mdname(mddev),
+ bdevname(rdev->bdev, b), c);
+ dev[c] = rdev;
+ c++;
+ if (!smallest || rdev->sectors < smallest->sectors) {
+ smallest = rdev;
+ pr_debug("md/raid0:%s: (%llu) is smallest!.\n",
+ mdname(mddev),
+ (unsigned long long)rdev->sectors);
+ }
+ }
+
+ zone->nb_dev = c;
+ sectors = (smallest->sectors - zone->dev_start) * c;
+ pr_debug("md/raid0:%s: zone->nb_dev: %d, sectors: %llu\n",
+ mdname(mddev),
+ zone->nb_dev, (unsigned long long)sectors);
+
+ curr_zone_end += sectors;
+ zone->zone_end = curr_zone_end;
+
+ pr_debug("md/raid0:%s: current zone start: %llu\n",
+ mdname(mddev),
+ (unsigned long long)smallest->sectors);
+ }
+
+ /*
+ * now since we have the hard sector sizes, we can make sure
+ * chunk size is a multiple of that sector size
+ */
+ if ((mddev->chunk_sectors << 9) % queue_logical_block_size(mddev->queue)) {
+ printk(KERN_ERR "md/raid0:%s: chunk_size of %d not valid\n",
+ mdname(mddev),
+ mddev->chunk_sectors << 9);
+ goto abort;
+ }
+
+ if (mddev->queue) {
+ blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9);
+ blk_queue_io_opt(mddev->queue,
+ (mddev->chunk_sectors << 9) * mddev->raid_disks);
+
+ if (!discard_supported)
+ queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
+ else
+ queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
+ }
+
+ pr_debug("md/raid0:%s: done.\n", mdname(mddev));
+ *private_conf = conf;
+
+ return 0;
+abort:
+ kfree(conf->strip_zone);
+ kfree(conf->devlist);
+ kfree(conf);
+ *private_conf = ERR_PTR(err);
+ return err;
+}
+
+/* Find the zone which holds a particular offset
+ * Update *sectorp to be an offset in that zone
+ */
+static struct strip_zone *find_zone(struct r0conf *conf,
+ sector_t *sectorp)
+{
+ int i;
+ struct strip_zone *z = conf->strip_zone;
+ sector_t sector = *sectorp;
+
+ for (i = 0; i < conf->nr_strip_zones; i++)
+ if (sector < z[i].zone_end) {
+ if (i)
+ *sectorp = sector - z[i-1].zone_end;
+ return z + i;
+ }
+ BUG();
+}
+
+/*
+ * remaps the bio to the target device. we separate two flows.
+ * power 2 flow and a general flow for the sake of performance
+*/
+static struct md_rdev *map_sector(struct mddev *mddev, struct strip_zone *zone,
+ sector_t sector, sector_t *sector_offset)
+{
+ unsigned int sect_in_chunk;
+ sector_t chunk;
+ struct r0conf *conf = mddev->private;
+ int raid_disks = conf->strip_zone[0].nb_dev;
+ unsigned int chunk_sects = mddev->chunk_sectors;
+
+ if (is_power_of_2(chunk_sects)) {
+ int chunksect_bits = ffz(~chunk_sects);
+ /* find the sector offset inside the chunk */
+ sect_in_chunk = sector & (chunk_sects - 1);
+ sector >>= chunksect_bits;
+ /* chunk in zone */
+ chunk = *sector_offset;
+ /* quotient is the chunk in real device*/
+ sector_div(chunk, zone->nb_dev << chunksect_bits);
+ } else{
+ sect_in_chunk = sector_div(sector, chunk_sects);
+ chunk = *sector_offset;
+ sector_div(chunk, chunk_sects * zone->nb_dev);
+ }
+ /*
+ * position the bio over the real device
+ * real sector = chunk in device + starting of zone
+ * + the position in the chunk
+ */
+ *sector_offset = (chunk * chunk_sects) + sect_in_chunk;
+ return conf->devlist[(zone - conf->strip_zone)*raid_disks
+ + sector_div(sector, zone->nb_dev)];
+}
+
+/**
+ * raid0_mergeable_bvec -- tell bio layer if two requests can be merged
+ * @mddev: the md device
+ * @bvm: properties of new bio
+ * @biovec: the request that could be merged to it.
+ *
+ * Return amount of bytes we can accept at this offset
+ */
+static int raid0_mergeable_bvec(struct mddev *mddev,
+ struct bvec_merge_data *bvm,
+ struct bio_vec *biovec)
+{
+ struct r0conf *conf = mddev->private;
+ sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
+ sector_t sector_offset = sector;
+ int max;
+ unsigned int chunk_sectors = mddev->chunk_sectors;
+ unsigned int bio_sectors = bvm->bi_size >> 9;
+ struct strip_zone *zone;
+ struct md_rdev *rdev;
+ struct request_queue *subq;
+
+ if (is_power_of_2(chunk_sectors))
+ max = (chunk_sectors - ((sector & (chunk_sectors-1))
+ + bio_sectors)) << 9;
+ else
+ max = (chunk_sectors - (sector_div(sector, chunk_sectors)
+ + bio_sectors)) << 9;
+ if (max < 0)
+ max = 0; /* bio_add cannot handle a negative return */
+ if (max <= biovec->bv_len && bio_sectors == 0)
+ return biovec->bv_len;
+ if (max < biovec->bv_len)
+ /* too small already, no need to check further */
+ return max;
+ if (!conf->has_merge_bvec)
+ return max;
+
+ /* May need to check subordinate device */
+ sector = sector_offset;
+ zone = find_zone(mddev->private, &sector_offset);
+ rdev = map_sector(mddev, zone, sector, &sector_offset);
+ subq = bdev_get_queue(rdev->bdev);
+ if (subq->merge_bvec_fn) {
+ bvm->bi_bdev = rdev->bdev;
+ bvm->bi_sector = sector_offset + zone->dev_start +
+ rdev->data_offset;
+ return min(max, subq->merge_bvec_fn(subq, bvm, biovec));
+ } else
+ return max;
+}
+
+static sector_t raid0_size(struct mddev *mddev, sector_t sectors, int raid_disks)
+{
+ sector_t array_sectors = 0;
+ struct md_rdev *rdev;
+
+ WARN_ONCE(sectors || raid_disks,
+ "%s does not support generic reshape\n", __func__);
+
+ rdev_for_each(rdev, mddev)
+ array_sectors += (rdev->sectors &
+ ~(sector_t)(mddev->chunk_sectors-1));
+
+ return array_sectors;
+}
+
+static void raid0_free(struct mddev *mddev, void *priv);
+
+static int raid0_run(struct mddev *mddev)
+{
+ struct r0conf *conf;
+ int ret;
+
+ if (mddev->chunk_sectors == 0) {
+ printk(KERN_ERR "md/raid0:%s: chunk size must be set.\n",
+ mdname(mddev));
+ return -EINVAL;
+ }
+ if (md_check_no_bitmap(mddev))
+ return -EINVAL;
+
+ if (mddev->queue) {
+ blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors);
+ blk_queue_max_write_same_sectors(mddev->queue, mddev->chunk_sectors);
+ blk_queue_max_discard_sectors(mddev->queue, mddev->chunk_sectors);
+ }
+
+ /* if private is not null, we are here after takeover */
+ if (mddev->private == NULL) {
+ ret = create_strip_zones(mddev, &conf);
+ if (ret < 0)
+ return ret;
+ mddev->private = conf;
+ }
+ conf = mddev->private;
+
+ /* calculate array device size */
+ md_set_array_sectors(mddev, raid0_size(mddev, 0, 0));
+
+ printk(KERN_INFO "md/raid0:%s: md_size is %llu sectors.\n",
+ mdname(mddev),
+ (unsigned long long)mddev->array_sectors);
+
+ if (mddev->queue) {
+ /* calculate the max read-ahead size.
+ * For read-ahead of large files to be effective, we need to
+ * readahead at least twice a whole stripe. i.e. number of devices
+ * multiplied by chunk size times 2.
+ * If an individual device has an ra_pages greater than the
+ * chunk size, then we will not drive that device as hard as it
+ * wants. We consider this a configuration error: a larger
+ * chunksize should be used in that case.
+ */
+ int stripe = mddev->raid_disks *
+ (mddev->chunk_sectors << 9) / PAGE_SIZE;
+ if (mddev->queue->backing_dev_info.ra_pages < 2* stripe)
+ mddev->queue->backing_dev_info.ra_pages = 2* stripe;
+ }
+
+ dump_zones(mddev);
+
+ ret = md_integrity_register(mddev);
+
+ return ret;
+}
+
+static void raid0_free(struct mddev *mddev, void *priv)
+{
+ struct r0conf *conf = priv;
+
+ kfree(conf->strip_zone);
+ kfree(conf->devlist);
+ kfree(conf);
+}
+
+/*
+ * Is io distribute over 1 or more chunks ?
+*/
+static inline int is_io_in_chunk_boundary(struct mddev *mddev,
+ unsigned int chunk_sects, struct bio *bio)
+{
+ if (likely(is_power_of_2(chunk_sects))) {
+ return chunk_sects >=
+ ((bio->bi_iter.bi_sector & (chunk_sects-1))
+ + bio_sectors(bio));
+ } else{
+ sector_t sector = bio->bi_iter.bi_sector;
+ return chunk_sects >= (sector_div(sector, chunk_sects)
+ + bio_sectors(bio));
+ }
+}
+
+static void raid0_make_request(struct mddev *mddev, struct bio *bio)
+{
+ struct strip_zone *zone;
+ struct md_rdev *tmp_dev;
+ struct bio *split;
+
+ if (unlikely(bio->bi_rw & REQ_FLUSH)) {
+ md_flush_request(mddev, bio);
+ return;
+ }
+
+ do {
+ sector_t sector = bio->bi_iter.bi_sector;
+ unsigned chunk_sects = mddev->chunk_sectors;
+
+ unsigned sectors = chunk_sects -
+ (likely(is_power_of_2(chunk_sects))
+ ? (sector & (chunk_sects-1))
+ : sector_div(sector, chunk_sects));
+
+ /* Restore due to sector_div */
+ sector = bio->bi_iter.bi_sector;
+
+ if (sectors < bio_sectors(bio)) {
+ split = bio_split(bio, sectors, GFP_NOIO, fs_bio_set);
+ bio_chain(split, bio);
+ } else {
+ split = bio;
+ }
+
+ zone = find_zone(mddev->private, &sector);
+ tmp_dev = map_sector(mddev, zone, sector, &sector);
+ split->bi_bdev = tmp_dev->bdev;
+ split->bi_iter.bi_sector = sector + zone->dev_start +
+ tmp_dev->data_offset;
+
+ if (unlikely((split->bi_rw & REQ_DISCARD) &&
+ !blk_queue_discard(bdev_get_queue(split->bi_bdev)))) {
+ /* Just ignore it */
+ bio_endio(split, 0);
+ } else
+ generic_make_request(split);
+ } while (split != bio);
+}
+
+static void raid0_status(struct seq_file *seq, struct mddev *mddev)
+{
+ seq_printf(seq, " %dk chunks", mddev->chunk_sectors / 2);
+ return;
+}
+
+static void *raid0_takeover_raid45(struct mddev *mddev)
+{
+ struct md_rdev *rdev;
+ struct r0conf *priv_conf;
+
+ if (mddev->degraded != 1) {
+ printk(KERN_ERR "md/raid0:%s: raid5 must be degraded! Degraded disks: %d\n",
+ mdname(mddev),
+ mddev->degraded);
+ return ERR_PTR(-EINVAL);
+ }
+
+ rdev_for_each(rdev, mddev) {
+ /* check slot number for a disk */
+ if (rdev->raid_disk == mddev->raid_disks-1) {
+ printk(KERN_ERR "md/raid0:%s: raid5 must have missing parity disk!\n",
+ mdname(mddev));
+ return ERR_PTR(-EINVAL);
+ }
+ rdev->sectors = mddev->dev_sectors;
+ }
+
+ /* Set new parameters */
+ mddev->new_level = 0;
+ mddev->new_layout = 0;
+ mddev->new_chunk_sectors = mddev->chunk_sectors;
+ mddev->raid_disks--;
+ mddev->delta_disks = -1;
+ /* make sure it will be not marked as dirty */
+ mddev->recovery_cp = MaxSector;
+
+ create_strip_zones(mddev, &priv_conf);
+ return priv_conf;
+}
+
+static void *raid0_takeover_raid10(struct mddev *mddev)
+{
+ struct r0conf *priv_conf;
+
+ /* Check layout:
+ * - far_copies must be 1
+ * - near_copies must be 2
+ * - disks number must be even
+ * - all mirrors must be already degraded
+ */
+ if (mddev->layout != ((1 << 8) + 2)) {
+ printk(KERN_ERR "md/raid0:%s:: Raid0 cannot takover layout: 0x%x\n",
+ mdname(mddev),
+ mddev->layout);
+ return ERR_PTR(-EINVAL);
+ }
+ if (mddev->raid_disks & 1) {
+ printk(KERN_ERR "md/raid0:%s: Raid0 cannot takover Raid10 with odd disk number.\n",
+ mdname(mddev));
+ return ERR_PTR(-EINVAL);
+ }
+ if (mddev->degraded != (mddev->raid_disks>>1)) {
+ printk(KERN_ERR "md/raid0:%s: All mirrors must be already degraded!\n",
+ mdname(mddev));
+ return ERR_PTR(-EINVAL);
+ }
+
+ /* Set new parameters */
+ mddev->new_level = 0;
+ mddev->new_layout = 0;
+ mddev->new_chunk_sectors = mddev->chunk_sectors;
+ mddev->delta_disks = - mddev->raid_disks / 2;
+ mddev->raid_disks += mddev->delta_disks;
+ mddev->degraded = 0;
+ /* make sure it will be not marked as dirty */
+ mddev->recovery_cp = MaxSector;
+
+ create_strip_zones(mddev, &priv_conf);
+ return priv_conf;
+}
+
+static void *raid0_takeover_raid1(struct mddev *mddev)
+{
+ struct r0conf *priv_conf;
+ int chunksect;
+
+ /* Check layout:
+ * - (N - 1) mirror drives must be already faulty
+ */
+ if ((mddev->raid_disks - 1) != mddev->degraded) {
+ printk(KERN_ERR "md/raid0:%s: (N - 1) mirrors drives must be already faulty!\n",
+ mdname(mddev));
+ return ERR_PTR(-EINVAL);
+ }
+
+ /*
+ * a raid1 doesn't have the notion of chunk size, so
+ * figure out the largest suitable size we can use.
+ */
+ chunksect = 64 * 2; /* 64K by default */
+
+ /* The array must be an exact multiple of chunksize */
+ while (chunksect && (mddev->array_sectors & (chunksect - 1)))
+ chunksect >>= 1;
+
+ if ((chunksect << 9) < PAGE_SIZE)
+ /* array size does not allow a suitable chunk size */
+ return ERR_PTR(-EINVAL);
+
+ /* Set new parameters */
+ mddev->new_level = 0;
+ mddev->new_layout = 0;
+ mddev->new_chunk_sectors = chunksect;
+ mddev->chunk_sectors = chunksect;
+ mddev->delta_disks = 1 - mddev->raid_disks;
+ mddev->raid_disks = 1;
+ /* make sure it will be not marked as dirty */
+ mddev->recovery_cp = MaxSector;
+
+ create_strip_zones(mddev, &priv_conf);
+ return priv_conf;
+}
+
+static void *raid0_takeover(struct mddev *mddev)
+{
+ /* raid0 can take over:
+ * raid4 - if all data disks are active.
+ * raid5 - providing it is Raid4 layout and one disk is faulty
+ * raid10 - assuming we have all necessary active disks
+ * raid1 - with (N -1) mirror drives faulty
+ */
+
+ if (mddev->bitmap) {
+ printk(KERN_ERR "md/raid0: %s: cannot takeover array with bitmap\n",
+ mdname(mddev));
+ return ERR_PTR(-EBUSY);
+ }
+ if (mddev->level == 4)
+ return raid0_takeover_raid45(mddev);
+
+ if (mddev->level == 5) {
+ if (mddev->layout == ALGORITHM_PARITY_N)
+ return raid0_takeover_raid45(mddev);
+
+ printk(KERN_ERR "md/raid0:%s: Raid can only takeover Raid5 with layout: %d\n",
+ mdname(mddev), ALGORITHM_PARITY_N);
+ }
+
+ if (mddev->level == 10)
+ return raid0_takeover_raid10(mddev);
+
+ if (mddev->level == 1)
+ return raid0_takeover_raid1(mddev);
+
+ printk(KERN_ERR "Takeover from raid%i to raid0 not supported\n",
+ mddev->level);
+
+ return ERR_PTR(-EINVAL);
+}
+
+static void raid0_quiesce(struct mddev *mddev, int state)
+{
+}
+
+static struct md_personality raid0_personality=
+{
+ .name = "raid0",
+ .level = 0,
+ .owner = THIS_MODULE,
+ .make_request = raid0_make_request,
+ .run = raid0_run,
+ .free = raid0_free,
+ .status = raid0_status,
+ .size = raid0_size,
+ .takeover = raid0_takeover,
+ .quiesce = raid0_quiesce,
+ .congested = raid0_congested,
+ .mergeable_bvec = raid0_mergeable_bvec,
+};
+
+static int __init raid0_init (void)
+{
+ return register_md_personality (&raid0_personality);
+}
+
+static void raid0_exit (void)
+{
+ unregister_md_personality (&raid0_personality);
+}
+
+module_init(raid0_init);
+module_exit(raid0_exit);
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("RAID0 (striping) personality for MD");
+MODULE_ALIAS("md-personality-2"); /* RAID0 */
+MODULE_ALIAS("md-raid0");
+MODULE_ALIAS("md-level-0");
diff --git a/drivers/md/raid0.h b/drivers/md/raid0.h
new file mode 100644
index 000000000..05539d9c9
--- /dev/null
+++ b/drivers/md/raid0.h
@@ -0,0 +1,19 @@
+#ifndef _RAID0_H
+#define _RAID0_H
+
+struct strip_zone {
+ sector_t zone_end; /* Start of the next zone (in sectors) */
+ sector_t dev_start; /* Zone offset in real dev (in sectors) */
+ int nb_dev; /* # of devices attached to the zone */
+};
+
+struct r0conf {
+ struct strip_zone *strip_zone;
+ struct md_rdev **devlist; /* lists of rdevs, pointed to
+ * by strip_zone->dev */
+ int nr_strip_zones;
+ int has_merge_bvec; /* at least one member has
+ * a merge_bvec_fn */
+};
+
+#endif
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
new file mode 100644
index 000000000..9157a29c8
--- /dev/null
+++ b/drivers/md/raid1.c
@@ -0,0 +1,3197 @@
+/*
+ * raid1.c : Multiple Devices driver for Linux
+ *
+ * Copyright (C) 1999, 2000, 2001 Ingo Molnar, Red Hat
+ *
+ * Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman
+ *
+ * RAID-1 management functions.
+ *
+ * Better read-balancing code written by Mika Kuoppala <miku@iki.fi>, 2000
+ *
+ * Fixes to reconstruction by Jakob Ă˜stergaard" <jakob@ostenfeld.dk>
+ * Various fixes by Neil Brown <neilb@cse.unsw.edu.au>
+ *
+ * Changes by Peter T. Breuer <ptb@it.uc3m.es> 31/1/2003 to support
+ * bitmapped intelligence in resync:
+ *
+ * - bitmap marked during normal i/o
+ * - bitmap used to skip nondirty blocks during sync
+ *
+ * Additions to bitmap code, (C) 2003-2004 Paul Clements, SteelEye Technology:
+ * - persistent bitmap code
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * You should have received a copy of the GNU General Public License
+ * (for example /usr/src/linux/COPYING); if not, write to the Free
+ * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/slab.h>
+#include <linux/delay.h>
+#include <linux/blkdev.h>
+#include <linux/module.h>
+#include <linux/seq_file.h>
+#include <linux/ratelimit.h>
+#include "md.h"
+#include "raid1.h"
+#include "bitmap.h"
+
+/*
+ * Number of guaranteed r1bios in case of extreme VM load:
+ */
+#define NR_RAID1_BIOS 256
+
+/* when we get a read error on a read-only array, we redirect to another
+ * device without failing the first device, or trying to over-write to
+ * correct the read error. To keep track of bad blocks on a per-bio
+ * level, we store IO_BLOCKED in the appropriate 'bios' pointer
+ */
+#define IO_BLOCKED ((struct bio *)1)
+/* When we successfully write to a known bad-block, we need to remove the
+ * bad-block marking which must be done from process context. So we record
+ * the success by setting devs[n].bio to IO_MADE_GOOD
+ */
+#define IO_MADE_GOOD ((struct bio *)2)
+
+#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2)
+
+/* When there are this many requests queue to be written by
+ * the raid1 thread, we become 'congested' to provide back-pressure
+ * for writeback.
+ */
+static int max_queued_requests = 1024;
+
+static void allow_barrier(struct r1conf *conf, sector_t start_next_window,
+ sector_t bi_sector);
+static void lower_barrier(struct r1conf *conf);
+
+static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data)
+{
+ struct pool_info *pi = data;
+ int size = offsetof(struct r1bio, bios[pi->raid_disks]);
+
+ /* allocate a r1bio with room for raid_disks entries in the bios array */
+ return kzalloc(size, gfp_flags);
+}
+
+static void r1bio_pool_free(void *r1_bio, void *data)
+{
+ kfree(r1_bio);
+}
+
+#define RESYNC_BLOCK_SIZE (64*1024)
+#define RESYNC_DEPTH 32
+#define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9)
+#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)
+#define RESYNC_WINDOW (RESYNC_BLOCK_SIZE * RESYNC_DEPTH)
+#define RESYNC_WINDOW_SECTORS (RESYNC_WINDOW >> 9)
+#define NEXT_NORMALIO_DISTANCE (3 * RESYNC_WINDOW_SECTORS)
+
+static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
+{
+ struct pool_info *pi = data;
+ struct r1bio *r1_bio;
+ struct bio *bio;
+ int need_pages;
+ int i, j;
+
+ r1_bio = r1bio_pool_alloc(gfp_flags, pi);
+ if (!r1_bio)
+ return NULL;
+
+ /*
+ * Allocate bios : 1 for reading, n-1 for writing
+ */
+ for (j = pi->raid_disks ; j-- ; ) {
+ bio = bio_kmalloc(gfp_flags, RESYNC_PAGES);
+ if (!bio)
+ goto out_free_bio;
+ r1_bio->bios[j] = bio;
+ }
+ /*
+ * Allocate RESYNC_PAGES data pages and attach them to
+ * the first bio.
+ * If this is a user-requested check/repair, allocate
+ * RESYNC_PAGES for each bio.
+ */
+ if (test_bit(MD_RECOVERY_REQUESTED, &pi->mddev->recovery))
+ need_pages = pi->raid_disks;
+ else
+ need_pages = 1;
+ for (j = 0; j < need_pages; j++) {
+ bio = r1_bio->bios[j];
+ bio->bi_vcnt = RESYNC_PAGES;
+
+ if (bio_alloc_pages(bio, gfp_flags))
+ goto out_free_pages;
+ }
+ /* If not user-requests, copy the page pointers to all bios */
+ if (!test_bit(MD_RECOVERY_REQUESTED, &pi->mddev->recovery)) {
+ for (i=0; i<RESYNC_PAGES ; i++)
+ for (j=1; j<pi->raid_disks; j++)
+ r1_bio->bios[j]->bi_io_vec[i].bv_page =
+ r1_bio->bios[0]->bi_io_vec[i].bv_page;
+ }
+
+ r1_bio->master_bio = NULL;
+
+ return r1_bio;
+
+out_free_pages:
+ while (--j >= 0) {
+ struct bio_vec *bv;
+
+ bio_for_each_segment_all(bv, r1_bio->bios[j], i)
+ __free_page(bv->bv_page);
+ }
+
+out_free_bio:
+ while (++j < pi->raid_disks)
+ bio_put(r1_bio->bios[j]);
+ r1bio_pool_free(r1_bio, data);
+ return NULL;
+}
+
+static void r1buf_pool_free(void *__r1_bio, void *data)
+{
+ struct pool_info *pi = data;
+ int i,j;
+ struct r1bio *r1bio = __r1_bio;
+
+ for (i = 0; i < RESYNC_PAGES; i++)
+ for (j = pi->raid_disks; j-- ;) {
+ if (j == 0 ||
+ r1bio->bios[j]->bi_io_vec[i].bv_page !=
+ r1bio->bios[0]->bi_io_vec[i].bv_page)
+ safe_put_page(r1bio->bios[j]->bi_io_vec[i].bv_page);
+ }
+ for (i=0 ; i < pi->raid_disks; i++)
+ bio_put(r1bio->bios[i]);
+
+ r1bio_pool_free(r1bio, data);
+}
+
+static void put_all_bios(struct r1conf *conf, struct r1bio *r1_bio)
+{
+ int i;
+
+ for (i = 0; i < conf->raid_disks * 2; i++) {
+ struct bio **bio = r1_bio->bios + i;
+ if (!BIO_SPECIAL(*bio))
+ bio_put(*bio);
+ *bio = NULL;
+ }
+}
+
+static void free_r1bio(struct r1bio *r1_bio)
+{
+ struct r1conf *conf = r1_bio->mddev->private;
+
+ put_all_bios(conf, r1_bio);
+ mempool_free(r1_bio, conf->r1bio_pool);
+}
+
+static void put_buf(struct r1bio *r1_bio)
+{
+ struct r1conf *conf = r1_bio->mddev->private;
+ int i;
+
+ for (i = 0; i < conf->raid_disks * 2; i++) {
+ struct bio *bio = r1_bio->bios[i];
+ if (bio->bi_end_io)
+ rdev_dec_pending(conf->mirrors[i].rdev, r1_bio->mddev);
+ }
+
+ mempool_free(r1_bio, conf->r1buf_pool);
+
+ lower_barrier(conf);
+}
+
+static void reschedule_retry(struct r1bio *r1_bio)
+{
+ unsigned long flags;
+ struct mddev *mddev = r1_bio->mddev;
+ struct r1conf *conf = mddev->private;
+
+ spin_lock_irqsave(&conf->device_lock, flags);
+ list_add(&r1_bio->retry_list, &conf->retry_list);
+ conf->nr_queued ++;
+ spin_unlock_irqrestore(&conf->device_lock, flags);
+
+ wake_up(&conf->wait_barrier);
+ md_wakeup_thread(mddev->thread);
+}
+
+/*
+ * raid_end_bio_io() is called when we have finished servicing a mirrored
+ * operation and are ready to return a success/failure code to the buffer
+ * cache layer.
+ */
+static void call_bio_endio(struct r1bio *r1_bio)
+{
+ struct bio *bio = r1_bio->master_bio;
+ int done;
+ struct r1conf *conf = r1_bio->mddev->private;
+ sector_t start_next_window = r1_bio->start_next_window;
+ sector_t bi_sector = bio->bi_iter.bi_sector;
+
+ if (bio->bi_phys_segments) {
+ unsigned long flags;
+ spin_lock_irqsave(&conf->device_lock, flags);
+ bio->bi_phys_segments--;
+ done = (bio->bi_phys_segments == 0);
+ spin_unlock_irqrestore(&conf->device_lock, flags);
+ /*
+ * make_request() might be waiting for
+ * bi_phys_segments to decrease
+ */
+ wake_up(&conf->wait_barrier);
+ } else
+ done = 1;
+
+ if (!test_bit(R1BIO_Uptodate, &r1_bio->state))
+ clear_bit(BIO_UPTODATE, &bio->bi_flags);
+ if (done) {
+ bio_endio(bio, 0);
+ /*
+ * Wake up any possible resync thread that waits for the device
+ * to go idle.
+ */
+ allow_barrier(conf, start_next_window, bi_sector);
+ }
+}
+
+static void raid_end_bio_io(struct r1bio *r1_bio)
+{
+ struct bio *bio = r1_bio->master_bio;
+
+ /* if nobody has done the final endio yet, do it now */
+ if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
+ pr_debug("raid1: sync end %s on sectors %llu-%llu\n",
+ (bio_data_dir(bio) == WRITE) ? "write" : "read",
+ (unsigned long long) bio->bi_iter.bi_sector,
+ (unsigned long long) bio_end_sector(bio) - 1);
+
+ call_bio_endio(r1_bio);
+ }
+ free_r1bio(r1_bio);
+}
+
+/*
+ * Update disk head position estimator based on IRQ completion info.
+ */
+static inline void update_head_pos(int disk, struct r1bio *r1_bio)
+{
+ struct r1conf *conf = r1_bio->mddev->private;
+
+ conf->mirrors[disk].head_position =
+ r1_bio->sector + (r1_bio->sectors);
+}
+
+/*
+ * Find the disk number which triggered given bio
+ */
+static int find_bio_disk(struct r1bio *r1_bio, struct bio *bio)
+{
+ int mirror;
+ struct r1conf *conf = r1_bio->mddev->private;
+ int raid_disks = conf->raid_disks;
+
+ for (mirror = 0; mirror < raid_disks * 2; mirror++)
+ if (r1_bio->bios[mirror] == bio)
+ break;
+
+ BUG_ON(mirror == raid_disks * 2);
+ update_head_pos(mirror, r1_bio);
+
+ return mirror;
+}
+
+static void raid1_end_read_request(struct bio *bio, int error)
+{
+ int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+ struct r1bio *r1_bio = bio->bi_private;
+ int mirror;
+ struct r1conf *conf = r1_bio->mddev->private;
+
+ mirror = r1_bio->read_disk;
+ /*
+ * this branch is our 'one mirror IO has finished' event handler:
+ */
+ update_head_pos(mirror, r1_bio);
+
+ if (uptodate)
+ set_bit(R1BIO_Uptodate, &r1_bio->state);
+ else {
+ /* If all other devices have failed, we want to return
+ * the error upwards rather than fail the last device.
+ * Here we redefine "uptodate" to mean "Don't want to retry"
+ */
+ unsigned long flags;
+ spin_lock_irqsave(&conf->device_lock, flags);
+ if (r1_bio->mddev->degraded == conf->raid_disks ||
+ (r1_bio->mddev->degraded == conf->raid_disks-1 &&
+ !test_bit(Faulty, &conf->mirrors[mirror].rdev->flags)))
+ uptodate = 1;
+ spin_unlock_irqrestore(&conf->device_lock, flags);
+ }
+
+ if (uptodate) {
+ raid_end_bio_io(r1_bio);
+ rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev);
+ } else {
+ /*
+ * oops, read error:
+ */
+ char b[BDEVNAME_SIZE];
+ printk_ratelimited(
+ KERN_ERR "md/raid1:%s: %s: "
+ "rescheduling sector %llu\n",
+ mdname(conf->mddev),
+ bdevname(conf->mirrors[mirror].rdev->bdev,
+ b),
+ (unsigned long long)r1_bio->sector);
+ set_bit(R1BIO_ReadError, &r1_bio->state);
+ reschedule_retry(r1_bio);
+ /* don't drop the reference on read_disk yet */
+ }
+}
+
+static void close_write(struct r1bio *r1_bio)
+{
+ /* it really is the end of this request */
+ if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
+ /* free extra copy of the data pages */
+ int i = r1_bio->behind_page_count;
+ while (i--)
+ safe_put_page(r1_bio->behind_bvecs[i].bv_page);
+ kfree(r1_bio->behind_bvecs);
+ r1_bio->behind_bvecs = NULL;
+ }
+ /* clear the bitmap if all writes complete successfully */
+ bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector,
+ r1_bio->sectors,
+ !test_bit(R1BIO_Degraded, &r1_bio->state),
+ test_bit(R1BIO_BehindIO, &r1_bio->state));
+ md_write_end(r1_bio->mddev);
+}
+
+static void r1_bio_write_done(struct r1bio *r1_bio)
+{
+ if (!atomic_dec_and_test(&r1_bio->remaining))
+ return;
+
+ if (test_bit(R1BIO_WriteError, &r1_bio->state))
+ reschedule_retry(r1_bio);
+ else {
+ close_write(r1_bio);
+ if (test_bit(R1BIO_MadeGood, &r1_bio->state))
+ reschedule_retry(r1_bio);
+ else
+ raid_end_bio_io(r1_bio);
+ }
+}
+
+static void raid1_end_write_request(struct bio *bio, int error)
+{
+ int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+ struct r1bio *r1_bio = bio->bi_private;
+ int mirror, behind = test_bit(R1BIO_BehindIO, &r1_bio->state);
+ struct r1conf *conf = r1_bio->mddev->private;
+ struct bio *to_put = NULL;
+
+ mirror = find_bio_disk(r1_bio, bio);
+
+ /*
+ * 'one mirror IO has finished' event handler:
+ */
+ if (!uptodate) {
+ set_bit(WriteErrorSeen,
+ &conf->mirrors[mirror].rdev->flags);
+ if (!test_and_set_bit(WantReplacement,
+ &conf->mirrors[mirror].rdev->flags))
+ set_bit(MD_RECOVERY_NEEDED, &
+ conf->mddev->recovery);
+
+ set_bit(R1BIO_WriteError, &r1_bio->state);
+ } else {
+ /*
+ * Set R1BIO_Uptodate in our master bio, so that we
+ * will return a good error code for to the higher
+ * levels even if IO on some other mirrored buffer
+ * fails.
+ *
+ * The 'master' represents the composite IO operation
+ * to user-side. So if something waits for IO, then it
+ * will wait for the 'master' bio.
+ */
+ sector_t first_bad;
+ int bad_sectors;
+
+ r1_bio->bios[mirror] = NULL;
+ to_put = bio;
+ /*
+ * Do not set R1BIO_Uptodate if the current device is
+ * rebuilding or Faulty. This is because we cannot use
+ * such device for properly reading the data back (we could
+ * potentially use it, if the current write would have felt
+ * before rdev->recovery_offset, but for simplicity we don't
+ * check this here.
+ */
+ if (test_bit(In_sync, &conf->mirrors[mirror].rdev->flags) &&
+ !test_bit(Faulty, &conf->mirrors[mirror].rdev->flags))
+ set_bit(R1BIO_Uptodate, &r1_bio->state);
+
+ /* Maybe we can clear some bad blocks. */
+ if (is_badblock(conf->mirrors[mirror].rdev,
+ r1_bio->sector, r1_bio->sectors,
+ &first_bad, &bad_sectors)) {
+ r1_bio->bios[mirror] = IO_MADE_GOOD;
+ set_bit(R1BIO_MadeGood, &r1_bio->state);
+ }
+ }
+
+ if (behind) {
+ if (test_bit(WriteMostly, &conf->mirrors[mirror].rdev->flags))
+ atomic_dec(&r1_bio->behind_remaining);
+
+ /*
+ * In behind mode, we ACK the master bio once the I/O
+ * has safely reached all non-writemostly
+ * disks. Setting the Returned bit ensures that this
+ * gets done only once -- we don't ever want to return
+ * -EIO here, instead we'll wait
+ */
+ if (atomic_read(&r1_bio->behind_remaining) >= (atomic_read(&r1_bio->remaining)-1) &&
+ test_bit(R1BIO_Uptodate, &r1_bio->state)) {
+ /* Maybe we can return now */
+ if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
+ struct bio *mbio = r1_bio->master_bio;
+ pr_debug("raid1: behind end write sectors"
+ " %llu-%llu\n",
+ (unsigned long long) mbio->bi_iter.bi_sector,
+ (unsigned long long) bio_end_sector(mbio) - 1);
+ call_bio_endio(r1_bio);
+ }
+ }
+ }
+ if (r1_bio->bios[mirror] == NULL)
+ rdev_dec_pending(conf->mirrors[mirror].rdev,
+ conf->mddev);
+
+ /*
+ * Let's see if all mirrored write operations have finished
+ * already.
+ */
+ r1_bio_write_done(r1_bio);
+
+ if (to_put)
+ bio_put(to_put);
+}
+
+/*
+ * This routine returns the disk from which the requested read should
+ * be done. There is a per-array 'next expected sequential IO' sector
+ * number - if this matches on the next IO then we use the last disk.
+ * There is also a per-disk 'last know head position' sector that is
+ * maintained from IRQ contexts, both the normal and the resync IO
+ * completion handlers update this position correctly. If there is no
+ * perfect sequential match then we pick the disk whose head is closest.
+ *
+ * If there are 2 mirrors in the same 2 devices, performance degrades
+ * because position is mirror, not device based.
+ *
+ * The rdev for the device selected will have nr_pending incremented.
+ */
+static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sectors)
+{
+ const sector_t this_sector = r1_bio->sector;
+ int sectors;
+ int best_good_sectors;
+ int best_disk, best_dist_disk, best_pending_disk;
+ int has_nonrot_disk;
+ int disk;
+ sector_t best_dist;
+ unsigned int min_pending;
+ struct md_rdev *rdev;
+ int choose_first;
+ int choose_next_idle;
+
+ rcu_read_lock();
+ /*
+ * Check if we can balance. We can balance on the whole
+ * device if no resync is going on, or below the resync window.
+ * We take the first readable disk when above the resync window.
+ */
+ retry:
+ sectors = r1_bio->sectors;
+ best_disk = -1;
+ best_dist_disk = -1;
+ best_dist = MaxSector;
+ best_pending_disk = -1;
+ min_pending = UINT_MAX;
+ best_good_sectors = 0;
+ has_nonrot_disk = 0;
+ choose_next_idle = 0;
+
+ if ((conf->mddev->recovery_cp < this_sector + sectors) ||
+ (mddev_is_clustered(conf->mddev) &&
+ md_cluster_ops->area_resyncing(conf->mddev, this_sector,
+ this_sector + sectors)))
+ choose_first = 1;
+ else
+ choose_first = 0;
+
+ for (disk = 0 ; disk < conf->raid_disks * 2 ; disk++) {
+ sector_t dist;
+ sector_t first_bad;
+ int bad_sectors;
+ unsigned int pending;
+ bool nonrot;
+
+ rdev = rcu_dereference(conf->mirrors[disk].rdev);
+ if (r1_bio->bios[disk] == IO_BLOCKED
+ || rdev == NULL
+ || test_bit(Unmerged, &rdev->flags)
+ || test_bit(Faulty, &rdev->flags))
+ continue;
+ if (!test_bit(In_sync, &rdev->flags) &&
+ rdev->recovery_offset < this_sector + sectors)
+ continue;
+ if (test_bit(WriteMostly, &rdev->flags)) {
+ /* Don't balance among write-mostly, just
+ * use the first as a last resort */
+ if (best_dist_disk < 0) {
+ if (is_badblock(rdev, this_sector, sectors,
+ &first_bad, &bad_sectors)) {
+ if (first_bad < this_sector)
+ /* Cannot use this */
+ continue;
+ best_good_sectors = first_bad - this_sector;
+ } else
+ best_good_sectors = sectors;
+ best_dist_disk = disk;
+ best_pending_disk = disk;
+ }
+ continue;
+ }
+ /* This is a reasonable device to use. It might
+ * even be best.
+ */
+ if (is_badblock(rdev, this_sector, sectors,
+ &first_bad, &bad_sectors)) {
+ if (best_dist < MaxSector)
+ /* already have a better device */
+ continue;
+ if (first_bad <= this_sector) {
+ /* cannot read here. If this is the 'primary'
+ * device, then we must not read beyond
+ * bad_sectors from another device..
+ */
+ bad_sectors -= (this_sector - first_bad);
+ if (choose_first && sectors > bad_sectors)
+ sectors = bad_sectors;
+ if (best_good_sectors > sectors)
+ best_good_sectors = sectors;
+
+ } else {
+ sector_t good_sectors = first_bad - this_sector;
+ if (good_sectors > best_good_sectors) {
+ best_good_sectors = good_sectors;
+ best_disk = disk;
+ }
+ if (choose_first)
+ break;
+ }
+ continue;
+ } else
+ best_good_sectors = sectors;
+
+ nonrot = blk_queue_nonrot(bdev_get_queue(rdev->bdev));
+ has_nonrot_disk |= nonrot;
+ pending = atomic_read(&rdev->nr_pending);
+ dist = abs(this_sector - conf->mirrors[disk].head_position);
+ if (choose_first) {
+ best_disk = disk;
+ break;
+ }
+ /* Don't change to another disk for sequential reads */
+ if (conf->mirrors[disk].next_seq_sect == this_sector
+ || dist == 0) {
+ int opt_iosize = bdev_io_opt(rdev->bdev) >> 9;
+ struct raid1_info *mirror = &conf->mirrors[disk];
+
+ best_disk = disk;
+ /*
+ * If buffered sequential IO size exceeds optimal
+ * iosize, check if there is idle disk. If yes, choose
+ * the idle disk. read_balance could already choose an
+ * idle disk before noticing it's a sequential IO in
+ * this disk. This doesn't matter because this disk
+ * will idle, next time it will be utilized after the
+ * first disk has IO size exceeds optimal iosize. In
+ * this way, iosize of the first disk will be optimal
+ * iosize at least. iosize of the second disk might be
+ * small, but not a big deal since when the second disk
+ * starts IO, the first disk is likely still busy.
+ */
+ if (nonrot && opt_iosize > 0 &&
+ mirror->seq_start != MaxSector &&
+ mirror->next_seq_sect > opt_iosize &&
+ mirror->next_seq_sect - opt_iosize >=
+ mirror->seq_start) {
+ choose_next_idle = 1;
+ continue;
+ }
+ break;
+ }
+ /* If device is idle, use it */
+ if (pending == 0) {
+ best_disk = disk;
+ break;
+ }
+
+ if (choose_next_idle)
+ continue;
+
+ if (min_pending > pending) {
+ min_pending = pending;
+ best_pending_disk = disk;
+ }
+
+ if (dist < best_dist) {
+ best_dist = dist;
+ best_dist_disk = disk;
+ }
+ }
+
+ /*
+ * If all disks are rotational, choose the closest disk. If any disk is
+ * non-rotational, choose the disk with less pending request even the
+ * disk is rotational, which might/might not be optimal for raids with
+ * mixed ratation/non-rotational disks depending on workload.
+ */
+ if (best_disk == -1) {
+ if (has_nonrot_disk)
+ best_disk = best_pending_disk;
+ else
+ best_disk = best_dist_disk;
+ }
+
+ if (best_disk >= 0) {
+ rdev = rcu_dereference(conf->mirrors[best_disk].rdev);
+ if (!rdev)
+ goto retry;
+ atomic_inc(&rdev->nr_pending);
+ if (test_bit(Faulty, &rdev->flags)) {
+ /* cannot risk returning a device that failed
+ * before we inc'ed nr_pending
+ */
+ rdev_dec_pending(rdev, conf->mddev);
+ goto retry;
+ }
+ sectors = best_good_sectors;
+
+ if (conf->mirrors[best_disk].next_seq_sect != this_sector)
+ conf->mirrors[best_disk].seq_start = this_sector;
+
+ conf->mirrors[best_disk].next_seq_sect = this_sector + sectors;
+ }
+ rcu_read_unlock();
+ *max_sectors = sectors;
+
+ return best_disk;
+}
+
+static int raid1_mergeable_bvec(struct mddev *mddev,
+ struct bvec_merge_data *bvm,
+ struct bio_vec *biovec)
+{
+ struct r1conf *conf = mddev->private;
+ sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
+ int max = biovec->bv_len;
+
+ if (mddev->merge_check_needed) {
+ int disk;
+ rcu_read_lock();
+ for (disk = 0; disk < conf->raid_disks * 2; disk++) {
+ struct md_rdev *rdev = rcu_dereference(
+ conf->mirrors[disk].rdev);
+ if (rdev && !test_bit(Faulty, &rdev->flags)) {
+ struct request_queue *q =
+ bdev_get_queue(rdev->bdev);
+ if (q->merge_bvec_fn) {
+ bvm->bi_sector = sector +
+ rdev->data_offset;
+ bvm->bi_bdev = rdev->bdev;
+ max = min(max, q->merge_bvec_fn(
+ q, bvm, biovec));
+ }
+ }
+ }
+ rcu_read_unlock();
+ }
+ return max;
+
+}
+
+static int raid1_congested(struct mddev *mddev, int bits)
+{
+ struct r1conf *conf = mddev->private;
+ int i, ret = 0;
+
+ if ((bits & (1 << BDI_async_congested)) &&
+ conf->pending_count >= max_queued_requests)
+ return 1;
+
+ rcu_read_lock();
+ for (i = 0; i < conf->raid_disks * 2; i++) {
+ struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
+ if (rdev && !test_bit(Faulty, &rdev->flags)) {
+ struct request_queue *q = bdev_get_queue(rdev->bdev);
+
+ BUG_ON(!q);
+
+ /* Note the '|| 1' - when read_balance prefers
+ * non-congested targets, it can be removed
+ */
+ if ((bits & (1<<BDI_async_congested)) || 1)
+ ret |= bdi_congested(&q->backing_dev_info, bits);
+ else
+ ret &= bdi_congested(&q->backing_dev_info, bits);
+ }
+ }
+ rcu_read_unlock();
+ return ret;
+}
+
+static void flush_pending_writes(struct r1conf *conf)
+{
+ /* Any writes that have been queued but are awaiting
+ * bitmap updates get flushed here.
+ */
+ spin_lock_irq(&conf->device_lock);
+
+ if (conf->pending_bio_list.head) {
+ struct bio *bio;
+ bio = bio_list_get(&conf->pending_bio_list);
+ conf->pending_count = 0;
+ spin_unlock_irq(&conf->device_lock);
+ /* flush any pending bitmap writes to
+ * disk before proceeding w/ I/O */
+ bitmap_unplug(conf->mddev->bitmap);
+ wake_up(&conf->wait_barrier);
+
+ while (bio) { /* submit pending writes */
+ struct bio *next = bio->bi_next;
+ bio->bi_next = NULL;
+ if (unlikely((bio->bi_rw & REQ_DISCARD) &&
+ !blk_queue_discard(bdev_get_queue(bio->bi_bdev))))
+ /* Just ignore it */
+ bio_endio(bio, 0);
+ else
+ generic_make_request(bio);
+ bio = next;
+ }
+ } else
+ spin_unlock_irq(&conf->device_lock);
+}
+
+/* Barriers....
+ * Sometimes we need to suspend IO while we do something else,
+ * either some resync/recovery, or reconfigure the array.
+ * To do this we raise a 'barrier'.
+ * The 'barrier' is a counter that can be raised multiple times
+ * to count how many activities are happening which preclude
+ * normal IO.
+ * We can only raise the barrier if there is no pending IO.
+ * i.e. if nr_pending == 0.
+ * We choose only to raise the barrier if no-one is waiting for the
+ * barrier to go down. This means that as soon as an IO request
+ * is ready, no other operations which require a barrier will start
+ * until the IO request has had a chance.
+ *
+ * So: regular IO calls 'wait_barrier'. When that returns there
+ * is no backgroup IO happening, It must arrange to call
+ * allow_barrier when it has finished its IO.
+ * backgroup IO calls must call raise_barrier. Once that returns
+ * there is no normal IO happeing. It must arrange to call
+ * lower_barrier when the particular background IO completes.
+ */
+static void raise_barrier(struct r1conf *conf, sector_t sector_nr)
+{
+ spin_lock_irq(&conf->resync_lock);
+
+ /* Wait until no block IO is waiting */
+ wait_event_lock_irq(conf->wait_barrier, !conf->nr_waiting,
+ conf->resync_lock);
+
+ /* block any new IO from starting */
+ conf->barrier++;
+ conf->next_resync = sector_nr;
+
+ /* For these conditions we must wait:
+ * A: while the array is in frozen state
+ * B: while barrier >= RESYNC_DEPTH, meaning resync reach
+ * the max count which allowed.
+ * C: next_resync + RESYNC_SECTORS > start_next_window, meaning
+ * next resync will reach to the window which normal bios are
+ * handling.
+ * D: while there are any active requests in the current window.
+ */
+ wait_event_lock_irq(conf->wait_barrier,
+ !conf->array_frozen &&
+ conf->barrier < RESYNC_DEPTH &&
+ conf->current_window_requests == 0 &&
+ (conf->start_next_window >=
+ conf->next_resync + RESYNC_SECTORS),
+ conf->resync_lock);
+
+ conf->nr_pending++;
+ spin_unlock_irq(&conf->resync_lock);
+}
+
+static void lower_barrier(struct r1conf *conf)
+{
+ unsigned long flags;
+ BUG_ON(conf->barrier <= 0);
+ spin_lock_irqsave(&conf->resync_lock, flags);
+ conf->barrier--;
+ conf->nr_pending--;
+ spin_unlock_irqrestore(&conf->resync_lock, flags);
+ wake_up(&conf->wait_barrier);
+}
+
+static bool need_to_wait_for_sync(struct r1conf *conf, struct bio *bio)
+{
+ bool wait = false;
+
+ if (conf->array_frozen || !bio)
+ wait = true;
+ else if (conf->barrier && bio_data_dir(bio) == WRITE) {
+ if ((conf->mddev->curr_resync_completed
+ >= bio_end_sector(bio)) ||
+ (conf->next_resync + NEXT_NORMALIO_DISTANCE
+ <= bio->bi_iter.bi_sector))
+ wait = false;
+ else
+ wait = true;
+ }
+
+ return wait;
+}
+
+static sector_t wait_barrier(struct r1conf *conf, struct bio *bio)
+{
+ sector_t sector = 0;
+
+ spin_lock_irq(&conf->resync_lock);
+ if (need_to_wait_for_sync(conf, bio)) {
+ conf->nr_waiting++;
+ /* Wait for the barrier to drop.
+ * However if there are already pending
+ * requests (preventing the barrier from
+ * rising completely), and the
+ * per-process bio queue isn't empty,
+ * then don't wait, as we need to empty
+ * that queue to allow conf->start_next_window
+ * to increase.
+ */
+ wait_event_lock_irq(conf->wait_barrier,
+ !conf->array_frozen &&
+ (!conf->barrier ||
+ ((conf->start_next_window <
+ conf->next_resync + RESYNC_SECTORS) &&
+ current->bio_list &&
+ !bio_list_empty(current->bio_list))),
+ conf->resync_lock);
+ conf->nr_waiting--;
+ }
+
+ if (bio && bio_data_dir(bio) == WRITE) {
+ if (bio->bi_iter.bi_sector >=
+ conf->mddev->curr_resync_completed) {
+ if (conf->start_next_window == MaxSector)
+ conf->start_next_window =
+ conf->next_resync +
+ NEXT_NORMALIO_DISTANCE;
+
+ if ((conf->start_next_window + NEXT_NORMALIO_DISTANCE)
+ <= bio->bi_iter.bi_sector)
+ conf->next_window_requests++;
+ else
+ conf->current_window_requests++;
+ sector = conf->start_next_window;
+ }
+ }
+
+ conf->nr_pending++;
+ spin_unlock_irq(&conf->resync_lock);
+ return sector;
+}
+
+static void allow_barrier(struct r1conf *conf, sector_t start_next_window,
+ sector_t bi_sector)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&conf->resync_lock, flags);
+ conf->nr_pending--;
+ if (start_next_window) {
+ if (start_next_window == conf->start_next_window) {
+ if (conf->start_next_window + NEXT_NORMALIO_DISTANCE
+ <= bi_sector)
+ conf->next_window_requests--;
+ else
+ conf->current_window_requests--;
+ } else
+ conf->current_window_requests--;
+
+ if (!conf->current_window_requests) {
+ if (conf->next_window_requests) {
+ conf->current_window_requests =
+ conf->next_window_requests;
+ conf->next_window_requests = 0;
+ conf->start_next_window +=
+ NEXT_NORMALIO_DISTANCE;
+ } else
+ conf->start_next_window = MaxSector;
+ }
+ }
+ spin_unlock_irqrestore(&conf->resync_lock, flags);
+ wake_up(&conf->wait_barrier);
+}
+
+static void freeze_array(struct r1conf *conf, int extra)
+{
+ /* stop syncio and normal IO and wait for everything to
+ * go quite.
+ * We wait until nr_pending match nr_queued+extra
+ * This is called in the context of one normal IO request
+ * that has failed. Thus any sync request that might be pending
+ * will be blocked by nr_pending, and we need to wait for
+ * pending IO requests to complete or be queued for re-try.
+ * Thus the number queued (nr_queued) plus this request (extra)
+ * must match the number of pending IOs (nr_pending) before
+ * we continue.
+ */
+ spin_lock_irq(&conf->resync_lock);
+ conf->array_frozen = 1;
+ wait_event_lock_irq_cmd(conf->wait_barrier,
+ conf->nr_pending == conf->nr_queued+extra,
+ conf->resync_lock,
+ flush_pending_writes(conf));
+ spin_unlock_irq(&conf->resync_lock);
+}
+static void unfreeze_array(struct r1conf *conf)
+{
+ /* reverse the effect of the freeze */
+ spin_lock_irq(&conf->resync_lock);
+ conf->array_frozen = 0;
+ wake_up(&conf->wait_barrier);
+ spin_unlock_irq(&conf->resync_lock);
+}
+
+/* duplicate the data pages for behind I/O
+ */
+static void alloc_behind_pages(struct bio *bio, struct r1bio *r1_bio)
+{
+ int i;
+ struct bio_vec *bvec;
+ struct bio_vec *bvecs = kzalloc(bio->bi_vcnt * sizeof(struct bio_vec),
+ GFP_NOIO);
+ if (unlikely(!bvecs))
+ return;
+
+ bio_for_each_segment_all(bvec, bio, i) {
+ bvecs[i] = *bvec;
+ bvecs[i].bv_page = alloc_page(GFP_NOIO);
+ if (unlikely(!bvecs[i].bv_page))
+ goto do_sync_io;
+ memcpy(kmap(bvecs[i].bv_page) + bvec->bv_offset,
+ kmap(bvec->bv_page) + bvec->bv_offset, bvec->bv_len);
+ kunmap(bvecs[i].bv_page);
+ kunmap(bvec->bv_page);
+ }
+ r1_bio->behind_bvecs = bvecs;
+ r1_bio->behind_page_count = bio->bi_vcnt;
+ set_bit(R1BIO_BehindIO, &r1_bio->state);
+ return;
+
+do_sync_io:
+ for (i = 0; i < bio->bi_vcnt; i++)
+ if (bvecs[i].bv_page)
+ put_page(bvecs[i].bv_page);
+ kfree(bvecs);
+ pr_debug("%dB behind alloc failed, doing sync I/O\n",
+ bio->bi_iter.bi_size);
+}
+
+struct raid1_plug_cb {
+ struct blk_plug_cb cb;
+ struct bio_list pending;
+ int pending_cnt;
+};
+
+static void raid1_unplug(struct blk_plug_cb *cb, bool from_schedule)
+{
+ struct raid1_plug_cb *plug = container_of(cb, struct raid1_plug_cb,
+ cb);
+ struct mddev *mddev = plug->cb.data;
+ struct r1conf *conf = mddev->private;
+ struct bio *bio;
+
+ if (from_schedule || current->bio_list) {
+ spin_lock_irq(&conf->device_lock);
+ bio_list_merge(&conf->pending_bio_list, &plug->pending);
+ conf->pending_count += plug->pending_cnt;
+ spin_unlock_irq(&conf->device_lock);
+ wake_up(&conf->wait_barrier);
+ md_wakeup_thread(mddev->thread);
+ kfree(plug);
+ return;
+ }
+
+ /* we aren't scheduling, so we can do the write-out directly. */
+ bio = bio_list_get(&plug->pending);
+ bitmap_unplug(mddev->bitmap);
+ wake_up(&conf->wait_barrier);
+
+ while (bio) { /* submit pending writes */
+ struct bio *next = bio->bi_next;
+ bio->bi_next = NULL;
+ if (unlikely((bio->bi_rw & REQ_DISCARD) &&
+ !blk_queue_discard(bdev_get_queue(bio->bi_bdev))))
+ /* Just ignore it */
+ bio_endio(bio, 0);
+ else
+ generic_make_request(bio);
+ bio = next;
+ }
+ kfree(plug);
+}
+
+static void make_request(struct mddev *mddev, struct bio * bio)
+{
+ struct r1conf *conf = mddev->private;
+ struct raid1_info *mirror;
+ struct r1bio *r1_bio;
+ struct bio *read_bio;
+ int i, disks;
+ struct bitmap *bitmap;
+ unsigned long flags;
+ const int rw = bio_data_dir(bio);
+ const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
+ const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA));
+ const unsigned long do_discard = (bio->bi_rw
+ & (REQ_DISCARD | REQ_SECURE));
+ const unsigned long do_same = (bio->bi_rw & REQ_WRITE_SAME);
+ struct md_rdev *blocked_rdev;
+ struct blk_plug_cb *cb;
+ struct raid1_plug_cb *plug = NULL;
+ int first_clone;
+ int sectors_handled;
+ int max_sectors;
+ sector_t start_next_window;
+
+ /*
+ * Register the new request and wait if the reconstruction
+ * thread has put up a bar for new requests.
+ * Continue immediately if no resync is active currently.
+ */
+
+ md_write_start(mddev, bio); /* wait on superblock update early */
+
+ if (bio_data_dir(bio) == WRITE &&
+ ((bio_end_sector(bio) > mddev->suspend_lo &&
+ bio->bi_iter.bi_sector < mddev->suspend_hi) ||
+ (mddev_is_clustered(mddev) &&
+ md_cluster_ops->area_resyncing(mddev, bio->bi_iter.bi_sector, bio_end_sector(bio))))) {
+ /* As the suspend_* range is controlled by
+ * userspace, we want an interruptible
+ * wait.
+ */
+ DEFINE_WAIT(w);
+ for (;;) {
+ flush_signals(current);
+ prepare_to_wait(&conf->wait_barrier,
+ &w, TASK_INTERRUPTIBLE);
+ if (bio_end_sector(bio) <= mddev->suspend_lo ||
+ bio->bi_iter.bi_sector >= mddev->suspend_hi ||
+ (mddev_is_clustered(mddev) &&
+ !md_cluster_ops->area_resyncing(mddev,
+ bio->bi_iter.bi_sector, bio_end_sector(bio))))
+ break;
+ schedule();
+ }
+ finish_wait(&conf->wait_barrier, &w);
+ }
+
+ start_next_window = wait_barrier(conf, bio);
+
+ bitmap = mddev->bitmap;
+
+ /*
+ * make_request() can abort the operation when READA is being
+ * used and no empty request is available.
+ *
+ */
+ r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
+
+ r1_bio->master_bio = bio;
+ r1_bio->sectors = bio_sectors(bio);
+ r1_bio->state = 0;
+ r1_bio->mddev = mddev;
+ r1_bio->sector = bio->bi_iter.bi_sector;
+
+ /* We might need to issue multiple reads to different
+ * devices if there are bad blocks around, so we keep
+ * track of the number of reads in bio->bi_phys_segments.
+ * If this is 0, there is only one r1_bio and no locking
+ * will be needed when requests complete. If it is
+ * non-zero, then it is the number of not-completed requests.
+ */
+ bio->bi_phys_segments = 0;
+ clear_bit(BIO_SEG_VALID, &bio->bi_flags);
+
+ if (rw == READ) {
+ /*
+ * read balancing logic:
+ */
+ int rdisk;
+
+read_again:
+ rdisk = read_balance(conf, r1_bio, &max_sectors);
+
+ if (rdisk < 0) {
+ /* couldn't find anywhere to read from */
+ raid_end_bio_io(r1_bio);
+ return;
+ }
+ mirror = conf->mirrors + rdisk;
+
+ if (test_bit(WriteMostly, &mirror->rdev->flags) &&
+ bitmap) {
+ /* Reading from a write-mostly device must
+ * take care not to over-take any writes
+ * that are 'behind'
+ */
+ wait_event(bitmap->behind_wait,
+ atomic_read(&bitmap->behind_writes) == 0);
+ }
+ r1_bio->read_disk = rdisk;
+ r1_bio->start_next_window = 0;
+
+ read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
+ bio_trim(read_bio, r1_bio->sector - bio->bi_iter.bi_sector,
+ max_sectors);
+
+ r1_bio->bios[rdisk] = read_bio;
+
+ read_bio->bi_iter.bi_sector = r1_bio->sector +
+ mirror->rdev->data_offset;
+ read_bio->bi_bdev = mirror->rdev->bdev;
+ read_bio->bi_end_io = raid1_end_read_request;
+ read_bio->bi_rw = READ | do_sync;
+ read_bio->bi_private = r1_bio;
+
+ if (max_sectors < r1_bio->sectors) {
+ /* could not read all from this device, so we will
+ * need another r1_bio.
+ */
+
+ sectors_handled = (r1_bio->sector + max_sectors
+ - bio->bi_iter.bi_sector);
+ r1_bio->sectors = max_sectors;
+ spin_lock_irq(&conf->device_lock);
+ if (bio->bi_phys_segments == 0)
+ bio->bi_phys_segments = 2;
+ else
+ bio->bi_phys_segments++;
+ spin_unlock_irq(&conf->device_lock);
+ /* Cannot call generic_make_request directly
+ * as that will be queued in __make_request
+ * and subsequent mempool_alloc might block waiting
+ * for it. So hand bio over to raid1d.
+ */
+ reschedule_retry(r1_bio);
+
+ r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
+
+ r1_bio->master_bio = bio;
+ r1_bio->sectors = bio_sectors(bio) - sectors_handled;
+ r1_bio->state = 0;
+ r1_bio->mddev = mddev;
+ r1_bio->sector = bio->bi_iter.bi_sector +
+ sectors_handled;
+ goto read_again;
+ } else
+ generic_make_request(read_bio);
+ return;
+ }
+
+ /*
+ * WRITE:
+ */
+ if (conf->pending_count >= max_queued_requests) {
+ md_wakeup_thread(mddev->thread);
+ wait_event(conf->wait_barrier,
+ conf->pending_count < max_queued_requests);
+ }
+ /* first select target devices under rcu_lock and
+ * inc refcount on their rdev. Record them by setting
+ * bios[x] to bio
+ * If there are known/acknowledged bad blocks on any device on
+ * which we have seen a write error, we want to avoid writing those
+ * blocks.
+ * This potentially requires several writes to write around
+ * the bad blocks. Each set of writes gets it's own r1bio
+ * with a set of bios attached.
+ */
+
+ disks = conf->raid_disks * 2;
+ retry_write:
+ r1_bio->start_next_window = start_next_window;
+ blocked_rdev = NULL;
+ rcu_read_lock();
+ max_sectors = r1_bio->sectors;
+ for (i = 0; i < disks; i++) {
+ struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
+ if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
+ atomic_inc(&rdev->nr_pending);
+ blocked_rdev = rdev;
+ break;
+ }
+ r1_bio->bios[i] = NULL;
+ if (!rdev || test_bit(Faulty, &rdev->flags)
+ || test_bit(Unmerged, &rdev->flags)) {
+ if (i < conf->raid_disks)
+ set_bit(R1BIO_Degraded, &r1_bio->state);
+ continue;
+ }
+
+ atomic_inc(&rdev->nr_pending);
+ if (test_bit(WriteErrorSeen, &rdev->flags)) {
+ sector_t first_bad;
+ int bad_sectors;
+ int is_bad;
+
+ is_bad = is_badblock(rdev, r1_bio->sector,
+ max_sectors,
+ &first_bad, &bad_sectors);
+ if (is_bad < 0) {
+ /* mustn't write here until the bad block is
+ * acknowledged*/
+ set_bit(BlockedBadBlocks, &rdev->flags);
+ blocked_rdev = rdev;
+ break;
+ }
+ if (is_bad && first_bad <= r1_bio->sector) {
+ /* Cannot write here at all */
+ bad_sectors -= (r1_bio->sector - first_bad);
+ if (bad_sectors < max_sectors)
+ /* mustn't write more than bad_sectors
+ * to other devices yet
+ */
+ max_sectors = bad_sectors;
+ rdev_dec_pending(rdev, mddev);
+ /* We don't set R1BIO_Degraded as that
+ * only applies if the disk is
+ * missing, so it might be re-added,
+ * and we want to know to recover this
+ * chunk.
+ * In this case the device is here,
+ * and the fact that this chunk is not
+ * in-sync is recorded in the bad
+ * block log
+ */
+ continue;
+ }
+ if (is_bad) {
+ int good_sectors = first_bad - r1_bio->sector;
+ if (good_sectors < max_sectors)
+ max_sectors = good_sectors;
+ }
+ }
+ r1_bio->bios[i] = bio;
+ }
+ rcu_read_unlock();
+
+ if (unlikely(blocked_rdev)) {
+ /* Wait for this device to become unblocked */
+ int j;
+ sector_t old = start_next_window;
+
+ for (j = 0; j < i; j++)
+ if (r1_bio->bios[j])
+ rdev_dec_pending(conf->mirrors[j].rdev, mddev);
+ r1_bio->state = 0;
+ allow_barrier(conf, start_next_window, bio->bi_iter.bi_sector);
+ md_wait_for_blocked_rdev(blocked_rdev, mddev);
+ start_next_window = wait_barrier(conf, bio);
+ /*
+ * We must make sure the multi r1bios of bio have
+ * the same value of bi_phys_segments
+ */
+ if (bio->bi_phys_segments && old &&
+ old != start_next_window)
+ /* Wait for the former r1bio(s) to complete */
+ wait_event(conf->wait_barrier,
+ bio->bi_phys_segments == 1);
+ goto retry_write;
+ }
+
+ if (max_sectors < r1_bio->sectors) {
+ /* We are splitting this write into multiple parts, so
+ * we need to prepare for allocating another r1_bio.
+ */
+ r1_bio->sectors = max_sectors;
+ spin_lock_irq(&conf->device_lock);
+ if (bio->bi_phys_segments == 0)
+ bio->bi_phys_segments = 2;
+ else
+ bio->bi_phys_segments++;
+ spin_unlock_irq(&conf->device_lock);
+ }
+ sectors_handled = r1_bio->sector + max_sectors - bio->bi_iter.bi_sector;
+
+ atomic_set(&r1_bio->remaining, 1);
+ atomic_set(&r1_bio->behind_remaining, 0);
+
+ first_clone = 1;
+ for (i = 0; i < disks; i++) {
+ struct bio *mbio;
+ if (!r1_bio->bios[i])
+ continue;
+
+ mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
+ bio_trim(mbio, r1_bio->sector - bio->bi_iter.bi_sector, max_sectors);
+
+ if (first_clone) {
+ /* do behind I/O ?
+ * Not if there are too many, or cannot
+ * allocate memory, or a reader on WriteMostly
+ * is waiting for behind writes to flush */
+ if (bitmap &&
+ (atomic_read(&bitmap->behind_writes)
+ < mddev->bitmap_info.max_write_behind) &&
+ !waitqueue_active(&bitmap->behind_wait))
+ alloc_behind_pages(mbio, r1_bio);
+
+ bitmap_startwrite(bitmap, r1_bio->sector,
+ r1_bio->sectors,
+ test_bit(R1BIO_BehindIO,
+ &r1_bio->state));
+ first_clone = 0;
+ }
+ if (r1_bio->behind_bvecs) {
+ struct bio_vec *bvec;
+ int j;
+
+ /*
+ * We trimmed the bio, so _all is legit
+ */
+ bio_for_each_segment_all(bvec, mbio, j)
+ bvec->bv_page = r1_bio->behind_bvecs[j].bv_page;
+ if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags))
+ atomic_inc(&r1_bio->behind_remaining);
+ }
+
+ r1_bio->bios[i] = mbio;
+
+ mbio->bi_iter.bi_sector = (r1_bio->sector +
+ conf->mirrors[i].rdev->data_offset);
+ mbio->bi_bdev = conf->mirrors[i].rdev->bdev;
+ mbio->bi_end_io = raid1_end_write_request;
+ mbio->bi_rw =
+ WRITE | do_flush_fua | do_sync | do_discard | do_same;
+ mbio->bi_private = r1_bio;
+
+ atomic_inc(&r1_bio->remaining);
+
+ cb = blk_check_plugged(raid1_unplug, mddev, sizeof(*plug));
+ if (cb)
+ plug = container_of(cb, struct raid1_plug_cb, cb);
+ else
+ plug = NULL;
+ spin_lock_irqsave(&conf->device_lock, flags);
+ if (plug) {
+ bio_list_add(&plug->pending, mbio);
+ plug->pending_cnt++;
+ } else {
+ bio_list_add(&conf->pending_bio_list, mbio);
+ conf->pending_count++;
+ }
+ spin_unlock_irqrestore(&conf->device_lock, flags);
+ if (!plug)
+ md_wakeup_thread(mddev->thread);
+ }
+ /* Mustn't call r1_bio_write_done before this next test,
+ * as it could result in the bio being freed.
+ */
+ if (sectors_handled < bio_sectors(bio)) {
+ r1_bio_write_done(r1_bio);
+ /* We need another r1_bio. It has already been counted
+ * in bio->bi_phys_segments
+ */
+ r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
+ r1_bio->master_bio = bio;
+ r1_bio->sectors = bio_sectors(bio) - sectors_handled;
+ r1_bio->state = 0;
+ r1_bio->mddev = mddev;
+ r1_bio->sector = bio->bi_iter.bi_sector + sectors_handled;
+ goto retry_write;
+ }
+
+ r1_bio_write_done(r1_bio);
+
+ /* In case raid1d snuck in to freeze_array */
+ wake_up(&conf->wait_barrier);
+}
+
+static void status(struct seq_file *seq, struct mddev *mddev)
+{
+ struct r1conf *conf = mddev->private;
+ int i;
+
+ seq_printf(seq, " [%d/%d] [", conf->raid_disks,
+ conf->raid_disks - mddev->degraded);
+ rcu_read_lock();
+ for (i = 0; i < conf->raid_disks; i++) {
+ struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
+ seq_printf(seq, "%s",
+ rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_");
+ }
+ rcu_read_unlock();
+ seq_printf(seq, "]");
+}
+
+static void error(struct mddev *mddev, struct md_rdev *rdev)
+{
+ char b[BDEVNAME_SIZE];
+ struct r1conf *conf = mddev->private;
+
+ /*
+ * If it is not operational, then we have already marked it as dead
+ * else if it is the last working disks, ignore the error, let the
+ * next level up know.
+ * else mark the drive as failed
+ */
+ if (test_bit(In_sync, &rdev->flags)
+ && (conf->raid_disks - mddev->degraded) == 1) {
+ /*
+ * Don't fail the drive, act as though we were just a
+ * normal single drive.
+ * However don't try a recovery from this drive as
+ * it is very likely to fail.
+ */
+ conf->recovery_disabled = mddev->recovery_disabled;
+ return;
+ }
+ set_bit(Blocked, &rdev->flags);
+ if (test_and_clear_bit(In_sync, &rdev->flags)) {
+ unsigned long flags;
+ spin_lock_irqsave(&conf->device_lock, flags);
+ mddev->degraded++;
+ set_bit(Faulty, &rdev->flags);
+ spin_unlock_irqrestore(&conf->device_lock, flags);
+ } else
+ set_bit(Faulty, &rdev->flags);
+ /*
+ * if recovery is running, make sure it aborts.
+ */
+ set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+ set_bit(MD_CHANGE_DEVS, &mddev->flags);
+ printk(KERN_ALERT
+ "md/raid1:%s: Disk failure on %s, disabling device.\n"
+ "md/raid1:%s: Operation continuing on %d devices.\n",
+ mdname(mddev), bdevname(rdev->bdev, b),
+ mdname(mddev), conf->raid_disks - mddev->degraded);
+}
+
+static void print_conf(struct r1conf *conf)
+{
+ int i;
+
+ printk(KERN_DEBUG "RAID1 conf printout:\n");
+ if (!conf) {
+ printk(KERN_DEBUG "(!conf)\n");
+ return;
+ }
+ printk(KERN_DEBUG " --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded,
+ conf->raid_disks);
+
+ rcu_read_lock();
+ for (i = 0; i < conf->raid_disks; i++) {
+ char b[BDEVNAME_SIZE];
+ struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
+ if (rdev)
+ printk(KERN_DEBUG " disk %d, wo:%d, o:%d, dev:%s\n",
+ i, !test_bit(In_sync, &rdev->flags),
+ !test_bit(Faulty, &rdev->flags),
+ bdevname(rdev->bdev,b));
+ }
+ rcu_read_unlock();
+}
+
+static void close_sync(struct r1conf *conf)
+{
+ wait_barrier(conf, NULL);
+ allow_barrier(conf, 0, 0);
+
+ mempool_destroy(conf->r1buf_pool);
+ conf->r1buf_pool = NULL;
+
+ spin_lock_irq(&conf->resync_lock);
+ conf->next_resync = 0;
+ conf->start_next_window = MaxSector;
+ conf->current_window_requests +=
+ conf->next_window_requests;
+ conf->next_window_requests = 0;
+ spin_unlock_irq(&conf->resync_lock);
+}
+
+static int raid1_spare_active(struct mddev *mddev)
+{
+ int i;
+ struct r1conf *conf = mddev->private;
+ int count = 0;
+ unsigned long flags;
+
+ /*
+ * Find all failed disks within the RAID1 configuration
+ * and mark them readable.
+ * Called under mddev lock, so rcu protection not needed.
+ */
+ for (i = 0; i < conf->raid_disks; i++) {
+ struct md_rdev *rdev = conf->mirrors[i].rdev;
+ struct md_rdev *repl = conf->mirrors[conf->raid_disks + i].rdev;
+ if (repl
+ && !test_bit(Candidate, &repl->flags)
+ && repl->recovery_offset == MaxSector
+ && !test_bit(Faulty, &repl->flags)
+ && !test_and_set_bit(In_sync, &repl->flags)) {
+ /* replacement has just become active */
+ if (!rdev ||
+ !test_and_clear_bit(In_sync, &rdev->flags))
+ count++;
+ if (rdev) {
+ /* Replaced device not technically
+ * faulty, but we need to be sure
+ * it gets removed and never re-added
+ */
+ set_bit(Faulty, &rdev->flags);
+ sysfs_notify_dirent_safe(
+ rdev->sysfs_state);
+ }
+ }
+ if (rdev
+ && rdev->recovery_offset == MaxSector
+ && !test_bit(Faulty, &rdev->flags)
+ && !test_and_set_bit(In_sync, &rdev->flags)) {
+ count++;
+ sysfs_notify_dirent_safe(rdev->sysfs_state);
+ }
+ }
+ spin_lock_irqsave(&conf->device_lock, flags);
+ mddev->degraded -= count;
+ spin_unlock_irqrestore(&conf->device_lock, flags);
+
+ print_conf(conf);
+ return count;
+}
+
+static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
+{
+ struct r1conf *conf = mddev->private;
+ int err = -EEXIST;
+ int mirror = 0;
+ struct raid1_info *p;
+ int first = 0;
+ int last = conf->raid_disks - 1;
+ struct request_queue *q = bdev_get_queue(rdev->bdev);
+
+ if (mddev->recovery_disabled == conf->recovery_disabled)
+ return -EBUSY;
+
+ if (rdev->raid_disk >= 0)
+ first = last = rdev->raid_disk;
+
+ if (q->merge_bvec_fn) {
+ set_bit(Unmerged, &rdev->flags);
+ mddev->merge_check_needed = 1;
+ }
+
+ for (mirror = first; mirror <= last; mirror++) {
+ p = conf->mirrors+mirror;
+ if (!p->rdev) {
+
+ if (mddev->gendisk)
+ disk_stack_limits(mddev->gendisk, rdev->bdev,
+ rdev->data_offset << 9);
+
+ p->head_position = 0;
+ rdev->raid_disk = mirror;
+ err = 0;
+ /* As all devices are equivalent, we don't need a full recovery
+ * if this was recently any drive of the array
+ */
+ if (rdev->saved_raid_disk < 0)
+ conf->fullsync = 1;
+ rcu_assign_pointer(p->rdev, rdev);
+ break;
+ }
+ if (test_bit(WantReplacement, &p->rdev->flags) &&
+ p[conf->raid_disks].rdev == NULL) {
+ /* Add this device as a replacement */
+ clear_bit(In_sync, &rdev->flags);
+ set_bit(Replacement, &rdev->flags);
+ rdev->raid_disk = mirror;
+ err = 0;
+ conf->fullsync = 1;
+ rcu_assign_pointer(p[conf->raid_disks].rdev, rdev);
+ break;
+ }
+ }
+ if (err == 0 && test_bit(Unmerged, &rdev->flags)) {
+ /* Some requests might not have seen this new
+ * merge_bvec_fn. We must wait for them to complete
+ * before merging the device fully.
+ * First we make sure any code which has tested
+ * our function has submitted the request, then
+ * we wait for all outstanding requests to complete.
+ */
+ synchronize_sched();
+ freeze_array(conf, 0);
+ unfreeze_array(conf);
+ clear_bit(Unmerged, &rdev->flags);
+ }
+ md_integrity_add_rdev(rdev, mddev);
+ if (mddev->queue && blk_queue_discard(bdev_get_queue(rdev->bdev)))
+ queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
+ print_conf(conf);
+ return err;
+}
+
+static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
+{
+ struct r1conf *conf = mddev->private;
+ int err = 0;
+ int number = rdev->raid_disk;
+ struct raid1_info *p = conf->mirrors + number;
+
+ if (rdev != p->rdev)
+ p = conf->mirrors + conf->raid_disks + number;
+
+ print_conf(conf);
+ if (rdev == p->rdev) {
+ if (test_bit(In_sync, &rdev->flags) ||
+ atomic_read(&rdev->nr_pending)) {
+ err = -EBUSY;
+ goto abort;
+ }
+ /* Only remove non-faulty devices if recovery
+ * is not possible.
+ */
+ if (!test_bit(Faulty, &rdev->flags) &&
+ mddev->recovery_disabled != conf->recovery_disabled &&
+ mddev->degraded < conf->raid_disks) {
+ err = -EBUSY;
+ goto abort;
+ }
+ p->rdev = NULL;
+ synchronize_rcu();
+ if (atomic_read(&rdev->nr_pending)) {
+ /* lost the race, try later */
+ err = -EBUSY;
+ p->rdev = rdev;
+ goto abort;
+ } else if (conf->mirrors[conf->raid_disks + number].rdev) {
+ /* We just removed a device that is being replaced.
+ * Move down the replacement. We drain all IO before
+ * doing this to avoid confusion.
+ */
+ struct md_rdev *repl =
+ conf->mirrors[conf->raid_disks + number].rdev;
+ freeze_array(conf, 0);
+ clear_bit(Replacement, &repl->flags);
+ p->rdev = repl;
+ conf->mirrors[conf->raid_disks + number].rdev = NULL;
+ unfreeze_array(conf);
+ clear_bit(WantReplacement, &rdev->flags);
+ } else
+ clear_bit(WantReplacement, &rdev->flags);
+ err = md_integrity_register(mddev);
+ }
+abort:
+
+ print_conf(conf);
+ return err;
+}
+
+static void end_sync_read(struct bio *bio, int error)
+{
+ struct r1bio *r1_bio = bio->bi_private;
+
+ update_head_pos(r1_bio->read_disk, r1_bio);
+
+ /*
+ * we have read a block, now it needs to be re-written,
+ * or re-read if the read failed.
+ * We don't do much here, just schedule handling by raid1d
+ */
+ if (test_bit(BIO_UPTODATE, &bio->bi_flags))
+ set_bit(R1BIO_Uptodate, &r1_bio->state);
+
+ if (atomic_dec_and_test(&r1_bio->remaining))
+ reschedule_retry(r1_bio);
+}
+
+static void end_sync_write(struct bio *bio, int error)
+{
+ int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+ struct r1bio *r1_bio = bio->bi_private;
+ struct mddev *mddev = r1_bio->mddev;
+ struct r1conf *conf = mddev->private;
+ int mirror=0;
+ sector_t first_bad;
+ int bad_sectors;
+
+ mirror = find_bio_disk(r1_bio, bio);
+
+ if (!uptodate) {
+ sector_t sync_blocks = 0;
+ sector_t s = r1_bio->sector;
+ long sectors_to_go = r1_bio->sectors;
+ /* make sure these bits doesn't get cleared. */
+ do {
+ bitmap_end_sync(mddev->bitmap, s,
+ &sync_blocks, 1);
+ s += sync_blocks;
+ sectors_to_go -= sync_blocks;
+ } while (sectors_to_go > 0);
+ set_bit(WriteErrorSeen,
+ &conf->mirrors[mirror].rdev->flags);
+ if (!test_and_set_bit(WantReplacement,
+ &conf->mirrors[mirror].rdev->flags))
+ set_bit(MD_RECOVERY_NEEDED, &
+ mddev->recovery);
+ set_bit(R1BIO_WriteError, &r1_bio->state);
+ } else if (is_badblock(conf->mirrors[mirror].rdev,
+ r1_bio->sector,
+ r1_bio->sectors,
+ &first_bad, &bad_sectors) &&
+ !is_badblock(conf->mirrors[r1_bio->read_disk].rdev,
+ r1_bio->sector,
+ r1_bio->sectors,
+ &first_bad, &bad_sectors)
+ )
+ set_bit(R1BIO_MadeGood, &r1_bio->state);
+
+ if (atomic_dec_and_test(&r1_bio->remaining)) {
+ int s = r1_bio->sectors;
+ if (test_bit(R1BIO_MadeGood, &r1_bio->state) ||
+ test_bit(R1BIO_WriteError, &r1_bio->state))
+ reschedule_retry(r1_bio);
+ else {
+ put_buf(r1_bio);
+ md_done_sync(mddev, s, uptodate);
+ }
+ }
+}
+
+static int r1_sync_page_io(struct md_rdev *rdev, sector_t sector,
+ int sectors, struct page *page, int rw)
+{
+ if (sync_page_io(rdev, sector, sectors << 9, page, rw, false))
+ /* success */
+ return 1;
+ if (rw == WRITE) {
+ set_bit(WriteErrorSeen, &rdev->flags);
+ if (!test_and_set_bit(WantReplacement,
+ &rdev->flags))
+ set_bit(MD_RECOVERY_NEEDED, &
+ rdev->mddev->recovery);
+ }
+ /* need to record an error - either for the block or the device */
+ if (!rdev_set_badblocks(rdev, sector, sectors, 0))
+ md_error(rdev->mddev, rdev);
+ return 0;
+}
+
+static int fix_sync_read_error(struct r1bio *r1_bio)
+{
+ /* Try some synchronous reads of other devices to get
+ * good data, much like with normal read errors. Only
+ * read into the pages we already have so we don't
+ * need to re-issue the read request.
+ * We don't need to freeze the array, because being in an
+ * active sync request, there is no normal IO, and
+ * no overlapping syncs.
+ * We don't need to check is_badblock() again as we
+ * made sure that anything with a bad block in range
+ * will have bi_end_io clear.
+ */
+ struct mddev *mddev = r1_bio->mddev;
+ struct r1conf *conf = mddev->private;
+ struct bio *bio = r1_bio->bios[r1_bio->read_disk];
+ sector_t sect = r1_bio->sector;
+ int sectors = r1_bio->sectors;
+ int idx = 0;
+
+ while(sectors) {
+ int s = sectors;
+ int d = r1_bio->read_disk;
+ int success = 0;
+ struct md_rdev *rdev;
+ int start;
+
+ if (s > (PAGE_SIZE>>9))
+ s = PAGE_SIZE >> 9;
+ do {
+ if (r1_bio->bios[d]->bi_end_io == end_sync_read) {
+ /* No rcu protection needed here devices
+ * can only be removed when no resync is
+ * active, and resync is currently active
+ */
+ rdev = conf->mirrors[d].rdev;
+ if (sync_page_io(rdev, sect, s<<9,
+ bio->bi_io_vec[idx].bv_page,
+ READ, false)) {
+ success = 1;
+ break;
+ }
+ }
+ d++;
+ if (d == conf->raid_disks * 2)
+ d = 0;
+ } while (!success && d != r1_bio->read_disk);
+
+ if (!success) {
+ char b[BDEVNAME_SIZE];
+ int abort = 0;
+ /* Cannot read from anywhere, this block is lost.
+ * Record a bad block on each device. If that doesn't
+ * work just disable and interrupt the recovery.
+ * Don't fail devices as that won't really help.
+ */
+ printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O read error"
+ " for block %llu\n",
+ mdname(mddev),
+ bdevname(bio->bi_bdev, b),
+ (unsigned long long)r1_bio->sector);
+ for (d = 0; d < conf->raid_disks * 2; d++) {
+ rdev = conf->mirrors[d].rdev;
+ if (!rdev || test_bit(Faulty, &rdev->flags))
+ continue;
+ if (!rdev_set_badblocks(rdev, sect, s, 0))
+ abort = 1;
+ }
+ if (abort) {
+ conf->recovery_disabled =
+ mddev->recovery_disabled;
+ set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+ md_done_sync(mddev, r1_bio->sectors, 0);
+ put_buf(r1_bio);
+ return 0;
+ }
+ /* Try next page */
+ sectors -= s;
+ sect += s;
+ idx++;
+ continue;
+ }
+
+ start = d;
+ /* write it back and re-read */
+ while (d != r1_bio->read_disk) {
+ if (d == 0)
+ d = conf->raid_disks * 2;
+ d--;
+ if (r1_bio->bios[d]->bi_end_io != end_sync_read)
+ continue;
+ rdev = conf->mirrors[d].rdev;
+ if (r1_sync_page_io(rdev, sect, s,
+ bio->bi_io_vec[idx].bv_page,
+ WRITE) == 0) {
+ r1_bio->bios[d]->bi_end_io = NULL;
+ rdev_dec_pending(rdev, mddev);
+ }
+ }
+ d = start;
+ while (d != r1_bio->read_disk) {
+ if (d == 0)
+ d = conf->raid_disks * 2;
+ d--;
+ if (r1_bio->bios[d]->bi_end_io != end_sync_read)
+ continue;
+ rdev = conf->mirrors[d].rdev;
+ if (r1_sync_page_io(rdev, sect, s,
+ bio->bi_io_vec[idx].bv_page,
+ READ) != 0)
+ atomic_add(s, &rdev->corrected_errors);
+ }
+ sectors -= s;
+ sect += s;
+ idx ++;
+ }
+ set_bit(R1BIO_Uptodate, &r1_bio->state);
+ set_bit(BIO_UPTODATE, &bio->bi_flags);
+ return 1;
+}
+
+static void process_checks(struct r1bio *r1_bio)
+{
+ /* We have read all readable devices. If we haven't
+ * got the block, then there is no hope left.
+ * If we have, then we want to do a comparison
+ * and skip the write if everything is the same.
+ * If any blocks failed to read, then we need to
+ * attempt an over-write
+ */
+ struct mddev *mddev = r1_bio->mddev;
+ struct r1conf *conf = mddev->private;
+ int primary;
+ int i;
+ int vcnt;
+
+ /* Fix variable parts of all bios */
+ vcnt = (r1_bio->sectors + PAGE_SIZE / 512 - 1) >> (PAGE_SHIFT - 9);
+ for (i = 0; i < conf->raid_disks * 2; i++) {
+ int j;
+ int size;
+ int uptodate;
+ struct bio *b = r1_bio->bios[i];
+ if (b->bi_end_io != end_sync_read)
+ continue;
+ /* fixup the bio for reuse, but preserve BIO_UPTODATE */
+ uptodate = test_bit(BIO_UPTODATE, &b->bi_flags);
+ bio_reset(b);
+ if (!uptodate)
+ clear_bit(BIO_UPTODATE, &b->bi_flags);
+ b->bi_vcnt = vcnt;
+ b->bi_iter.bi_size = r1_bio->sectors << 9;
+ b->bi_iter.bi_sector = r1_bio->sector +
+ conf->mirrors[i].rdev->data_offset;
+ b->bi_bdev = conf->mirrors[i].rdev->bdev;
+ b->bi_end_io = end_sync_read;
+ b->bi_private = r1_bio;
+
+ size = b->bi_iter.bi_size;
+ for (j = 0; j < vcnt ; j++) {
+ struct bio_vec *bi;
+ bi = &b->bi_io_vec[j];
+ bi->bv_offset = 0;
+ if (size > PAGE_SIZE)
+ bi->bv_len = PAGE_SIZE;
+ else
+ bi->bv_len = size;
+ size -= PAGE_SIZE;
+ }
+ }
+ for (primary = 0; primary < conf->raid_disks * 2; primary++)
+ if (r1_bio->bios[primary]->bi_end_io == end_sync_read &&
+ test_bit(BIO_UPTODATE, &r1_bio->bios[primary]->bi_flags)) {
+ r1_bio->bios[primary]->bi_end_io = NULL;
+ rdev_dec_pending(conf->mirrors[primary].rdev, mddev);
+ break;
+ }
+ r1_bio->read_disk = primary;
+ for (i = 0; i < conf->raid_disks * 2; i++) {
+ int j;
+ struct bio *pbio = r1_bio->bios[primary];
+ struct bio *sbio = r1_bio->bios[i];
+ int uptodate = test_bit(BIO_UPTODATE, &sbio->bi_flags);
+
+ if (sbio->bi_end_io != end_sync_read)
+ continue;
+ /* Now we can 'fixup' the BIO_UPTODATE flag */
+ set_bit(BIO_UPTODATE, &sbio->bi_flags);
+
+ if (uptodate) {
+ for (j = vcnt; j-- ; ) {
+ struct page *p, *s;
+ p = pbio->bi_io_vec[j].bv_page;
+ s = sbio->bi_io_vec[j].bv_page;
+ if (memcmp(page_address(p),
+ page_address(s),
+ sbio->bi_io_vec[j].bv_len))
+ break;
+ }
+ } else
+ j = 0;
+ if (j >= 0)
+ atomic64_add(r1_bio->sectors, &mddev->resync_mismatches);
+ if (j < 0 || (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)
+ && uptodate)) {
+ /* No need to write to this device. */
+ sbio->bi_end_io = NULL;
+ rdev_dec_pending(conf->mirrors[i].rdev, mddev);
+ continue;
+ }
+
+ bio_copy_data(sbio, pbio);
+ }
+}
+
+static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio)
+{
+ struct r1conf *conf = mddev->private;
+ int i;
+ int disks = conf->raid_disks * 2;
+ struct bio *bio, *wbio;
+
+ bio = r1_bio->bios[r1_bio->read_disk];
+
+ if (!test_bit(R1BIO_Uptodate, &r1_bio->state))
+ /* ouch - failed to read all of that. */
+ if (!fix_sync_read_error(r1_bio))
+ return;
+
+ if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
+ process_checks(r1_bio);
+
+ /*
+ * schedule writes
+ */
+ atomic_set(&r1_bio->remaining, 1);
+ for (i = 0; i < disks ; i++) {
+ wbio = r1_bio->bios[i];
+ if (wbio->bi_end_io == NULL ||
+ (wbio->bi_end_io == end_sync_read &&
+ (i == r1_bio->read_disk ||
+ !test_bit(MD_RECOVERY_SYNC, &mddev->recovery))))
+ continue;
+
+ wbio->bi_rw = WRITE;
+ wbio->bi_end_io = end_sync_write;
+ atomic_inc(&r1_bio->remaining);
+ md_sync_acct(conf->mirrors[i].rdev->bdev, bio_sectors(wbio));
+
+ generic_make_request(wbio);
+ }
+
+ if (atomic_dec_and_test(&r1_bio->remaining)) {
+ /* if we're here, all write(s) have completed, so clean up */
+ int s = r1_bio->sectors;
+ if (test_bit(R1BIO_MadeGood, &r1_bio->state) ||
+ test_bit(R1BIO_WriteError, &r1_bio->state))
+ reschedule_retry(r1_bio);
+ else {
+ put_buf(r1_bio);
+ md_done_sync(mddev, s, 1);
+ }
+ }
+}
+
+/*
+ * This is a kernel thread which:
+ *
+ * 1. Retries failed read operations on working mirrors.
+ * 2. Updates the raid superblock when problems encounter.
+ * 3. Performs writes following reads for array synchronising.
+ */
+
+static void fix_read_error(struct r1conf *conf, int read_disk,
+ sector_t sect, int sectors)
+{
+ struct mddev *mddev = conf->mddev;
+ while(sectors) {
+ int s = sectors;
+ int d = read_disk;
+ int success = 0;
+ int start;
+ struct md_rdev *rdev;
+
+ if (s > (PAGE_SIZE>>9))
+ s = PAGE_SIZE >> 9;
+
+ do {
+ /* Note: no rcu protection needed here
+ * as this is synchronous in the raid1d thread
+ * which is the thread that might remove
+ * a device. If raid1d ever becomes multi-threaded....
+ */
+ sector_t first_bad;
+ int bad_sectors;
+
+ rdev = conf->mirrors[d].rdev;
+ if (rdev &&
+ (test_bit(In_sync, &rdev->flags) ||
+ (!test_bit(Faulty, &rdev->flags) &&
+ rdev->recovery_offset >= sect + s)) &&
+ is_badblock(rdev, sect, s,
+ &first_bad, &bad_sectors) == 0 &&
+ sync_page_io(rdev, sect, s<<9,
+ conf->tmppage, READ, false))
+ success = 1;
+ else {
+ d++;
+ if (d == conf->raid_disks * 2)
+ d = 0;
+ }
+ } while (!success && d != read_disk);
+
+ if (!success) {
+ /* Cannot read from anywhere - mark it bad */
+ struct md_rdev *rdev = conf->mirrors[read_disk].rdev;
+ if (!rdev_set_badblocks(rdev, sect, s, 0))
+ md_error(mddev, rdev);
+ break;
+ }
+ /* write it back and re-read */
+ start = d;
+ while (d != read_disk) {
+ if (d==0)
+ d = conf->raid_disks * 2;
+ d--;
+ rdev = conf->mirrors[d].rdev;
+ if (rdev &&
+ !test_bit(Faulty, &rdev->flags))
+ r1_sync_page_io(rdev, sect, s,
+ conf->tmppage, WRITE);
+ }
+ d = start;
+ while (d != read_disk) {
+ char b[BDEVNAME_SIZE];
+ if (d==0)
+ d = conf->raid_disks * 2;
+ d--;
+ rdev = conf->mirrors[d].rdev;
+ if (rdev &&
+ !test_bit(Faulty, &rdev->flags)) {
+ if (r1_sync_page_io(rdev, sect, s,
+ conf->tmppage, READ)) {
+ atomic_add(s, &rdev->corrected_errors);
+ printk(KERN_INFO
+ "md/raid1:%s: read error corrected "
+ "(%d sectors at %llu on %s)\n",
+ mdname(mddev), s,
+ (unsigned long long)(sect +
+ rdev->data_offset),
+ bdevname(rdev->bdev, b));
+ }
+ }
+ }
+ sectors -= s;
+ sect += s;
+ }
+}
+
+static int narrow_write_error(struct r1bio *r1_bio, int i)
+{
+ struct mddev *mddev = r1_bio->mddev;
+ struct r1conf *conf = mddev->private;
+ struct md_rdev *rdev = conf->mirrors[i].rdev;
+
+ /* bio has the data to be written to device 'i' where
+ * we just recently had a write error.
+ * We repeatedly clone the bio and trim down to one block,
+ * then try the write. Where the write fails we record
+ * a bad block.
+ * It is conceivable that the bio doesn't exactly align with
+ * blocks. We must handle this somehow.
+ *
+ * We currently own a reference on the rdev.
+ */
+
+ int block_sectors;
+ sector_t sector;
+ int sectors;
+ int sect_to_write = r1_bio->sectors;
+ int ok = 1;
+
+ if (rdev->badblocks.shift < 0)
+ return 0;
+
+ block_sectors = roundup(1 << rdev->badblocks.shift,
+ bdev_logical_block_size(rdev->bdev) >> 9);
+ sector = r1_bio->sector;
+ sectors = ((sector + block_sectors)
+ & ~(sector_t)(block_sectors - 1))
+ - sector;
+
+ while (sect_to_write) {
+ struct bio *wbio;
+ if (sectors > sect_to_write)
+ sectors = sect_to_write;
+ /* Write at 'sector' for 'sectors'*/
+
+ if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
+ unsigned vcnt = r1_bio->behind_page_count;
+ struct bio_vec *vec = r1_bio->behind_bvecs;
+
+ while (!vec->bv_page) {
+ vec++;
+ vcnt--;
+ }
+
+ wbio = bio_alloc_mddev(GFP_NOIO, vcnt, mddev);
+ memcpy(wbio->bi_io_vec, vec, vcnt * sizeof(struct bio_vec));
+
+ wbio->bi_vcnt = vcnt;
+ } else {
+ wbio = bio_clone_mddev(r1_bio->master_bio, GFP_NOIO, mddev);
+ }
+
+ wbio->bi_rw = WRITE;
+ wbio->bi_iter.bi_sector = r1_bio->sector;
+ wbio->bi_iter.bi_size = r1_bio->sectors << 9;
+
+ bio_trim(wbio, sector - r1_bio->sector, sectors);
+ wbio->bi_iter.bi_sector += rdev->data_offset;
+ wbio->bi_bdev = rdev->bdev;
+ if (submit_bio_wait(WRITE, wbio) == 0)
+ /* failure! */
+ ok = rdev_set_badblocks(rdev, sector,
+ sectors, 0)
+ && ok;
+
+ bio_put(wbio);
+ sect_to_write -= sectors;
+ sector += sectors;
+ sectors = block_sectors;
+ }
+ return ok;
+}
+
+static void handle_sync_write_finished(struct r1conf *conf, struct r1bio *r1_bio)
+{
+ int m;
+ int s = r1_bio->sectors;
+ for (m = 0; m < conf->raid_disks * 2 ; m++) {
+ struct md_rdev *rdev = conf->mirrors[m].rdev;
+ struct bio *bio = r1_bio->bios[m];
+ if (bio->bi_end_io == NULL)
+ continue;
+ if (test_bit(BIO_UPTODATE, &bio->bi_flags) &&
+ test_bit(R1BIO_MadeGood, &r1_bio->state)) {
+ rdev_clear_badblocks(rdev, r1_bio->sector, s, 0);
+ }
+ if (!test_bit(BIO_UPTODATE, &bio->bi_flags) &&
+ test_bit(R1BIO_WriteError, &r1_bio->state)) {
+ if (!rdev_set_badblocks(rdev, r1_bio->sector, s, 0))
+ md_error(conf->mddev, rdev);
+ }
+ }
+ put_buf(r1_bio);
+ md_done_sync(conf->mddev, s, 1);
+}
+
+static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio)
+{
+ int m;
+ for (m = 0; m < conf->raid_disks * 2 ; m++)
+ if (r1_bio->bios[m] == IO_MADE_GOOD) {
+ struct md_rdev *rdev = conf->mirrors[m].rdev;
+ rdev_clear_badblocks(rdev,
+ r1_bio->sector,
+ r1_bio->sectors, 0);
+ rdev_dec_pending(rdev, conf->mddev);
+ } else if (r1_bio->bios[m] != NULL) {
+ /* This drive got a write error. We need to
+ * narrow down and record precise write
+ * errors.
+ */
+ if (!narrow_write_error(r1_bio, m)) {
+ md_error(conf->mddev,
+ conf->mirrors[m].rdev);
+ /* an I/O failed, we can't clear the bitmap */
+ set_bit(R1BIO_Degraded, &r1_bio->state);
+ }
+ rdev_dec_pending(conf->mirrors[m].rdev,
+ conf->mddev);
+ }
+ if (test_bit(R1BIO_WriteError, &r1_bio->state))
+ close_write(r1_bio);
+ raid_end_bio_io(r1_bio);
+}
+
+static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio)
+{
+ int disk;
+ int max_sectors;
+ struct mddev *mddev = conf->mddev;
+ struct bio *bio;
+ char b[BDEVNAME_SIZE];
+ struct md_rdev *rdev;
+
+ clear_bit(R1BIO_ReadError, &r1_bio->state);
+ /* we got a read error. Maybe the drive is bad. Maybe just
+ * the block and we can fix it.
+ * We freeze all other IO, and try reading the block from
+ * other devices. When we find one, we re-write
+ * and check it that fixes the read error.
+ * This is all done synchronously while the array is
+ * frozen
+ */
+ if (mddev->ro == 0) {
+ freeze_array(conf, 1);
+ fix_read_error(conf, r1_bio->read_disk,
+ r1_bio->sector, r1_bio->sectors);
+ unfreeze_array(conf);
+ } else
+ md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev);
+ rdev_dec_pending(conf->mirrors[r1_bio->read_disk].rdev, conf->mddev);
+
+ bio = r1_bio->bios[r1_bio->read_disk];
+ bdevname(bio->bi_bdev, b);
+read_more:
+ disk = read_balance(conf, r1_bio, &max_sectors);
+ if (disk == -1) {
+ printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O"
+ " read error for block %llu\n",
+ mdname(mddev), b, (unsigned long long)r1_bio->sector);
+ raid_end_bio_io(r1_bio);
+ } else {
+ const unsigned long do_sync
+ = r1_bio->master_bio->bi_rw & REQ_SYNC;
+ if (bio) {
+ r1_bio->bios[r1_bio->read_disk] =
+ mddev->ro ? IO_BLOCKED : NULL;
+ bio_put(bio);
+ }
+ r1_bio->read_disk = disk;
+ bio = bio_clone_mddev(r1_bio->master_bio, GFP_NOIO, mddev);
+ bio_trim(bio, r1_bio->sector - bio->bi_iter.bi_sector,
+ max_sectors);
+ r1_bio->bios[r1_bio->read_disk] = bio;
+ rdev = conf->mirrors[disk].rdev;
+ printk_ratelimited(KERN_ERR
+ "md/raid1:%s: redirecting sector %llu"
+ " to other mirror: %s\n",
+ mdname(mddev),
+ (unsigned long long)r1_bio->sector,
+ bdevname(rdev->bdev, b));
+ bio->bi_iter.bi_sector = r1_bio->sector + rdev->data_offset;
+ bio->bi_bdev = rdev->bdev;
+ bio->bi_end_io = raid1_end_read_request;
+ bio->bi_rw = READ | do_sync;
+ bio->bi_private = r1_bio;
+ if (max_sectors < r1_bio->sectors) {
+ /* Drat - have to split this up more */
+ struct bio *mbio = r1_bio->master_bio;
+ int sectors_handled = (r1_bio->sector + max_sectors
+ - mbio->bi_iter.bi_sector);
+ r1_bio->sectors = max_sectors;
+ spin_lock_irq(&conf->device_lock);
+ if (mbio->bi_phys_segments == 0)
+ mbio->bi_phys_segments = 2;
+ else
+ mbio->bi_phys_segments++;
+ spin_unlock_irq(&conf->device_lock);
+ generic_make_request(bio);
+ bio = NULL;
+
+ r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
+
+ r1_bio->master_bio = mbio;
+ r1_bio->sectors = bio_sectors(mbio) - sectors_handled;
+ r1_bio->state = 0;
+ set_bit(R1BIO_ReadError, &r1_bio->state);
+ r1_bio->mddev = mddev;
+ r1_bio->sector = mbio->bi_iter.bi_sector +
+ sectors_handled;
+
+ goto read_more;
+ } else
+ generic_make_request(bio);
+ }
+}
+
+static void raid1d(struct md_thread *thread)
+{
+ struct mddev *mddev = thread->mddev;
+ struct r1bio *r1_bio;
+ unsigned long flags;
+ struct r1conf *conf = mddev->private;
+ struct list_head *head = &conf->retry_list;
+ struct blk_plug plug;
+
+ md_check_recovery(mddev);
+
+ blk_start_plug(&plug);
+ for (;;) {
+
+ flush_pending_writes(conf);
+
+ spin_lock_irqsave(&conf->device_lock, flags);
+ if (list_empty(head)) {
+ spin_unlock_irqrestore(&conf->device_lock, flags);
+ break;
+ }
+ r1_bio = list_entry(head->prev, struct r1bio, retry_list);
+ list_del(head->prev);
+ conf->nr_queued--;
+ spin_unlock_irqrestore(&conf->device_lock, flags);
+
+ mddev = r1_bio->mddev;
+ conf = mddev->private;
+ if (test_bit(R1BIO_IsSync, &r1_bio->state)) {
+ if (test_bit(R1BIO_MadeGood, &r1_bio->state) ||
+ test_bit(R1BIO_WriteError, &r1_bio->state))
+ handle_sync_write_finished(conf, r1_bio);
+ else
+ sync_request_write(mddev, r1_bio);
+ } else if (test_bit(R1BIO_MadeGood, &r1_bio->state) ||
+ test_bit(R1BIO_WriteError, &r1_bio->state))
+ handle_write_finished(conf, r1_bio);
+ else if (test_bit(R1BIO_ReadError, &r1_bio->state))
+ handle_read_error(conf, r1_bio);
+ else
+ /* just a partial read to be scheduled from separate
+ * context
+ */
+ generic_make_request(r1_bio->bios[r1_bio->read_disk]);
+
+ cond_resched();
+ if (mddev->flags & ~(1<<MD_CHANGE_PENDING))
+ md_check_recovery(mddev);
+ }
+ blk_finish_plug(&plug);
+}
+
+static int init_resync(struct r1conf *conf)
+{
+ int buffs;
+
+ buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE;
+ BUG_ON(conf->r1buf_pool);
+ conf->r1buf_pool = mempool_create(buffs, r1buf_pool_alloc, r1buf_pool_free,
+ conf->poolinfo);
+ if (!conf->r1buf_pool)
+ return -ENOMEM;
+ conf->next_resync = 0;
+ return 0;
+}
+
+/*
+ * perform a "sync" on one "block"
+ *
+ * We need to make sure that no normal I/O request - particularly write
+ * requests - conflict with active sync requests.
+ *
+ * This is achieved by tracking pending requests and a 'barrier' concept
+ * that can be installed to exclude normal IO requests.
+ */
+
+static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipped)
+{
+ struct r1conf *conf = mddev->private;
+ struct r1bio *r1_bio;
+ struct bio *bio;
+ sector_t max_sector, nr_sectors;
+ int disk = -1;
+ int i;
+ int wonly = -1;
+ int write_targets = 0, read_targets = 0;
+ sector_t sync_blocks;
+ int still_degraded = 0;
+ int good_sectors = RESYNC_SECTORS;
+ int min_bad = 0; /* number of sectors that are bad in all devices */
+
+ if (!conf->r1buf_pool)
+ if (init_resync(conf))
+ return 0;
+
+ max_sector = mddev->dev_sectors;
+ if (sector_nr >= max_sector) {
+ /* If we aborted, we need to abort the
+ * sync on the 'current' bitmap chunk (there will
+ * only be one in raid1 resync.
+ * We can find the current addess in mddev->curr_resync
+ */
+ if (mddev->curr_resync < max_sector) /* aborted */
+ bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
+ &sync_blocks, 1);
+ else /* completed sync */
+ conf->fullsync = 0;
+
+ bitmap_close_sync(mddev->bitmap);
+ close_sync(conf);
+ return 0;
+ }
+
+ if (mddev->bitmap == NULL &&
+ mddev->recovery_cp == MaxSector &&
+ !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
+ conf->fullsync == 0) {
+ *skipped = 1;
+ return max_sector - sector_nr;
+ }
+ /* before building a request, check if we can skip these blocks..
+ * This call the bitmap_start_sync doesn't actually record anything
+ */
+ if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) &&
+ !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
+ /* We can skip this block, and probably several more */
+ *skipped = 1;
+ return sync_blocks;
+ }
+
+ bitmap_cond_end_sync(mddev->bitmap, sector_nr);
+ r1_bio = mempool_alloc(conf->r1buf_pool, GFP_NOIO);
+
+ raise_barrier(conf, sector_nr);
+
+ rcu_read_lock();
+ /*
+ * If we get a correctably read error during resync or recovery,
+ * we might want to read from a different device. So we
+ * flag all drives that could conceivably be read from for READ,
+ * and any others (which will be non-In_sync devices) for WRITE.
+ * If a read fails, we try reading from something else for which READ
+ * is OK.
+ */
+
+ r1_bio->mddev = mddev;
+ r1_bio->sector = sector_nr;
+ r1_bio->state = 0;
+ set_bit(R1BIO_IsSync, &r1_bio->state);
+
+ for (i = 0; i < conf->raid_disks * 2; i++) {
+ struct md_rdev *rdev;
+ bio = r1_bio->bios[i];
+ bio_reset(bio);
+
+ rdev = rcu_dereference(conf->mirrors[i].rdev);
+ if (rdev == NULL ||
+ test_bit(Faulty, &rdev->flags)) {
+ if (i < conf->raid_disks)
+ still_degraded = 1;
+ } else if (!test_bit(In_sync, &rdev->flags)) {
+ bio->bi_rw = WRITE;
+ bio->bi_end_io = end_sync_write;
+ write_targets ++;
+ } else {
+ /* may need to read from here */
+ sector_t first_bad = MaxSector;
+ int bad_sectors;
+
+ if (is_badblock(rdev, sector_nr, good_sectors,
+ &first_bad, &bad_sectors)) {
+ if (first_bad > sector_nr)
+ good_sectors = first_bad - sector_nr;
+ else {
+ bad_sectors -= (sector_nr - first_bad);
+ if (min_bad == 0 ||
+ min_bad > bad_sectors)
+ min_bad = bad_sectors;
+ }
+ }
+ if (sector_nr < first_bad) {
+ if (test_bit(WriteMostly, &rdev->flags)) {
+ if (wonly < 0)
+ wonly = i;
+ } else {
+ if (disk < 0)
+ disk = i;
+ }
+ bio->bi_rw = READ;
+ bio->bi_end_io = end_sync_read;
+ read_targets++;
+ } else if (!test_bit(WriteErrorSeen, &rdev->flags) &&
+ test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
+ !test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) {
+ /*
+ * The device is suitable for reading (InSync),
+ * but has bad block(s) here. Let's try to correct them,
+ * if we are doing resync or repair. Otherwise, leave
+ * this device alone for this sync request.
+ */
+ bio->bi_rw = WRITE;
+ bio->bi_end_io = end_sync_write;
+ write_targets++;
+ }
+ }
+ if (bio->bi_end_io) {
+ atomic_inc(&rdev->nr_pending);
+ bio->bi_iter.bi_sector = sector_nr + rdev->data_offset;
+ bio->bi_bdev = rdev->bdev;
+ bio->bi_private = r1_bio;
+ }
+ }
+ rcu_read_unlock();
+ if (disk < 0)
+ disk = wonly;
+ r1_bio->read_disk = disk;
+
+ if (read_targets == 0 && min_bad > 0) {
+ /* These sectors are bad on all InSync devices, so we
+ * need to mark them bad on all write targets
+ */
+ int ok = 1;
+ for (i = 0 ; i < conf->raid_disks * 2 ; i++)
+ if (r1_bio->bios[i]->bi_end_io == end_sync_write) {
+ struct md_rdev *rdev = conf->mirrors[i].rdev;
+ ok = rdev_set_badblocks(rdev, sector_nr,
+ min_bad, 0
+ ) && ok;
+ }
+ set_bit(MD_CHANGE_DEVS, &mddev->flags);
+ *skipped = 1;
+ put_buf(r1_bio);
+
+ if (!ok) {
+ /* Cannot record the badblocks, so need to
+ * abort the resync.
+ * If there are multiple read targets, could just
+ * fail the really bad ones ???
+ */
+ conf->recovery_disabled = mddev->recovery_disabled;
+ set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+ return 0;
+ } else
+ return min_bad;
+
+ }
+ if (min_bad > 0 && min_bad < good_sectors) {
+ /* only resync enough to reach the next bad->good
+ * transition */
+ good_sectors = min_bad;
+ }
+
+ if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && read_targets > 0)
+ /* extra read targets are also write targets */
+ write_targets += read_targets-1;
+
+ if (write_targets == 0 || read_targets == 0) {
+ /* There is nowhere to write, so all non-sync
+ * drives must be failed - so we are finished
+ */
+ sector_t rv;
+ if (min_bad > 0)
+ max_sector = sector_nr + min_bad;
+ rv = max_sector - sector_nr;
+ *skipped = 1;
+ put_buf(r1_bio);
+ return rv;
+ }
+
+ if (max_sector > mddev->resync_max)
+ max_sector = mddev->resync_max; /* Don't do IO beyond here */
+ if (max_sector > sector_nr + good_sectors)
+ max_sector = sector_nr + good_sectors;
+ nr_sectors = 0;
+ sync_blocks = 0;
+ do {
+ struct page *page;
+ int len = PAGE_SIZE;
+ if (sector_nr + (len>>9) > max_sector)
+ len = (max_sector - sector_nr) << 9;
+ if (len == 0)
+ break;
+ if (sync_blocks == 0) {
+ if (!bitmap_start_sync(mddev->bitmap, sector_nr,
+ &sync_blocks, still_degraded) &&
+ !conf->fullsync &&
+ !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
+ break;
+ BUG_ON(sync_blocks < (PAGE_SIZE>>9));
+ if ((len >> 9) > sync_blocks)
+ len = sync_blocks<<9;
+ }
+
+ for (i = 0 ; i < conf->raid_disks * 2; i++) {
+ bio = r1_bio->bios[i];
+ if (bio->bi_end_io) {
+ page = bio->bi_io_vec[bio->bi_vcnt].bv_page;
+ if (bio_add_page(bio, page, len, 0) == 0) {
+ /* stop here */
+ bio->bi_io_vec[bio->bi_vcnt].bv_page = page;
+ while (i > 0) {
+ i--;
+ bio = r1_bio->bios[i];
+ if (bio->bi_end_io==NULL)
+ continue;
+ /* remove last page from this bio */
+ bio->bi_vcnt--;
+ bio->bi_iter.bi_size -= len;
+ __clear_bit(BIO_SEG_VALID, &bio->bi_flags);
+ }
+ goto bio_full;
+ }
+ }
+ }
+ nr_sectors += len>>9;
+ sector_nr += len>>9;
+ sync_blocks -= (len>>9);
+ } while (r1_bio->bios[disk]->bi_vcnt < RESYNC_PAGES);
+ bio_full:
+ r1_bio->sectors = nr_sectors;
+
+ /* For a user-requested sync, we read all readable devices and do a
+ * compare
+ */
+ if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
+ atomic_set(&r1_bio->remaining, read_targets);
+ for (i = 0; i < conf->raid_disks * 2 && read_targets; i++) {
+ bio = r1_bio->bios[i];
+ if (bio->bi_end_io == end_sync_read) {
+ read_targets--;
+ md_sync_acct(bio->bi_bdev, nr_sectors);
+ generic_make_request(bio);
+ }
+ }
+ } else {
+ atomic_set(&r1_bio->remaining, 1);
+ bio = r1_bio->bios[r1_bio->read_disk];
+ md_sync_acct(bio->bi_bdev, nr_sectors);
+ generic_make_request(bio);
+
+ }
+ return nr_sectors;
+}
+
+static sector_t raid1_size(struct mddev *mddev, sector_t sectors, int raid_disks)
+{
+ if (sectors)
+ return sectors;
+
+ return mddev->dev_sectors;
+}
+
+static struct r1conf *setup_conf(struct mddev *mddev)
+{
+ struct r1conf *conf;
+ int i;
+ struct raid1_info *disk;
+ struct md_rdev *rdev;
+ int err = -ENOMEM;
+
+ conf = kzalloc(sizeof(struct r1conf), GFP_KERNEL);
+ if (!conf)
+ goto abort;
+
+ conf->mirrors = kzalloc(sizeof(struct raid1_info)
+ * mddev->raid_disks * 2,
+ GFP_KERNEL);
+ if (!conf->mirrors)
+ goto abort;
+
+ conf->tmppage = alloc_page(GFP_KERNEL);
+ if (!conf->tmppage)
+ goto abort;
+
+ conf->poolinfo = kzalloc(sizeof(*conf->poolinfo), GFP_KERNEL);
+ if (!conf->poolinfo)
+ goto abort;
+ conf->poolinfo->raid_disks = mddev->raid_disks * 2;
+ conf->r1bio_pool = mempool_create(NR_RAID1_BIOS, r1bio_pool_alloc,
+ r1bio_pool_free,
+ conf->poolinfo);
+ if (!conf->r1bio_pool)
+ goto abort;
+
+ conf->poolinfo->mddev = mddev;
+
+ err = -EINVAL;
+ spin_lock_init(&conf->device_lock);
+ rdev_for_each(rdev, mddev) {
+ struct request_queue *q;
+ int disk_idx = rdev->raid_disk;
+ if (disk_idx >= mddev->raid_disks
+ || disk_idx < 0)
+ continue;
+ if (test_bit(Replacement, &rdev->flags))
+ disk = conf->mirrors + mddev->raid_disks + disk_idx;
+ else
+ disk = conf->mirrors + disk_idx;
+
+ if (disk->rdev)
+ goto abort;
+ disk->rdev = rdev;
+ q = bdev_get_queue(rdev->bdev);
+ if (q->merge_bvec_fn)
+ mddev->merge_check_needed = 1;
+
+ disk->head_position = 0;
+ disk->seq_start = MaxSector;
+ }
+ conf->raid_disks = mddev->raid_disks;
+ conf->mddev = mddev;
+ INIT_LIST_HEAD(&conf->retry_list);
+
+ spin_lock_init(&conf->resync_lock);
+ init_waitqueue_head(&conf->wait_barrier);
+
+ bio_list_init(&conf->pending_bio_list);
+ conf->pending_count = 0;
+ conf->recovery_disabled = mddev->recovery_disabled - 1;
+
+ conf->start_next_window = MaxSector;
+ conf->current_window_requests = conf->next_window_requests = 0;
+
+ err = -EIO;
+ for (i = 0; i < conf->raid_disks * 2; i++) {
+
+ disk = conf->mirrors + i;
+
+ if (i < conf->raid_disks &&
+ disk[conf->raid_disks].rdev) {
+ /* This slot has a replacement. */
+ if (!disk->rdev) {
+ /* No original, just make the replacement
+ * a recovering spare
+ */
+ disk->rdev =
+ disk[conf->raid_disks].rdev;
+ disk[conf->raid_disks].rdev = NULL;
+ } else if (!test_bit(In_sync, &disk->rdev->flags))
+ /* Original is not in_sync - bad */
+ goto abort;
+ }
+
+ if (!disk->rdev ||
+ !test_bit(In_sync, &disk->rdev->flags)) {
+ disk->head_position = 0;
+ if (disk->rdev &&
+ (disk->rdev->saved_raid_disk < 0))
+ conf->fullsync = 1;
+ }
+ }
+
+ err = -ENOMEM;
+ conf->thread = md_register_thread(raid1d, mddev, "raid1");
+ if (!conf->thread) {
+ printk(KERN_ERR
+ "md/raid1:%s: couldn't allocate thread\n",
+ mdname(mddev));
+ goto abort;
+ }
+
+ return conf;
+
+ abort:
+ if (conf) {
+ if (conf->r1bio_pool)
+ mempool_destroy(conf->r1bio_pool);
+ kfree(conf->mirrors);
+ safe_put_page(conf->tmppage);
+ kfree(conf->poolinfo);
+ kfree(conf);
+ }
+ return ERR_PTR(err);
+}
+
+static void raid1_free(struct mddev *mddev, void *priv);
+static int run(struct mddev *mddev)
+{
+ struct r1conf *conf;
+ int i;
+ struct md_rdev *rdev;
+ int ret;
+ bool discard_supported = false;
+
+ if (mddev->level != 1) {
+ printk(KERN_ERR "md/raid1:%s: raid level not set to mirroring (%d)\n",
+ mdname(mddev), mddev->level);
+ return -EIO;
+ }
+ if (mddev->reshape_position != MaxSector) {
+ printk(KERN_ERR "md/raid1:%s: reshape_position set but not supported\n",
+ mdname(mddev));
+ return -EIO;
+ }
+ /*
+ * copy the already verified devices into our private RAID1
+ * bookkeeping area. [whatever we allocate in run(),
+ * should be freed in raid1_free()]
+ */
+ if (mddev->private == NULL)
+ conf = setup_conf(mddev);
+ else
+ conf = mddev->private;
+
+ if (IS_ERR(conf))
+ return PTR_ERR(conf);
+
+ if (mddev->queue)
+ blk_queue_max_write_same_sectors(mddev->queue, 0);
+
+ rdev_for_each(rdev, mddev) {
+ if (!mddev->gendisk)
+ continue;
+ disk_stack_limits(mddev->gendisk, rdev->bdev,
+ rdev->data_offset << 9);
+ if (blk_queue_discard(bdev_get_queue(rdev->bdev)))
+ discard_supported = true;
+ }
+
+ mddev->degraded = 0;
+ for (i=0; i < conf->raid_disks; i++)
+ if (conf->mirrors[i].rdev == NULL ||
+ !test_bit(In_sync, &conf->mirrors[i].rdev->flags) ||
+ test_bit(Faulty, &conf->mirrors[i].rdev->flags))
+ mddev->degraded++;
+
+ if (conf->raid_disks - mddev->degraded == 1)
+ mddev->recovery_cp = MaxSector;
+
+ if (mddev->recovery_cp != MaxSector)
+ printk(KERN_NOTICE "md/raid1:%s: not clean"
+ " -- starting background reconstruction\n",
+ mdname(mddev));
+ printk(KERN_INFO
+ "md/raid1:%s: active with %d out of %d mirrors\n",
+ mdname(mddev), mddev->raid_disks - mddev->degraded,
+ mddev->raid_disks);
+
+ /*
+ * Ok, everything is just fine now
+ */
+ mddev->thread = conf->thread;
+ conf->thread = NULL;
+ mddev->private = conf;
+
+ md_set_array_sectors(mddev, raid1_size(mddev, 0, 0));
+
+ if (mddev->queue) {
+ if (discard_supported)
+ queue_flag_set_unlocked(QUEUE_FLAG_DISCARD,
+ mddev->queue);
+ else
+ queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD,
+ mddev->queue);
+ }
+
+ ret = md_integrity_register(mddev);
+ if (ret) {
+ md_unregister_thread(&mddev->thread);
+ raid1_free(mddev, conf);
+ }
+ return ret;
+}
+
+static void raid1_free(struct mddev *mddev, void *priv)
+{
+ struct r1conf *conf = priv;
+
+ if (conf->r1bio_pool)
+ mempool_destroy(conf->r1bio_pool);
+ kfree(conf->mirrors);
+ safe_put_page(conf->tmppage);
+ kfree(conf->poolinfo);
+ kfree(conf);
+}
+
+static int raid1_resize(struct mddev *mddev, sector_t sectors)
+{
+ /* no resync is happening, and there is enough space
+ * on all devices, so we can resize.
+ * We need to make sure resync covers any new space.
+ * If the array is shrinking we should possibly wait until
+ * any io in the removed space completes, but it hardly seems
+ * worth it.
+ */
+ sector_t newsize = raid1_size(mddev, sectors, 0);
+ if (mddev->external_size &&
+ mddev->array_sectors > newsize)
+ return -EINVAL;
+ if (mddev->bitmap) {
+ int ret = bitmap_resize(mddev->bitmap, newsize, 0, 0);
+ if (ret)
+ return ret;
+ }
+ md_set_array_sectors(mddev, newsize);
+ set_capacity(mddev->gendisk, mddev->array_sectors);
+ revalidate_disk(mddev->gendisk);
+ if (sectors > mddev->dev_sectors &&
+ mddev->recovery_cp > mddev->dev_sectors) {
+ mddev->recovery_cp = mddev->dev_sectors;
+ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+ }
+ mddev->dev_sectors = sectors;
+ mddev->resync_max_sectors = sectors;
+ return 0;
+}
+
+static int raid1_reshape(struct mddev *mddev)
+{
+ /* We need to:
+ * 1/ resize the r1bio_pool
+ * 2/ resize conf->mirrors
+ *
+ * We allocate a new r1bio_pool if we can.
+ * Then raise a device barrier and wait until all IO stops.
+ * Then resize conf->mirrors and swap in the new r1bio pool.
+ *
+ * At the same time, we "pack" the devices so that all the missing
+ * devices have the higher raid_disk numbers.
+ */
+ mempool_t *newpool, *oldpool;
+ struct pool_info *newpoolinfo;
+ struct raid1_info *newmirrors;
+ struct r1conf *conf = mddev->private;
+ int cnt, raid_disks;
+ unsigned long flags;
+ int d, d2, err;
+
+ /* Cannot change chunk_size, layout, or level */
+ if (mddev->chunk_sectors != mddev->new_chunk_sectors ||
+ mddev->layout != mddev->new_layout ||
+ mddev->level != mddev->new_level) {
+ mddev->new_chunk_sectors = mddev->chunk_sectors;
+ mddev->new_layout = mddev->layout;
+ mddev->new_level = mddev->level;
+ return -EINVAL;
+ }
+
+ err = md_allow_write(mddev);
+ if (err)
+ return err;
+
+ raid_disks = mddev->raid_disks + mddev->delta_disks;
+
+ if (raid_disks < conf->raid_disks) {
+ cnt=0;
+ for (d= 0; d < conf->raid_disks; d++)
+ if (conf->mirrors[d].rdev)
+ cnt++;
+ if (cnt > raid_disks)
+ return -EBUSY;
+ }
+
+ newpoolinfo = kmalloc(sizeof(*newpoolinfo), GFP_KERNEL);
+ if (!newpoolinfo)
+ return -ENOMEM;
+ newpoolinfo->mddev = mddev;
+ newpoolinfo->raid_disks = raid_disks * 2;
+
+ newpool = mempool_create(NR_RAID1_BIOS, r1bio_pool_alloc,
+ r1bio_pool_free, newpoolinfo);
+ if (!newpool) {
+ kfree(newpoolinfo);
+ return -ENOMEM;
+ }
+ newmirrors = kzalloc(sizeof(struct raid1_info) * raid_disks * 2,
+ GFP_KERNEL);
+ if (!newmirrors) {
+ kfree(newpoolinfo);
+ mempool_destroy(newpool);
+ return -ENOMEM;
+ }
+
+ freeze_array(conf, 0);
+
+ /* ok, everything is stopped */
+ oldpool = conf->r1bio_pool;
+ conf->r1bio_pool = newpool;
+
+ for (d = d2 = 0; d < conf->raid_disks; d++) {
+ struct md_rdev *rdev = conf->mirrors[d].rdev;
+ if (rdev && rdev->raid_disk != d2) {
+ sysfs_unlink_rdev(mddev, rdev);
+ rdev->raid_disk = d2;
+ sysfs_unlink_rdev(mddev, rdev);
+ if (sysfs_link_rdev(mddev, rdev))
+ printk(KERN_WARNING
+ "md/raid1:%s: cannot register rd%d\n",
+ mdname(mddev), rdev->raid_disk);
+ }
+ if (rdev)
+ newmirrors[d2++].rdev = rdev;
+ }
+ kfree(conf->mirrors);
+ conf->mirrors = newmirrors;
+ kfree(conf->poolinfo);
+ conf->poolinfo = newpoolinfo;
+
+ spin_lock_irqsave(&conf->device_lock, flags);
+ mddev->degraded += (raid_disks - conf->raid_disks);
+ spin_unlock_irqrestore(&conf->device_lock, flags);
+ conf->raid_disks = mddev->raid_disks = raid_disks;
+ mddev->delta_disks = 0;
+
+ unfreeze_array(conf);
+
+ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+ md_wakeup_thread(mddev->thread);
+
+ mempool_destroy(oldpool);
+ return 0;
+}
+
+static void raid1_quiesce(struct mddev *mddev, int state)
+{
+ struct r1conf *conf = mddev->private;
+
+ switch(state) {
+ case 2: /* wake for suspend */
+ wake_up(&conf->wait_barrier);
+ break;
+ case 1:
+ freeze_array(conf, 0);
+ break;
+ case 0:
+ unfreeze_array(conf);
+ break;
+ }
+}
+
+static void *raid1_takeover(struct mddev *mddev)
+{
+ /* raid1 can take over:
+ * raid5 with 2 devices, any layout or chunk size
+ */
+ if (mddev->level == 5 && mddev->raid_disks == 2) {
+ struct r1conf *conf;
+ mddev->new_level = 1;
+ mddev->new_layout = 0;
+ mddev->new_chunk_sectors = 0;
+ conf = setup_conf(mddev);
+ if (!IS_ERR(conf))
+ /* Array must appear to be quiesced */
+ conf->array_frozen = 1;
+ return conf;
+ }
+ return ERR_PTR(-EINVAL);
+}
+
+static struct md_personality raid1_personality =
+{
+ .name = "raid1",
+ .level = 1,
+ .owner = THIS_MODULE,
+ .make_request = make_request,
+ .run = run,
+ .free = raid1_free,
+ .status = status,
+ .error_handler = error,
+ .hot_add_disk = raid1_add_disk,
+ .hot_remove_disk= raid1_remove_disk,
+ .spare_active = raid1_spare_active,
+ .sync_request = sync_request,
+ .resize = raid1_resize,
+ .size = raid1_size,
+ .check_reshape = raid1_reshape,
+ .quiesce = raid1_quiesce,
+ .takeover = raid1_takeover,
+ .congested = raid1_congested,
+ .mergeable_bvec = raid1_mergeable_bvec,
+};
+
+static int __init raid_init(void)
+{
+ return register_md_personality(&raid1_personality);
+}
+
+static void raid_exit(void)
+{
+ unregister_md_personality(&raid1_personality);
+}
+
+module_init(raid_init);
+module_exit(raid_exit);
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("RAID1 (mirroring) personality for MD");
+MODULE_ALIAS("md-personality-3"); /* RAID1 */
+MODULE_ALIAS("md-raid1");
+MODULE_ALIAS("md-level-1");
+
+module_param(max_queued_requests, int, S_IRUGO|S_IWUSR);
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
new file mode 100644
index 000000000..14ebb288c
--- /dev/null
+++ b/drivers/md/raid1.h
@@ -0,0 +1,173 @@
+#ifndef _RAID1_H
+#define _RAID1_H
+
+struct raid1_info {
+ struct md_rdev *rdev;
+ sector_t head_position;
+
+ /* When choose the best device for a read (read_balance())
+ * we try to keep sequential reads one the same device
+ */
+ sector_t next_seq_sect;
+ sector_t seq_start;
+};
+
+/*
+ * memory pools need a pointer to the mddev, so they can force an unplug
+ * when memory is tight, and a count of the number of drives that the
+ * pool was allocated for, so they know how much to allocate and free.
+ * mddev->raid_disks cannot be used, as it can change while a pool is active
+ * These two datums are stored in a kmalloced struct.
+ * The 'raid_disks' here is twice the raid_disks in r1conf.
+ * This allows space for each 'real' device can have a replacement in the
+ * second half of the array.
+ */
+
+struct pool_info {
+ struct mddev *mddev;
+ int raid_disks;
+};
+
+struct r1conf {
+ struct mddev *mddev;
+ struct raid1_info *mirrors; /* twice 'raid_disks' to
+ * allow for replacements.
+ */
+ int raid_disks;
+
+ /* During resync, read_balancing is only allowed on the part
+ * of the array that has been resynced. 'next_resync' tells us
+ * where that is.
+ */
+ sector_t next_resync;
+
+ /* When raid1 starts resync, we divide array into four partitions
+ * |---------|--------------|---------------------|-------------|
+ * next_resync start_next_window end_window
+ * start_next_window = next_resync + NEXT_NORMALIO_DISTANCE
+ * end_window = start_next_window + NEXT_NORMALIO_DISTANCE
+ * current_window_requests means the count of normalIO between
+ * start_next_window and end_window.
+ * next_window_requests means the count of normalIO after end_window.
+ * */
+ sector_t start_next_window;
+ int current_window_requests;
+ int next_window_requests;
+
+ spinlock_t device_lock;
+
+ /* list of 'struct r1bio' that need to be processed by raid1d,
+ * whether to retry a read, writeout a resync or recovery
+ * block, or anything else.
+ */
+ struct list_head retry_list;
+
+ /* queue pending writes to be submitted on unplug */
+ struct bio_list pending_bio_list;
+ int pending_count;
+
+ /* for use when syncing mirrors:
+ * We don't allow both normal IO and resync/recovery IO at
+ * the same time - resync/recovery can only happen when there
+ * is no other IO. So when either is active, the other has to wait.
+ * See more details description in raid1.c near raise_barrier().
+ */
+ wait_queue_head_t wait_barrier;
+ spinlock_t resync_lock;
+ int nr_pending;
+ int nr_waiting;
+ int nr_queued;
+ int barrier;
+ int array_frozen;
+
+ /* Set to 1 if a full sync is needed, (fresh device added).
+ * Cleared when a sync completes.
+ */
+ int fullsync;
+
+ /* When the same as mddev->recovery_disabled we don't allow
+ * recovery to be attempted as we expect a read error.
+ */
+ int recovery_disabled;
+
+ /* poolinfo contains information about the content of the
+ * mempools - it changes when the array grows or shrinks
+ */
+ struct pool_info *poolinfo;
+ mempool_t *r1bio_pool;
+ mempool_t *r1buf_pool;
+
+ /* temporary buffer to synchronous IO when attempting to repair
+ * a read error.
+ */
+ struct page *tmppage;
+
+ /* When taking over an array from a different personality, we store
+ * the new thread here until we fully activate the array.
+ */
+ struct md_thread *thread;
+};
+
+/*
+ * this is our 'private' RAID1 bio.
+ *
+ * it contains information about what kind of IO operations were started
+ * for this RAID1 operation, and about their status:
+ */
+
+struct r1bio {
+ atomic_t remaining; /* 'have we finished' count,
+ * used from IRQ handlers
+ */
+ atomic_t behind_remaining; /* number of write-behind ios remaining
+ * in this BehindIO request
+ */
+ sector_t sector;
+ sector_t start_next_window;
+ int sectors;
+ unsigned long state;
+ struct mddev *mddev;
+ /*
+ * original bio going to /dev/mdx
+ */
+ struct bio *master_bio;
+ /*
+ * if the IO is in READ direction, then this is where we read
+ */
+ int read_disk;
+
+ struct list_head retry_list;
+ /* Next two are only valid when R1BIO_BehindIO is set */
+ struct bio_vec *behind_bvecs;
+ int behind_page_count;
+ /*
+ * if the IO is in WRITE direction, then multiple bios are used.
+ * We choose the number when they are allocated.
+ */
+ struct bio *bios[0];
+ /* DO NOT PUT ANY NEW FIELDS HERE - bios array is contiguously alloced*/
+};
+
+/* bits for r1bio.state */
+#define R1BIO_Uptodate 0
+#define R1BIO_IsSync 1
+#define R1BIO_Degraded 2
+#define R1BIO_BehindIO 3
+/* Set ReadError on bios that experience a readerror so that
+ * raid1d knows what to do with them.
+ */
+#define R1BIO_ReadError 4
+/* For write-behind requests, we call bi_end_io when
+ * the last non-write-behind device completes, providing
+ * any write was successful. Otherwise we call when
+ * any write-behind write succeeds, otherwise we call
+ * with failure when last write completes (and all failed).
+ * Record that bi_end_io was called with this flag...
+ */
+#define R1BIO_Returned 6
+/* If a write for this request means we can clear some
+ * known-bad-block records, we set this flag
+ */
+#define R1BIO_MadeGood 7
+#define R1BIO_WriteError 8
+#endif
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
new file mode 100644
index 000000000..f55c3f35b
--- /dev/null
+++ b/drivers/md/raid10.c
@@ -0,0 +1,4731 @@
+/*
+ * raid10.c : Multiple Devices driver for Linux
+ *
+ * Copyright (C) 2000-2004 Neil Brown
+ *
+ * RAID-10 support for md.
+ *
+ * Base on code in raid1.c. See raid1.c for further copyright information.
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * You should have received a copy of the GNU General Public License
+ * (for example /usr/src/linux/COPYING); if not, write to the Free
+ * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/slab.h>
+#include <linux/delay.h>
+#include <linux/blkdev.h>
+#include <linux/module.h>
+#include <linux/seq_file.h>
+#include <linux/ratelimit.h>
+#include <linux/kthread.h>
+#include "md.h"
+#include "raid10.h"
+#include "raid0.h"
+#include "bitmap.h"
+
+/*
+ * RAID10 provides a combination of RAID0 and RAID1 functionality.
+ * The layout of data is defined by
+ * chunk_size
+ * raid_disks
+ * near_copies (stored in low byte of layout)
+ * far_copies (stored in second byte of layout)
+ * far_offset (stored in bit 16 of layout )
+ * use_far_sets (stored in bit 17 of layout )
+ *
+ * The data to be stored is divided into chunks using chunksize. Each device
+ * is divided into far_copies sections. In each section, chunks are laid out
+ * in a style similar to raid0, but near_copies copies of each chunk is stored
+ * (each on a different drive). The starting device for each section is offset
+ * near_copies from the starting device of the previous section. Thus there
+ * are (near_copies * far_copies) of each chunk, and each is on a different
+ * drive. near_copies and far_copies must be at least one, and their product
+ * is at most raid_disks.
+ *
+ * If far_offset is true, then the far_copies are handled a bit differently.
+ * The copies are still in different stripes, but instead of being very far
+ * apart on disk, there are adjacent stripes.
+ *
+ * The far and offset algorithms are handled slightly differently if
+ * 'use_far_sets' is true. In this case, the array's devices are grouped into
+ * sets that are (near_copies * far_copies) in size. The far copied stripes
+ * are still shifted by 'near_copies' devices, but this shifting stays confined
+ * to the set rather than the entire array. This is done to improve the number
+ * of device combinations that can fail without causing the array to fail.
+ * Example 'far' algorithm w/o 'use_far_sets' (each letter represents a chunk
+ * on a device):
+ * A B C D A B C D E
+ * ... ...
+ * D A B C E A B C D
+ * Example 'far' algorithm w/ 'use_far_sets' enabled (sets illustrated w/ []'s):
+ * [A B] [C D] [A B] [C D E]
+ * |...| |...| |...| | ... |
+ * [B A] [D C] [B A] [E C D]
+ */
+
+/*
+ * Number of guaranteed r10bios in case of extreme VM load:
+ */
+#define NR_RAID10_BIOS 256
+
+/* when we get a read error on a read-only array, we redirect to another
+ * device without failing the first device, or trying to over-write to
+ * correct the read error. To keep track of bad blocks on a per-bio
+ * level, we store IO_BLOCKED in the appropriate 'bios' pointer
+ */
+#define IO_BLOCKED ((struct bio *)1)
+/* When we successfully write to a known bad-block, we need to remove the
+ * bad-block marking which must be done from process context. So we record
+ * the success by setting devs[n].bio to IO_MADE_GOOD
+ */
+#define IO_MADE_GOOD ((struct bio *)2)
+
+#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2)
+
+/* When there are this many requests queued to be written by
+ * the raid10 thread, we become 'congested' to provide back-pressure
+ * for writeback.
+ */
+static int max_queued_requests = 1024;
+
+static void allow_barrier(struct r10conf *conf);
+static void lower_barrier(struct r10conf *conf);
+static int _enough(struct r10conf *conf, int previous, int ignore);
+static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
+ int *skipped);
+static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio);
+static void end_reshape_write(struct bio *bio, int error);
+static void end_reshape(struct r10conf *conf);
+
+static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
+{
+ struct r10conf *conf = data;
+ int size = offsetof(struct r10bio, devs[conf->copies]);
+
+ /* allocate a r10bio with room for raid_disks entries in the
+ * bios array */
+ return kzalloc(size, gfp_flags);
+}
+
+static void r10bio_pool_free(void *r10_bio, void *data)
+{
+ kfree(r10_bio);
+}
+
+/* Maximum size of each resync request */
+#define RESYNC_BLOCK_SIZE (64*1024)
+#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)
+/* amount of memory to reserve for resync requests */
+#define RESYNC_WINDOW (1024*1024)
+/* maximum number of concurrent requests, memory permitting */
+#define RESYNC_DEPTH (32*1024*1024/RESYNC_BLOCK_SIZE)
+
+/*
+ * When performing a resync, we need to read and compare, so
+ * we need as many pages are there are copies.
+ * When performing a recovery, we need 2 bios, one for read,
+ * one for write (we recover only one drive per r10buf)
+ *
+ */
+static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
+{
+ struct r10conf *conf = data;
+ struct page *page;
+ struct r10bio *r10_bio;
+ struct bio *bio;
+ int i, j;
+ int nalloc;
+
+ r10_bio = r10bio_pool_alloc(gfp_flags, conf);
+ if (!r10_bio)
+ return NULL;
+
+ if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery) ||
+ test_bit(MD_RECOVERY_RESHAPE, &conf->mddev->recovery))
+ nalloc = conf->copies; /* resync */
+ else
+ nalloc = 2; /* recovery */
+
+ /*
+ * Allocate bios.
+ */
+ for (j = nalloc ; j-- ; ) {
+ bio = bio_kmalloc(gfp_flags, RESYNC_PAGES);
+ if (!bio)
+ goto out_free_bio;
+ r10_bio->devs[j].bio = bio;
+ if (!conf->have_replacement)
+ continue;
+ bio = bio_kmalloc(gfp_flags, RESYNC_PAGES);
+ if (!bio)
+ goto out_free_bio;
+ r10_bio->devs[j].repl_bio = bio;
+ }
+ /*
+ * Allocate RESYNC_PAGES data pages and attach them
+ * where needed.
+ */
+ for (j = 0 ; j < nalloc; j++) {
+ struct bio *rbio = r10_bio->devs[j].repl_bio;
+ bio = r10_bio->devs[j].bio;
+ for (i = 0; i < RESYNC_PAGES; i++) {
+ if (j > 0 && !test_bit(MD_RECOVERY_SYNC,
+ &conf->mddev->recovery)) {
+ /* we can share bv_page's during recovery
+ * and reshape */
+ struct bio *rbio = r10_bio->devs[0].bio;
+ page = rbio->bi_io_vec[i].bv_page;
+ get_page(page);
+ } else
+ page = alloc_page(gfp_flags);
+ if (unlikely(!page))
+ goto out_free_pages;
+
+ bio->bi_io_vec[i].bv_page = page;
+ if (rbio)
+ rbio->bi_io_vec[i].bv_page = page;
+ }
+ }
+
+ return r10_bio;
+
+out_free_pages:
+ for ( ; i > 0 ; i--)
+ safe_put_page(bio->bi_io_vec[i-1].bv_page);
+ while (j--)
+ for (i = 0; i < RESYNC_PAGES ; i++)
+ safe_put_page(r10_bio->devs[j].bio->bi_io_vec[i].bv_page);
+ j = 0;
+out_free_bio:
+ for ( ; j < nalloc; j++) {
+ if (r10_bio->devs[j].bio)
+ bio_put(r10_bio->devs[j].bio);
+ if (r10_bio->devs[j].repl_bio)
+ bio_put(r10_bio->devs[j].repl_bio);
+ }
+ r10bio_pool_free(r10_bio, conf);
+ return NULL;
+}
+
+static void r10buf_pool_free(void *__r10_bio, void *data)
+{
+ int i;
+ struct r10conf *conf = data;
+ struct r10bio *r10bio = __r10_bio;
+ int j;
+
+ for (j=0; j < conf->copies; j++) {
+ struct bio *bio = r10bio->devs[j].bio;
+ if (bio) {
+ for (i = 0; i < RESYNC_PAGES; i++) {
+ safe_put_page(bio->bi_io_vec[i].bv_page);
+ bio->bi_io_vec[i].bv_page = NULL;
+ }
+ bio_put(bio);
+ }
+ bio = r10bio->devs[j].repl_bio;
+ if (bio)
+ bio_put(bio);
+ }
+ r10bio_pool_free(r10bio, conf);
+}
+
+static void put_all_bios(struct r10conf *conf, struct r10bio *r10_bio)
+{
+ int i;
+
+ for (i = 0; i < conf->copies; i++) {
+ struct bio **bio = & r10_bio->devs[i].bio;
+ if (!BIO_SPECIAL(*bio))
+ bio_put(*bio);
+ *bio = NULL;
+ bio = &r10_bio->devs[i].repl_bio;
+ if (r10_bio->read_slot < 0 && !BIO_SPECIAL(*bio))
+ bio_put(*bio);
+ *bio = NULL;
+ }
+}
+
+static void free_r10bio(struct r10bio *r10_bio)
+{
+ struct r10conf *conf = r10_bio->mddev->private;
+
+ put_all_bios(conf, r10_bio);
+ mempool_free(r10_bio, conf->r10bio_pool);
+}
+
+static void put_buf(struct r10bio *r10_bio)
+{
+ struct r10conf *conf = r10_bio->mddev->private;
+
+ mempool_free(r10_bio, conf->r10buf_pool);
+
+ lower_barrier(conf);
+}
+
+static void reschedule_retry(struct r10bio *r10_bio)
+{
+ unsigned long flags;
+ struct mddev *mddev = r10_bio->mddev;
+ struct r10conf *conf = mddev->private;
+
+ spin_lock_irqsave(&conf->device_lock, flags);
+ list_add(&r10_bio->retry_list, &conf->retry_list);
+ conf->nr_queued ++;
+ spin_unlock_irqrestore(&conf->device_lock, flags);
+
+ /* wake up frozen array... */
+ wake_up(&conf->wait_barrier);
+
+ md_wakeup_thread(mddev->thread);
+}
+
+/*
+ * raid_end_bio_io() is called when we have finished servicing a mirrored
+ * operation and are ready to return a success/failure code to the buffer
+ * cache layer.
+ */
+static void raid_end_bio_io(struct r10bio *r10_bio)
+{
+ struct bio *bio = r10_bio->master_bio;
+ int done;
+ struct r10conf *conf = r10_bio->mddev->private;
+
+ if (bio->bi_phys_segments) {
+ unsigned long flags;
+ spin_lock_irqsave(&conf->device_lock, flags);
+ bio->bi_phys_segments--;
+ done = (bio->bi_phys_segments == 0);
+ spin_unlock_irqrestore(&conf->device_lock, flags);
+ } else
+ done = 1;
+ if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
+ clear_bit(BIO_UPTODATE, &bio->bi_flags);
+ if (done) {
+ bio_endio(bio, 0);
+ /*
+ * Wake up any possible resync thread that waits for the device
+ * to go idle.
+ */
+ allow_barrier(conf);
+ }
+ free_r10bio(r10_bio);
+}
+
+/*
+ * Update disk head position estimator based on IRQ completion info.
+ */
+static inline void update_head_pos(int slot, struct r10bio *r10_bio)
+{
+ struct r10conf *conf = r10_bio->mddev->private;
+
+ conf->mirrors[r10_bio->devs[slot].devnum].head_position =
+ r10_bio->devs[slot].addr + (r10_bio->sectors);
+}
+
+/*
+ * Find the disk number which triggered given bio
+ */
+static int find_bio_disk(struct r10conf *conf, struct r10bio *r10_bio,
+ struct bio *bio, int *slotp, int *replp)
+{
+ int slot;
+ int repl = 0;
+
+ for (slot = 0; slot < conf->copies; slot++) {
+ if (r10_bio->devs[slot].bio == bio)
+ break;
+ if (r10_bio->devs[slot].repl_bio == bio) {
+ repl = 1;
+ break;
+ }
+ }
+
+ BUG_ON(slot == conf->copies);
+ update_head_pos(slot, r10_bio);
+
+ if (slotp)
+ *slotp = slot;
+ if (replp)
+ *replp = repl;
+ return r10_bio->devs[slot].devnum;
+}
+
+static void raid10_end_read_request(struct bio *bio, int error)
+{
+ int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+ struct r10bio *r10_bio = bio->bi_private;
+ int slot, dev;
+ struct md_rdev *rdev;
+ struct r10conf *conf = r10_bio->mddev->private;
+
+ slot = r10_bio->read_slot;
+ dev = r10_bio->devs[slot].devnum;
+ rdev = r10_bio->devs[slot].rdev;
+ /*
+ * this branch is our 'one mirror IO has finished' event handler:
+ */
+ update_head_pos(slot, r10_bio);
+
+ if (uptodate) {
+ /*
+ * Set R10BIO_Uptodate in our master bio, so that
+ * we will return a good error code to the higher
+ * levels even if IO on some other mirrored buffer fails.
+ *
+ * The 'master' represents the composite IO operation to
+ * user-side. So if something waits for IO, then it will
+ * wait for the 'master' bio.
+ */
+ set_bit(R10BIO_Uptodate, &r10_bio->state);
+ } else {
+ /* If all other devices that store this block have
+ * failed, we want to return the error upwards rather
+ * than fail the last device. Here we redefine
+ * "uptodate" to mean "Don't want to retry"
+ */
+ if (!_enough(conf, test_bit(R10BIO_Previous, &r10_bio->state),
+ rdev->raid_disk))
+ uptodate = 1;
+ }
+ if (uptodate) {
+ raid_end_bio_io(r10_bio);
+ rdev_dec_pending(rdev, conf->mddev);
+ } else {
+ /*
+ * oops, read error - keep the refcount on the rdev
+ */
+ char b[BDEVNAME_SIZE];
+ printk_ratelimited(KERN_ERR
+ "md/raid10:%s: %s: rescheduling sector %llu\n",
+ mdname(conf->mddev),
+ bdevname(rdev->bdev, b),
+ (unsigned long long)r10_bio->sector);
+ set_bit(R10BIO_ReadError, &r10_bio->state);
+ reschedule_retry(r10_bio);
+ }
+}
+
+static void close_write(struct r10bio *r10_bio)
+{
+ /* clear the bitmap if all writes complete successfully */
+ bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector,
+ r10_bio->sectors,
+ !test_bit(R10BIO_Degraded, &r10_bio->state),
+ 0);
+ md_write_end(r10_bio->mddev);
+}
+
+static void one_write_done(struct r10bio *r10_bio)
+{
+ if (atomic_dec_and_test(&r10_bio->remaining)) {
+ if (test_bit(R10BIO_WriteError, &r10_bio->state))
+ reschedule_retry(r10_bio);
+ else {
+ close_write(r10_bio);
+ if (test_bit(R10BIO_MadeGood, &r10_bio->state))
+ reschedule_retry(r10_bio);
+ else
+ raid_end_bio_io(r10_bio);
+ }
+ }
+}
+
+static void raid10_end_write_request(struct bio *bio, int error)
+{
+ int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+ struct r10bio *r10_bio = bio->bi_private;
+ int dev;
+ int dec_rdev = 1;
+ struct r10conf *conf = r10_bio->mddev->private;
+ int slot, repl;
+ struct md_rdev *rdev = NULL;
+
+ dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
+
+ if (repl)
+ rdev = conf->mirrors[dev].replacement;
+ if (!rdev) {
+ smp_rmb();
+ repl = 0;
+ rdev = conf->mirrors[dev].rdev;
+ }
+ /*
+ * this branch is our 'one mirror IO has finished' event handler:
+ */
+ if (!uptodate) {
+ if (repl)
+ /* Never record new bad blocks to replacement,
+ * just fail it.
+ */
+ md_error(rdev->mddev, rdev);
+ else {
+ set_bit(WriteErrorSeen, &rdev->flags);
+ if (!test_and_set_bit(WantReplacement, &rdev->flags))
+ set_bit(MD_RECOVERY_NEEDED,
+ &rdev->mddev->recovery);
+ set_bit(R10BIO_WriteError, &r10_bio->state);
+ dec_rdev = 0;
+ }
+ } else {
+ /*
+ * Set R10BIO_Uptodate in our master bio, so that
+ * we will return a good error code for to the higher
+ * levels even if IO on some other mirrored buffer fails.
+ *
+ * The 'master' represents the composite IO operation to
+ * user-side. So if something waits for IO, then it will
+ * wait for the 'master' bio.
+ */
+ sector_t first_bad;
+ int bad_sectors;
+
+ /*
+ * Do not set R10BIO_Uptodate if the current device is
+ * rebuilding or Faulty. This is because we cannot use
+ * such device for properly reading the data back (we could
+ * potentially use it, if the current write would have felt
+ * before rdev->recovery_offset, but for simplicity we don't
+ * check this here.
+ */
+ if (test_bit(In_sync, &rdev->flags) &&
+ !test_bit(Faulty, &rdev->flags))
+ set_bit(R10BIO_Uptodate, &r10_bio->state);
+
+ /* Maybe we can clear some bad blocks. */
+ if (is_badblock(rdev,
+ r10_bio->devs[slot].addr,
+ r10_bio->sectors,
+ &first_bad, &bad_sectors)) {
+ bio_put(bio);
+ if (repl)
+ r10_bio->devs[slot].repl_bio = IO_MADE_GOOD;
+ else
+ r10_bio->devs[slot].bio = IO_MADE_GOOD;
+ dec_rdev = 0;
+ set_bit(R10BIO_MadeGood, &r10_bio->state);
+ }
+ }
+
+ /*
+ *
+ * Let's see if all mirrored write operations have finished
+ * already.
+ */
+ one_write_done(r10_bio);
+ if (dec_rdev)
+ rdev_dec_pending(rdev, conf->mddev);
+}
+
+/*
+ * RAID10 layout manager
+ * As well as the chunksize and raid_disks count, there are two
+ * parameters: near_copies and far_copies.
+ * near_copies * far_copies must be <= raid_disks.
+ * Normally one of these will be 1.
+ * If both are 1, we get raid0.
+ * If near_copies == raid_disks, we get raid1.
+ *
+ * Chunks are laid out in raid0 style with near_copies copies of the
+ * first chunk, followed by near_copies copies of the next chunk and
+ * so on.
+ * If far_copies > 1, then after 1/far_copies of the array has been assigned
+ * as described above, we start again with a device offset of near_copies.
+ * So we effectively have another copy of the whole array further down all
+ * the drives, but with blocks on different drives.
+ * With this layout, and block is never stored twice on the one device.
+ *
+ * raid10_find_phys finds the sector offset of a given virtual sector
+ * on each device that it is on.
+ *
+ * raid10_find_virt does the reverse mapping, from a device and a
+ * sector offset to a virtual address
+ */
+
+static void __raid10_find_phys(struct geom *geo, struct r10bio *r10bio)
+{
+ int n,f;
+ sector_t sector;
+ sector_t chunk;
+ sector_t stripe;
+ int dev;
+ int slot = 0;
+ int last_far_set_start, last_far_set_size;
+
+ last_far_set_start = (geo->raid_disks / geo->far_set_size) - 1;
+ last_far_set_start *= geo->far_set_size;
+
+ last_far_set_size = geo->far_set_size;
+ last_far_set_size += (geo->raid_disks % geo->far_set_size);
+
+ /* now calculate first sector/dev */
+ chunk = r10bio->sector >> geo->chunk_shift;
+ sector = r10bio->sector & geo->chunk_mask;
+
+ chunk *= geo->near_copies;
+ stripe = chunk;
+ dev = sector_div(stripe, geo->raid_disks);
+ if (geo->far_offset)
+ stripe *= geo->far_copies;
+
+ sector += stripe << geo->chunk_shift;
+
+ /* and calculate all the others */
+ for (n = 0; n < geo->near_copies; n++) {
+ int d = dev;
+ int set;
+ sector_t s = sector;
+ r10bio->devs[slot].devnum = d;
+ r10bio->devs[slot].addr = s;
+ slot++;
+
+ for (f = 1; f < geo->far_copies; f++) {
+ set = d / geo->far_set_size;
+ d += geo->near_copies;
+
+ if ((geo->raid_disks % geo->far_set_size) &&
+ (d > last_far_set_start)) {
+ d -= last_far_set_start;
+ d %= last_far_set_size;
+ d += last_far_set_start;
+ } else {
+ d %= geo->far_set_size;
+ d += geo->far_set_size * set;
+ }
+ s += geo->stride;
+ r10bio->devs[slot].devnum = d;
+ r10bio->devs[slot].addr = s;
+ slot++;
+ }
+ dev++;
+ if (dev >= geo->raid_disks) {
+ dev = 0;
+ sector += (geo->chunk_mask + 1);
+ }
+ }
+}
+
+static void raid10_find_phys(struct r10conf *conf, struct r10bio *r10bio)
+{
+ struct geom *geo = &conf->geo;
+
+ if (conf->reshape_progress != MaxSector &&
+ ((r10bio->sector >= conf->reshape_progress) !=
+ conf->mddev->reshape_backwards)) {
+ set_bit(R10BIO_Previous, &r10bio->state);
+ geo = &conf->prev;
+ } else
+ clear_bit(R10BIO_Previous, &r10bio->state);
+
+ __raid10_find_phys(geo, r10bio);
+}
+
+static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev)
+{
+ sector_t offset, chunk, vchunk;
+ /* Never use conf->prev as this is only called during resync
+ * or recovery, so reshape isn't happening
+ */
+ struct geom *geo = &conf->geo;
+ int far_set_start = (dev / geo->far_set_size) * geo->far_set_size;
+ int far_set_size = geo->far_set_size;
+ int last_far_set_start;
+
+ if (geo->raid_disks % geo->far_set_size) {
+ last_far_set_start = (geo->raid_disks / geo->far_set_size) - 1;
+ last_far_set_start *= geo->far_set_size;
+
+ if (dev >= last_far_set_start) {
+ far_set_size = geo->far_set_size;
+ far_set_size += (geo->raid_disks % geo->far_set_size);
+ far_set_start = last_far_set_start;
+ }
+ }
+
+ offset = sector & geo->chunk_mask;
+ if (geo->far_offset) {
+ int fc;
+ chunk = sector >> geo->chunk_shift;
+ fc = sector_div(chunk, geo->far_copies);
+ dev -= fc * geo->near_copies;
+ if (dev < far_set_start)
+ dev += far_set_size;
+ } else {
+ while (sector >= geo->stride) {
+ sector -= geo->stride;
+ if (dev < (geo->near_copies + far_set_start))
+ dev += far_set_size - geo->near_copies;
+ else
+ dev -= geo->near_copies;
+ }
+ chunk = sector >> geo->chunk_shift;
+ }
+ vchunk = chunk * geo->raid_disks + dev;
+ sector_div(vchunk, geo->near_copies);
+ return (vchunk << geo->chunk_shift) + offset;
+}
+
+/**
+ * raid10_mergeable_bvec -- tell bio layer if a two requests can be merged
+ * @mddev: the md device
+ * @bvm: properties of new bio
+ * @biovec: the request that could be merged to it.
+ *
+ * Return amount of bytes we can accept at this offset
+ * This requires checking for end-of-chunk if near_copies != raid_disks,
+ * and for subordinate merge_bvec_fns if merge_check_needed.
+ */
+static int raid10_mergeable_bvec(struct mddev *mddev,
+ struct bvec_merge_data *bvm,
+ struct bio_vec *biovec)
+{
+ struct r10conf *conf = mddev->private;
+ sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
+ int max;
+ unsigned int chunk_sectors;
+ unsigned int bio_sectors = bvm->bi_size >> 9;
+ struct geom *geo = &conf->geo;
+
+ chunk_sectors = (conf->geo.chunk_mask & conf->prev.chunk_mask) + 1;
+ if (conf->reshape_progress != MaxSector &&
+ ((sector >= conf->reshape_progress) !=
+ conf->mddev->reshape_backwards))
+ geo = &conf->prev;
+
+ if (geo->near_copies < geo->raid_disks) {
+ max = (chunk_sectors - ((sector & (chunk_sectors - 1))
+ + bio_sectors)) << 9;
+ if (max < 0)
+ /* bio_add cannot handle a negative return */
+ max = 0;
+ if (max <= biovec->bv_len && bio_sectors == 0)
+ return biovec->bv_len;
+ } else
+ max = biovec->bv_len;
+
+ if (mddev->merge_check_needed) {
+ struct {
+ struct r10bio r10_bio;
+ struct r10dev devs[conf->copies];
+ } on_stack;
+ struct r10bio *r10_bio = &on_stack.r10_bio;
+ int s;
+ if (conf->reshape_progress != MaxSector) {
+ /* Cannot give any guidance during reshape */
+ if (max <= biovec->bv_len && bio_sectors == 0)
+ return biovec->bv_len;
+ return 0;
+ }
+ r10_bio->sector = sector;
+ raid10_find_phys(conf, r10_bio);
+ rcu_read_lock();
+ for (s = 0; s < conf->copies; s++) {
+ int disk = r10_bio->devs[s].devnum;
+ struct md_rdev *rdev = rcu_dereference(
+ conf->mirrors[disk].rdev);
+ if (rdev && !test_bit(Faulty, &rdev->flags)) {
+ struct request_queue *q =
+ bdev_get_queue(rdev->bdev);
+ if (q->merge_bvec_fn) {
+ bvm->bi_sector = r10_bio->devs[s].addr
+ + rdev->data_offset;
+ bvm->bi_bdev = rdev->bdev;
+ max = min(max, q->merge_bvec_fn(
+ q, bvm, biovec));
+ }
+ }
+ rdev = rcu_dereference(conf->mirrors[disk].replacement);
+ if (rdev && !test_bit(Faulty, &rdev->flags)) {
+ struct request_queue *q =
+ bdev_get_queue(rdev->bdev);
+ if (q->merge_bvec_fn) {
+ bvm->bi_sector = r10_bio->devs[s].addr
+ + rdev->data_offset;
+ bvm->bi_bdev = rdev->bdev;
+ max = min(max, q->merge_bvec_fn(
+ q, bvm, biovec));
+ }
+ }
+ }
+ rcu_read_unlock();
+ }
+ return max;
+}
+
+/*
+ * This routine returns the disk from which the requested read should
+ * be done. There is a per-array 'next expected sequential IO' sector
+ * number - if this matches on the next IO then we use the last disk.
+ * There is also a per-disk 'last know head position' sector that is
+ * maintained from IRQ contexts, both the normal and the resync IO
+ * completion handlers update this position correctly. If there is no
+ * perfect sequential match then we pick the disk whose head is closest.
+ *
+ * If there are 2 mirrors in the same 2 devices, performance degrades
+ * because position is mirror, not device based.
+ *
+ * The rdev for the device selected will have nr_pending incremented.
+ */
+
+/*
+ * FIXME: possibly should rethink readbalancing and do it differently
+ * depending on near_copies / far_copies geometry.
+ */
+static struct md_rdev *read_balance(struct r10conf *conf,
+ struct r10bio *r10_bio,
+ int *max_sectors)
+{
+ const sector_t this_sector = r10_bio->sector;
+ int disk, slot;
+ int sectors = r10_bio->sectors;
+ int best_good_sectors;
+ sector_t new_distance, best_dist;
+ struct md_rdev *best_rdev, *rdev = NULL;
+ int do_balance;
+ int best_slot;
+ struct geom *geo = &conf->geo;
+
+ raid10_find_phys(conf, r10_bio);
+ rcu_read_lock();
+retry:
+ sectors = r10_bio->sectors;
+ best_slot = -1;
+ best_rdev = NULL;
+ best_dist = MaxSector;
+ best_good_sectors = 0;
+ do_balance = 1;
+ /*
+ * Check if we can balance. We can balance on the whole
+ * device if no resync is going on (recovery is ok), or below
+ * the resync window. We take the first readable disk when
+ * above the resync window.
+ */
+ if (conf->mddev->recovery_cp < MaxSector
+ && (this_sector + sectors >= conf->next_resync))
+ do_balance = 0;
+
+ for (slot = 0; slot < conf->copies ; slot++) {
+ sector_t first_bad;
+ int bad_sectors;
+ sector_t dev_sector;
+
+ if (r10_bio->devs[slot].bio == IO_BLOCKED)
+ continue;
+ disk = r10_bio->devs[slot].devnum;
+ rdev = rcu_dereference(conf->mirrors[disk].replacement);
+ if (rdev == NULL || test_bit(Faulty, &rdev->flags) ||
+ test_bit(Unmerged, &rdev->flags) ||
+ r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
+ rdev = rcu_dereference(conf->mirrors[disk].rdev);
+ if (rdev == NULL ||
+ test_bit(Faulty, &rdev->flags) ||
+ test_bit(Unmerged, &rdev->flags))
+ continue;
+ if (!test_bit(In_sync, &rdev->flags) &&
+ r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
+ continue;
+
+ dev_sector = r10_bio->devs[slot].addr;
+ if (is_badblock(rdev, dev_sector, sectors,
+ &first_bad, &bad_sectors)) {
+ if (best_dist < MaxSector)
+ /* Already have a better slot */
+ continue;
+ if (first_bad <= dev_sector) {
+ /* Cannot read here. If this is the
+ * 'primary' device, then we must not read
+ * beyond 'bad_sectors' from another device.
+ */
+ bad_sectors -= (dev_sector - first_bad);
+ if (!do_balance && sectors > bad_sectors)
+ sectors = bad_sectors;
+ if (best_good_sectors > sectors)
+ best_good_sectors = sectors;
+ } else {
+ sector_t good_sectors =
+ first_bad - dev_sector;
+ if (good_sectors > best_good_sectors) {
+ best_good_sectors = good_sectors;
+ best_slot = slot;
+ best_rdev = rdev;
+ }
+ if (!do_balance)
+ /* Must read from here */
+ break;
+ }
+ continue;
+ } else
+ best_good_sectors = sectors;
+
+ if (!do_balance)
+ break;
+
+ /* This optimisation is debatable, and completely destroys
+ * sequential read speed for 'far copies' arrays. So only
+ * keep it for 'near' arrays, and review those later.
+ */
+ if (geo->near_copies > 1 && !atomic_read(&rdev->nr_pending))
+ break;
+
+ /* for far > 1 always use the lowest address */
+ if (geo->far_copies > 1)
+ new_distance = r10_bio->devs[slot].addr;
+ else
+ new_distance = abs(r10_bio->devs[slot].addr -
+ conf->mirrors[disk].head_position);
+ if (new_distance < best_dist) {
+ best_dist = new_distance;
+ best_slot = slot;
+ best_rdev = rdev;
+ }
+ }
+ if (slot >= conf->copies) {
+ slot = best_slot;
+ rdev = best_rdev;
+ }
+
+ if (slot >= 0) {
+ atomic_inc(&rdev->nr_pending);
+ if (test_bit(Faulty, &rdev->flags)) {
+ /* Cannot risk returning a device that failed
+ * before we inc'ed nr_pending
+ */
+ rdev_dec_pending(rdev, conf->mddev);
+ goto retry;
+ }
+ r10_bio->read_slot = slot;
+ } else
+ rdev = NULL;
+ rcu_read_unlock();
+ *max_sectors = best_good_sectors;
+
+ return rdev;
+}
+
+static int raid10_congested(struct mddev *mddev, int bits)
+{
+ struct r10conf *conf = mddev->private;
+ int i, ret = 0;
+
+ if ((bits & (1 << BDI_async_congested)) &&
+ conf->pending_count >= max_queued_requests)
+ return 1;
+
+ rcu_read_lock();
+ for (i = 0;
+ (i < conf->geo.raid_disks || i < conf->prev.raid_disks)
+ && ret == 0;
+ i++) {
+ struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
+ if (rdev && !test_bit(Faulty, &rdev->flags)) {
+ struct request_queue *q = bdev_get_queue(rdev->bdev);
+
+ ret |= bdi_congested(&q->backing_dev_info, bits);
+ }
+ }
+ rcu_read_unlock();
+ return ret;
+}
+
+static void flush_pending_writes(struct r10conf *conf)
+{
+ /* Any writes that have been queued but are awaiting
+ * bitmap updates get flushed here.
+ */
+ spin_lock_irq(&conf->device_lock);
+
+ if (conf->pending_bio_list.head) {
+ struct bio *bio;
+ bio = bio_list_get(&conf->pending_bio_list);
+ conf->pending_count = 0;
+ spin_unlock_irq(&conf->device_lock);
+ /* flush any pending bitmap writes to disk
+ * before proceeding w/ I/O */
+ bitmap_unplug(conf->mddev->bitmap);
+ wake_up(&conf->wait_barrier);
+
+ while (bio) { /* submit pending writes */
+ struct bio *next = bio->bi_next;
+ bio->bi_next = NULL;
+ if (unlikely((bio->bi_rw & REQ_DISCARD) &&
+ !blk_queue_discard(bdev_get_queue(bio->bi_bdev))))
+ /* Just ignore it */
+ bio_endio(bio, 0);
+ else
+ generic_make_request(bio);
+ bio = next;
+ }
+ } else
+ spin_unlock_irq(&conf->device_lock);
+}
+
+/* Barriers....
+ * Sometimes we need to suspend IO while we do something else,
+ * either some resync/recovery, or reconfigure the array.
+ * To do this we raise a 'barrier'.
+ * The 'barrier' is a counter that can be raised multiple times
+ * to count how many activities are happening which preclude
+ * normal IO.
+ * We can only raise the barrier if there is no pending IO.
+ * i.e. if nr_pending == 0.
+ * We choose only to raise the barrier if no-one is waiting for the
+ * barrier to go down. This means that as soon as an IO request
+ * is ready, no other operations which require a barrier will start
+ * until the IO request has had a chance.
+ *
+ * So: regular IO calls 'wait_barrier'. When that returns there
+ * is no backgroup IO happening, It must arrange to call
+ * allow_barrier when it has finished its IO.
+ * backgroup IO calls must call raise_barrier. Once that returns
+ * there is no normal IO happeing. It must arrange to call
+ * lower_barrier when the particular background IO completes.
+ */
+
+static void raise_barrier(struct r10conf *conf, int force)
+{
+ BUG_ON(force && !conf->barrier);
+ spin_lock_irq(&conf->resync_lock);
+
+ /* Wait until no block IO is waiting (unless 'force') */
+ wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting,
+ conf->resync_lock);
+
+ /* block any new IO from starting */
+ conf->barrier++;
+
+ /* Now wait for all pending IO to complete */
+ wait_event_lock_irq(conf->wait_barrier,
+ !conf->nr_pending && conf->barrier < RESYNC_DEPTH,
+ conf->resync_lock);
+
+ spin_unlock_irq(&conf->resync_lock);
+}
+
+static void lower_barrier(struct r10conf *conf)
+{
+ unsigned long flags;
+ spin_lock_irqsave(&conf->resync_lock, flags);
+ conf->barrier--;
+ spin_unlock_irqrestore(&conf->resync_lock, flags);
+ wake_up(&conf->wait_barrier);
+}
+
+static void wait_barrier(struct r10conf *conf)
+{
+ spin_lock_irq(&conf->resync_lock);
+ if (conf->barrier) {
+ conf->nr_waiting++;
+ /* Wait for the barrier to drop.
+ * However if there are already pending
+ * requests (preventing the barrier from
+ * rising completely), and the
+ * pre-process bio queue isn't empty,
+ * then don't wait, as we need to empty
+ * that queue to get the nr_pending
+ * count down.
+ */
+ wait_event_lock_irq(conf->wait_barrier,
+ !conf->barrier ||
+ (conf->nr_pending &&
+ current->bio_list &&
+ !bio_list_empty(current->bio_list)),
+ conf->resync_lock);
+ conf->nr_waiting--;
+ }
+ conf->nr_pending++;
+ spin_unlock_irq(&conf->resync_lock);
+}
+
+static void allow_barrier(struct r10conf *conf)
+{
+ unsigned long flags;
+ spin_lock_irqsave(&conf->resync_lock, flags);
+ conf->nr_pending--;
+ spin_unlock_irqrestore(&conf->resync_lock, flags);
+ wake_up(&conf->wait_barrier);
+}
+
+static void freeze_array(struct r10conf *conf, int extra)
+{
+ /* stop syncio and normal IO and wait for everything to
+ * go quiet.
+ * We increment barrier and nr_waiting, and then
+ * wait until nr_pending match nr_queued+extra
+ * This is called in the context of one normal IO request
+ * that has failed. Thus any sync request that might be pending
+ * will be blocked by nr_pending, and we need to wait for
+ * pending IO requests to complete or be queued for re-try.
+ * Thus the number queued (nr_queued) plus this request (extra)
+ * must match the number of pending IOs (nr_pending) before
+ * we continue.
+ */
+ spin_lock_irq(&conf->resync_lock);
+ conf->barrier++;
+ conf->nr_waiting++;
+ wait_event_lock_irq_cmd(conf->wait_barrier,
+ conf->nr_pending == conf->nr_queued+extra,
+ conf->resync_lock,
+ flush_pending_writes(conf));
+
+ spin_unlock_irq(&conf->resync_lock);
+}
+
+static void unfreeze_array(struct r10conf *conf)
+{
+ /* reverse the effect of the freeze */
+ spin_lock_irq(&conf->resync_lock);
+ conf->barrier--;
+ conf->nr_waiting--;
+ wake_up(&conf->wait_barrier);
+ spin_unlock_irq(&conf->resync_lock);
+}
+
+static sector_t choose_data_offset(struct r10bio *r10_bio,
+ struct md_rdev *rdev)
+{
+ if (!test_bit(MD_RECOVERY_RESHAPE, &rdev->mddev->recovery) ||
+ test_bit(R10BIO_Previous, &r10_bio->state))
+ return rdev->data_offset;
+ else
+ return rdev->new_data_offset;
+}
+
+struct raid10_plug_cb {
+ struct blk_plug_cb cb;
+ struct bio_list pending;
+ int pending_cnt;
+};
+
+static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule)
+{
+ struct raid10_plug_cb *plug = container_of(cb, struct raid10_plug_cb,
+ cb);
+ struct mddev *mddev = plug->cb.data;
+ struct r10conf *conf = mddev->private;
+ struct bio *bio;
+
+ if (from_schedule || current->bio_list) {
+ spin_lock_irq(&conf->device_lock);
+ bio_list_merge(&conf->pending_bio_list, &plug->pending);
+ conf->pending_count += plug->pending_cnt;
+ spin_unlock_irq(&conf->device_lock);
+ wake_up(&conf->wait_barrier);
+ md_wakeup_thread(mddev->thread);
+ kfree(plug);
+ return;
+ }
+
+ /* we aren't scheduling, so we can do the write-out directly. */
+ bio = bio_list_get(&plug->pending);
+ bitmap_unplug(mddev->bitmap);
+ wake_up(&conf->wait_barrier);
+
+ while (bio) { /* submit pending writes */
+ struct bio *next = bio->bi_next;
+ bio->bi_next = NULL;
+ if (unlikely((bio->bi_rw & REQ_DISCARD) &&
+ !blk_queue_discard(bdev_get_queue(bio->bi_bdev))))
+ /* Just ignore it */
+ bio_endio(bio, 0);
+ else
+ generic_make_request(bio);
+ bio = next;
+ }
+ kfree(plug);
+}
+
+static void __make_request(struct mddev *mddev, struct bio *bio)
+{
+ struct r10conf *conf = mddev->private;
+ struct r10bio *r10_bio;
+ struct bio *read_bio;
+ int i;
+ const int rw = bio_data_dir(bio);
+ const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
+ const unsigned long do_fua = (bio->bi_rw & REQ_FUA);
+ const unsigned long do_discard = (bio->bi_rw
+ & (REQ_DISCARD | REQ_SECURE));
+ const unsigned long do_same = (bio->bi_rw & REQ_WRITE_SAME);
+ unsigned long flags;
+ struct md_rdev *blocked_rdev;
+ struct blk_plug_cb *cb;
+ struct raid10_plug_cb *plug = NULL;
+ int sectors_handled;
+ int max_sectors;
+ int sectors;
+
+ /*
+ * Register the new request and wait if the reconstruction
+ * thread has put up a bar for new requests.
+ * Continue immediately if no resync is active currently.
+ */
+ wait_barrier(conf);
+
+ sectors = bio_sectors(bio);
+ while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
+ bio->bi_iter.bi_sector < conf->reshape_progress &&
+ bio->bi_iter.bi_sector + sectors > conf->reshape_progress) {
+ /* IO spans the reshape position. Need to wait for
+ * reshape to pass
+ */
+ allow_barrier(conf);
+ wait_event(conf->wait_barrier,
+ conf->reshape_progress <= bio->bi_iter.bi_sector ||
+ conf->reshape_progress >= bio->bi_iter.bi_sector +
+ sectors);
+ wait_barrier(conf);
+ }
+ if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
+ bio_data_dir(bio) == WRITE &&
+ (mddev->reshape_backwards
+ ? (bio->bi_iter.bi_sector < conf->reshape_safe &&
+ bio->bi_iter.bi_sector + sectors > conf->reshape_progress)
+ : (bio->bi_iter.bi_sector + sectors > conf->reshape_safe &&
+ bio->bi_iter.bi_sector < conf->reshape_progress))) {
+ /* Need to update reshape_position in metadata */
+ mddev->reshape_position = conf->reshape_progress;
+ set_bit(MD_CHANGE_DEVS, &mddev->flags);
+ set_bit(MD_CHANGE_PENDING, &mddev->flags);
+ md_wakeup_thread(mddev->thread);
+ wait_event(mddev->sb_wait,
+ !test_bit(MD_CHANGE_PENDING, &mddev->flags));
+
+ conf->reshape_safe = mddev->reshape_position;
+ }
+
+ r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
+
+ r10_bio->master_bio = bio;
+ r10_bio->sectors = sectors;
+
+ r10_bio->mddev = mddev;
+ r10_bio->sector = bio->bi_iter.bi_sector;
+ r10_bio->state = 0;
+
+ /* We might need to issue multiple reads to different
+ * devices if there are bad blocks around, so we keep
+ * track of the number of reads in bio->bi_phys_segments.
+ * If this is 0, there is only one r10_bio and no locking
+ * will be needed when the request completes. If it is
+ * non-zero, then it is the number of not-completed requests.
+ */
+ bio->bi_phys_segments = 0;
+ clear_bit(BIO_SEG_VALID, &bio->bi_flags);
+
+ if (rw == READ) {
+ /*
+ * read balancing logic:
+ */
+ struct md_rdev *rdev;
+ int slot;
+
+read_again:
+ rdev = read_balance(conf, r10_bio, &max_sectors);
+ if (!rdev) {
+ raid_end_bio_io(r10_bio);
+ return;
+ }
+ slot = r10_bio->read_slot;
+
+ read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
+ bio_trim(read_bio, r10_bio->sector - bio->bi_iter.bi_sector,
+ max_sectors);
+
+ r10_bio->devs[slot].bio = read_bio;
+ r10_bio->devs[slot].rdev = rdev;
+
+ read_bio->bi_iter.bi_sector = r10_bio->devs[slot].addr +
+ choose_data_offset(r10_bio, rdev);
+ read_bio->bi_bdev = rdev->bdev;
+ read_bio->bi_end_io = raid10_end_read_request;
+ read_bio->bi_rw = READ | do_sync;
+ read_bio->bi_private = r10_bio;
+
+ if (max_sectors < r10_bio->sectors) {
+ /* Could not read all from this device, so we will
+ * need another r10_bio.
+ */
+ sectors_handled = (r10_bio->sector + max_sectors
+ - bio->bi_iter.bi_sector);
+ r10_bio->sectors = max_sectors;
+ spin_lock_irq(&conf->device_lock);
+ if (bio->bi_phys_segments == 0)
+ bio->bi_phys_segments = 2;
+ else
+ bio->bi_phys_segments++;
+ spin_unlock_irq(&conf->device_lock);
+ /* Cannot call generic_make_request directly
+ * as that will be queued in __generic_make_request
+ * and subsequent mempool_alloc might block
+ * waiting for it. so hand bio over to raid10d.
+ */
+ reschedule_retry(r10_bio);
+
+ r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
+
+ r10_bio->master_bio = bio;
+ r10_bio->sectors = bio_sectors(bio) - sectors_handled;
+ r10_bio->state = 0;
+ r10_bio->mddev = mddev;
+ r10_bio->sector = bio->bi_iter.bi_sector +
+ sectors_handled;
+ goto read_again;
+ } else
+ generic_make_request(read_bio);
+ return;
+ }
+
+ /*
+ * WRITE:
+ */
+ if (conf->pending_count >= max_queued_requests) {
+ md_wakeup_thread(mddev->thread);
+ wait_event(conf->wait_barrier,
+ conf->pending_count < max_queued_requests);
+ }
+ /* first select target devices under rcu_lock and
+ * inc refcount on their rdev. Record them by setting
+ * bios[x] to bio
+ * If there are known/acknowledged bad blocks on any device
+ * on which we have seen a write error, we want to avoid
+ * writing to those blocks. This potentially requires several
+ * writes to write around the bad blocks. Each set of writes
+ * gets its own r10_bio with a set of bios attached. The number
+ * of r10_bios is recored in bio->bi_phys_segments just as with
+ * the read case.
+ */
+
+ r10_bio->read_slot = -1; /* make sure repl_bio gets freed */
+ raid10_find_phys(conf, r10_bio);
+retry_write:
+ blocked_rdev = NULL;
+ rcu_read_lock();
+ max_sectors = r10_bio->sectors;
+
+ for (i = 0; i < conf->copies; i++) {
+ int d = r10_bio->devs[i].devnum;
+ struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev);
+ struct md_rdev *rrdev = rcu_dereference(
+ conf->mirrors[d].replacement);
+ if (rdev == rrdev)
+ rrdev = NULL;
+ if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
+ atomic_inc(&rdev->nr_pending);
+ blocked_rdev = rdev;
+ break;
+ }
+ if (rrdev && unlikely(test_bit(Blocked, &rrdev->flags))) {
+ atomic_inc(&rrdev->nr_pending);
+ blocked_rdev = rrdev;
+ break;
+ }
+ if (rdev && (test_bit(Faulty, &rdev->flags)
+ || test_bit(Unmerged, &rdev->flags)))
+ rdev = NULL;
+ if (rrdev && (test_bit(Faulty, &rrdev->flags)
+ || test_bit(Unmerged, &rrdev->flags)))
+ rrdev = NULL;
+
+ r10_bio->devs[i].bio = NULL;
+ r10_bio->devs[i].repl_bio = NULL;
+
+ if (!rdev && !rrdev) {
+ set_bit(R10BIO_Degraded, &r10_bio->state);
+ continue;
+ }
+ if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) {
+ sector_t first_bad;
+ sector_t dev_sector = r10_bio->devs[i].addr;
+ int bad_sectors;
+ int is_bad;
+
+ is_bad = is_badblock(rdev, dev_sector,
+ max_sectors,
+ &first_bad, &bad_sectors);
+ if (is_bad < 0) {
+ /* Mustn't write here until the bad block
+ * is acknowledged
+ */
+ atomic_inc(&rdev->nr_pending);
+ set_bit(BlockedBadBlocks, &rdev->flags);
+ blocked_rdev = rdev;
+ break;
+ }
+ if (is_bad && first_bad <= dev_sector) {
+ /* Cannot write here at all */
+ bad_sectors -= (dev_sector - first_bad);
+ if (bad_sectors < max_sectors)
+ /* Mustn't write more than bad_sectors
+ * to other devices yet
+ */
+ max_sectors = bad_sectors;
+ /* We don't set R10BIO_Degraded as that
+ * only applies if the disk is missing,
+ * so it might be re-added, and we want to
+ * know to recover this chunk.
+ * In this case the device is here, and the
+ * fact that this chunk is not in-sync is
+ * recorded in the bad block log.
+ */
+ continue;
+ }
+ if (is_bad) {
+ int good_sectors = first_bad - dev_sector;
+ if (good_sectors < max_sectors)
+ max_sectors = good_sectors;
+ }
+ }
+ if (rdev) {
+ r10_bio->devs[i].bio = bio;
+ atomic_inc(&rdev->nr_pending);
+ }
+ if (rrdev) {
+ r10_bio->devs[i].repl_bio = bio;
+ atomic_inc(&rrdev->nr_pending);
+ }
+ }
+ rcu_read_unlock();
+
+ if (unlikely(blocked_rdev)) {
+ /* Have to wait for this device to get unblocked, then retry */
+ int j;
+ int d;
+
+ for (j = 0; j < i; j++) {
+ if (r10_bio->devs[j].bio) {
+ d = r10_bio->devs[j].devnum;
+ rdev_dec_pending(conf->mirrors[d].rdev, mddev);
+ }
+ if (r10_bio->devs[j].repl_bio) {
+ struct md_rdev *rdev;
+ d = r10_bio->devs[j].devnum;
+ rdev = conf->mirrors[d].replacement;
+ if (!rdev) {
+ /* Race with remove_disk */
+ smp_mb();
+ rdev = conf->mirrors[d].rdev;
+ }
+ rdev_dec_pending(rdev, mddev);
+ }
+ }
+ allow_barrier(conf);
+ md_wait_for_blocked_rdev(blocked_rdev, mddev);
+ wait_barrier(conf);
+ goto retry_write;
+ }
+
+ if (max_sectors < r10_bio->sectors) {
+ /* We are splitting this into multiple parts, so
+ * we need to prepare for allocating another r10_bio.
+ */
+ r10_bio->sectors = max_sectors;
+ spin_lock_irq(&conf->device_lock);
+ if (bio->bi_phys_segments == 0)
+ bio->bi_phys_segments = 2;
+ else
+ bio->bi_phys_segments++;
+ spin_unlock_irq(&conf->device_lock);
+ }
+ sectors_handled = r10_bio->sector + max_sectors -
+ bio->bi_iter.bi_sector;
+
+ atomic_set(&r10_bio->remaining, 1);
+ bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0);
+
+ for (i = 0; i < conf->copies; i++) {
+ struct bio *mbio;
+ int d = r10_bio->devs[i].devnum;
+ if (r10_bio->devs[i].bio) {
+ struct md_rdev *rdev = conf->mirrors[d].rdev;
+ mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
+ bio_trim(mbio, r10_bio->sector - bio->bi_iter.bi_sector,
+ max_sectors);
+ r10_bio->devs[i].bio = mbio;
+
+ mbio->bi_iter.bi_sector = (r10_bio->devs[i].addr+
+ choose_data_offset(r10_bio,
+ rdev));
+ mbio->bi_bdev = rdev->bdev;
+ mbio->bi_end_io = raid10_end_write_request;
+ mbio->bi_rw =
+ WRITE | do_sync | do_fua | do_discard | do_same;
+ mbio->bi_private = r10_bio;
+
+ atomic_inc(&r10_bio->remaining);
+
+ cb = blk_check_plugged(raid10_unplug, mddev,
+ sizeof(*plug));
+ if (cb)
+ plug = container_of(cb, struct raid10_plug_cb,
+ cb);
+ else
+ plug = NULL;
+ spin_lock_irqsave(&conf->device_lock, flags);
+ if (plug) {
+ bio_list_add(&plug->pending, mbio);
+ plug->pending_cnt++;
+ } else {
+ bio_list_add(&conf->pending_bio_list, mbio);
+ conf->pending_count++;
+ }
+ spin_unlock_irqrestore(&conf->device_lock, flags);
+ if (!plug)
+ md_wakeup_thread(mddev->thread);
+ }
+
+ if (r10_bio->devs[i].repl_bio) {
+ struct md_rdev *rdev = conf->mirrors[d].replacement;
+ if (rdev == NULL) {
+ /* Replacement just got moved to main 'rdev' */
+ smp_mb();
+ rdev = conf->mirrors[d].rdev;
+ }
+ mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
+ bio_trim(mbio, r10_bio->sector - bio->bi_iter.bi_sector,
+ max_sectors);
+ r10_bio->devs[i].repl_bio = mbio;
+
+ mbio->bi_iter.bi_sector = (r10_bio->devs[i].addr +
+ choose_data_offset(
+ r10_bio, rdev));
+ mbio->bi_bdev = rdev->bdev;
+ mbio->bi_end_io = raid10_end_write_request;
+ mbio->bi_rw =
+ WRITE | do_sync | do_fua | do_discard | do_same;
+ mbio->bi_private = r10_bio;
+
+ atomic_inc(&r10_bio->remaining);
+ spin_lock_irqsave(&conf->device_lock, flags);
+ bio_list_add(&conf->pending_bio_list, mbio);
+ conf->pending_count++;
+ spin_unlock_irqrestore(&conf->device_lock, flags);
+ if (!mddev_check_plugged(mddev))
+ md_wakeup_thread(mddev->thread);
+ }
+ }
+
+ /* Don't remove the bias on 'remaining' (one_write_done) until
+ * after checking if we need to go around again.
+ */
+
+ if (sectors_handled < bio_sectors(bio)) {
+ one_write_done(r10_bio);
+ /* We need another r10_bio. It has already been counted
+ * in bio->bi_phys_segments.
+ */
+ r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
+
+ r10_bio->master_bio = bio;
+ r10_bio->sectors = bio_sectors(bio) - sectors_handled;
+
+ r10_bio->mddev = mddev;
+ r10_bio->sector = bio->bi_iter.bi_sector + sectors_handled;
+ r10_bio->state = 0;
+ goto retry_write;
+ }
+ one_write_done(r10_bio);
+}
+
+static void make_request(struct mddev *mddev, struct bio *bio)
+{
+ struct r10conf *conf = mddev->private;
+ sector_t chunk_mask = (conf->geo.chunk_mask & conf->prev.chunk_mask);
+ int chunk_sects = chunk_mask + 1;
+
+ struct bio *split;
+
+ if (unlikely(bio->bi_rw & REQ_FLUSH)) {
+ md_flush_request(mddev, bio);
+ return;
+ }
+
+ md_write_start(mddev, bio);
+
+ do {
+
+ /*
+ * If this request crosses a chunk boundary, we need to split
+ * it.
+ */
+ if (unlikely((bio->bi_iter.bi_sector & chunk_mask) +
+ bio_sectors(bio) > chunk_sects
+ && (conf->geo.near_copies < conf->geo.raid_disks
+ || conf->prev.near_copies <
+ conf->prev.raid_disks))) {
+ split = bio_split(bio, chunk_sects -
+ (bio->bi_iter.bi_sector &
+ (chunk_sects - 1)),
+ GFP_NOIO, fs_bio_set);
+ bio_chain(split, bio);
+ } else {
+ split = bio;
+ }
+
+ __make_request(mddev, split);
+ } while (split != bio);
+
+ /* In case raid10d snuck in to freeze_array */
+ wake_up(&conf->wait_barrier);
+}
+
+static void status(struct seq_file *seq, struct mddev *mddev)
+{
+ struct r10conf *conf = mddev->private;
+ int i;
+
+ if (conf->geo.near_copies < conf->geo.raid_disks)
+ seq_printf(seq, " %dK chunks", mddev->chunk_sectors / 2);
+ if (conf->geo.near_copies > 1)
+ seq_printf(seq, " %d near-copies", conf->geo.near_copies);
+ if (conf->geo.far_copies > 1) {
+ if (conf->geo.far_offset)
+ seq_printf(seq, " %d offset-copies", conf->geo.far_copies);
+ else
+ seq_printf(seq, " %d far-copies", conf->geo.far_copies);
+ }
+ seq_printf(seq, " [%d/%d] [", conf->geo.raid_disks,
+ conf->geo.raid_disks - mddev->degraded);
+ for (i = 0; i < conf->geo.raid_disks; i++)
+ seq_printf(seq, "%s",
+ conf->mirrors[i].rdev &&
+ test_bit(In_sync, &conf->mirrors[i].rdev->flags) ? "U" : "_");
+ seq_printf(seq, "]");
+}
+
+/* check if there are enough drives for
+ * every block to appear on atleast one.
+ * Don't consider the device numbered 'ignore'
+ * as we might be about to remove it.
+ */
+static int _enough(struct r10conf *conf, int previous, int ignore)
+{
+ int first = 0;
+ int has_enough = 0;
+ int disks, ncopies;
+ if (previous) {
+ disks = conf->prev.raid_disks;
+ ncopies = conf->prev.near_copies;
+ } else {
+ disks = conf->geo.raid_disks;
+ ncopies = conf->geo.near_copies;
+ }
+
+ rcu_read_lock();
+ do {
+ int n = conf->copies;
+ int cnt = 0;
+ int this = first;
+ while (n--) {
+ struct md_rdev *rdev;
+ if (this != ignore &&
+ (rdev = rcu_dereference(conf->mirrors[this].rdev)) &&
+ test_bit(In_sync, &rdev->flags))
+ cnt++;
+ this = (this+1) % disks;
+ }
+ if (cnt == 0)
+ goto out;
+ first = (first + ncopies) % disks;
+ } while (first != 0);
+ has_enough = 1;
+out:
+ rcu_read_unlock();
+ return has_enough;
+}
+
+static int enough(struct r10conf *conf, int ignore)
+{
+ /* when calling 'enough', both 'prev' and 'geo' must
+ * be stable.
+ * This is ensured if ->reconfig_mutex or ->device_lock
+ * is held.
+ */
+ return _enough(conf, 0, ignore) &&
+ _enough(conf, 1, ignore);
+}
+
+static void error(struct mddev *mddev, struct md_rdev *rdev)
+{
+ char b[BDEVNAME_SIZE];
+ struct r10conf *conf = mddev->private;
+ unsigned long flags;
+
+ /*
+ * If it is not operational, then we have already marked it as dead
+ * else if it is the last working disks, ignore the error, let the
+ * next level up know.
+ * else mark the drive as failed
+ */
+ spin_lock_irqsave(&conf->device_lock, flags);
+ if (test_bit(In_sync, &rdev->flags)
+ && !enough(conf, rdev->raid_disk)) {
+ /*
+ * Don't fail the drive, just return an IO error.
+ */
+ spin_unlock_irqrestore(&conf->device_lock, flags);
+ return;
+ }
+ if (test_and_clear_bit(In_sync, &rdev->flags))
+ mddev->degraded++;
+ /*
+ * If recovery is running, make sure it aborts.
+ */
+ set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+ set_bit(Blocked, &rdev->flags);
+ set_bit(Faulty, &rdev->flags);
+ set_bit(MD_CHANGE_DEVS, &mddev->flags);
+ spin_unlock_irqrestore(&conf->device_lock, flags);
+ printk(KERN_ALERT
+ "md/raid10:%s: Disk failure on %s, disabling device.\n"
+ "md/raid10:%s: Operation continuing on %d devices.\n",
+ mdname(mddev), bdevname(rdev->bdev, b),
+ mdname(mddev), conf->geo.raid_disks - mddev->degraded);
+}
+
+static void print_conf(struct r10conf *conf)
+{
+ int i;
+ struct raid10_info *tmp;
+
+ printk(KERN_DEBUG "RAID10 conf printout:\n");
+ if (!conf) {
+ printk(KERN_DEBUG "(!conf)\n");
+ return;
+ }
+ printk(KERN_DEBUG " --- wd:%d rd:%d\n", conf->geo.raid_disks - conf->mddev->degraded,
+ conf->geo.raid_disks);
+
+ for (i = 0; i < conf->geo.raid_disks; i++) {
+ char b[BDEVNAME_SIZE];
+ tmp = conf->mirrors + i;
+ if (tmp->rdev)
+ printk(KERN_DEBUG " disk %d, wo:%d, o:%d, dev:%s\n",
+ i, !test_bit(In_sync, &tmp->rdev->flags),
+ !test_bit(Faulty, &tmp->rdev->flags),
+ bdevname(tmp->rdev->bdev,b));
+ }
+}
+
+static void close_sync(struct r10conf *conf)
+{
+ wait_barrier(conf);
+ allow_barrier(conf);
+
+ mempool_destroy(conf->r10buf_pool);
+ conf->r10buf_pool = NULL;
+}
+
+static int raid10_spare_active(struct mddev *mddev)
+{
+ int i;
+ struct r10conf *conf = mddev->private;
+ struct raid10_info *tmp;
+ int count = 0;
+ unsigned long flags;
+
+ /*
+ * Find all non-in_sync disks within the RAID10 configuration
+ * and mark them in_sync
+ */
+ for (i = 0; i < conf->geo.raid_disks; i++) {
+ tmp = conf->mirrors + i;
+ if (tmp->replacement
+ && tmp->replacement->recovery_offset == MaxSector
+ && !test_bit(Faulty, &tmp->replacement->flags)
+ && !test_and_set_bit(In_sync, &tmp->replacement->flags)) {
+ /* Replacement has just become active */
+ if (!tmp->rdev
+ || !test_and_clear_bit(In_sync, &tmp->rdev->flags))
+ count++;
+ if (tmp->rdev) {
+ /* Replaced device not technically faulty,
+ * but we need to be sure it gets removed
+ * and never re-added.
+ */
+ set_bit(Faulty, &tmp->rdev->flags);
+ sysfs_notify_dirent_safe(
+ tmp->rdev->sysfs_state);
+ }
+ sysfs_notify_dirent_safe(tmp->replacement->sysfs_state);
+ } else if (tmp->rdev
+ && tmp->rdev->recovery_offset == MaxSector
+ && !test_bit(Faulty, &tmp->rdev->flags)
+ && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
+ count++;
+ sysfs_notify_dirent_safe(tmp->rdev->sysfs_state);
+ }
+ }
+ spin_lock_irqsave(&conf->device_lock, flags);
+ mddev->degraded -= count;
+ spin_unlock_irqrestore(&conf->device_lock, flags);
+
+ print_conf(conf);
+ return count;
+}
+
+static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
+{
+ struct r10conf *conf = mddev->private;
+ int err = -EEXIST;
+ int mirror;
+ int first = 0;
+ int last = conf->geo.raid_disks - 1;
+ struct request_queue *q = bdev_get_queue(rdev->bdev);
+
+ if (mddev->recovery_cp < MaxSector)
+ /* only hot-add to in-sync arrays, as recovery is
+ * very different from resync
+ */
+ return -EBUSY;
+ if (rdev->saved_raid_disk < 0 && !_enough(conf, 1, -1))
+ return -EINVAL;
+
+ if (rdev->raid_disk >= 0)
+ first = last = rdev->raid_disk;
+
+ if (q->merge_bvec_fn) {
+ set_bit(Unmerged, &rdev->flags);
+ mddev->merge_check_needed = 1;
+ }
+
+ if (rdev->saved_raid_disk >= first &&
+ conf->mirrors[rdev->saved_raid_disk].rdev == NULL)
+ mirror = rdev->saved_raid_disk;
+ else
+ mirror = first;
+ for ( ; mirror <= last ; mirror++) {
+ struct raid10_info *p = &conf->mirrors[mirror];
+ if (p->recovery_disabled == mddev->recovery_disabled)
+ continue;
+ if (p->rdev) {
+ if (!test_bit(WantReplacement, &p->rdev->flags) ||
+ p->replacement != NULL)
+ continue;
+ clear_bit(In_sync, &rdev->flags);
+ set_bit(Replacement, &rdev->flags);
+ rdev->raid_disk = mirror;
+ err = 0;
+ if (mddev->gendisk)
+ disk_stack_limits(mddev->gendisk, rdev->bdev,
+ rdev->data_offset << 9);
+ conf->fullsync = 1;
+ rcu_assign_pointer(p->replacement, rdev);
+ break;
+ }
+
+ if (mddev->gendisk)
+ disk_stack_limits(mddev->gendisk, rdev->bdev,
+ rdev->data_offset << 9);
+
+ p->head_position = 0;
+ p->recovery_disabled = mddev->recovery_disabled - 1;
+ rdev->raid_disk = mirror;
+ err = 0;
+ if (rdev->saved_raid_disk != mirror)
+ conf->fullsync = 1;
+ rcu_assign_pointer(p->rdev, rdev);
+ break;
+ }
+ if (err == 0 && test_bit(Unmerged, &rdev->flags)) {
+ /* Some requests might not have seen this new
+ * merge_bvec_fn. We must wait for them to complete
+ * before merging the device fully.
+ * First we make sure any code which has tested
+ * our function has submitted the request, then
+ * we wait for all outstanding requests to complete.
+ */
+ synchronize_sched();
+ freeze_array(conf, 0);
+ unfreeze_array(conf);
+ clear_bit(Unmerged, &rdev->flags);
+ }
+ md_integrity_add_rdev(rdev, mddev);
+ if (mddev->queue && blk_queue_discard(bdev_get_queue(rdev->bdev)))
+ queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
+
+ print_conf(conf);
+ return err;
+}
+
+static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
+{
+ struct r10conf *conf = mddev->private;
+ int err = 0;
+ int number = rdev->raid_disk;
+ struct md_rdev **rdevp;
+ struct raid10_info *p = conf->mirrors + number;
+
+ print_conf(conf);
+ if (rdev == p->rdev)
+ rdevp = &p->rdev;
+ else if (rdev == p->replacement)
+ rdevp = &p->replacement;
+ else
+ return 0;
+
+ if (test_bit(In_sync, &rdev->flags) ||
+ atomic_read(&rdev->nr_pending)) {
+ err = -EBUSY;
+ goto abort;
+ }
+ /* Only remove faulty devices if recovery
+ * is not possible.
+ */
+ if (!test_bit(Faulty, &rdev->flags) &&
+ mddev->recovery_disabled != p->recovery_disabled &&
+ (!p->replacement || p->replacement == rdev) &&
+ number < conf->geo.raid_disks &&
+ enough(conf, -1)) {
+ err = -EBUSY;
+ goto abort;
+ }
+ *rdevp = NULL;
+ synchronize_rcu();
+ if (atomic_read(&rdev->nr_pending)) {
+ /* lost the race, try later */
+ err = -EBUSY;
+ *rdevp = rdev;
+ goto abort;
+ } else if (p->replacement) {
+ /* We must have just cleared 'rdev' */
+ p->rdev = p->replacement;
+ clear_bit(Replacement, &p->replacement->flags);
+ smp_mb(); /* Make sure other CPUs may see both as identical
+ * but will never see neither -- if they are careful.
+ */
+ p->replacement = NULL;
+ clear_bit(WantReplacement, &rdev->flags);
+ } else
+ /* We might have just remove the Replacement as faulty
+ * Clear the flag just in case
+ */
+ clear_bit(WantReplacement, &rdev->flags);
+
+ err = md_integrity_register(mddev);
+
+abort:
+
+ print_conf(conf);
+ return err;
+}
+
+static void end_sync_read(struct bio *bio, int error)
+{
+ struct r10bio *r10_bio = bio->bi_private;
+ struct r10conf *conf = r10_bio->mddev->private;
+ int d;
+
+ if (bio == r10_bio->master_bio) {
+ /* this is a reshape read */
+ d = r10_bio->read_slot; /* really the read dev */
+ } else
+ d = find_bio_disk(conf, r10_bio, bio, NULL, NULL);
+
+ if (test_bit(BIO_UPTODATE, &bio->bi_flags))
+ set_bit(R10BIO_Uptodate, &r10_bio->state);
+ else
+ /* The write handler will notice the lack of
+ * R10BIO_Uptodate and record any errors etc
+ */
+ atomic_add(r10_bio->sectors,
+ &conf->mirrors[d].rdev->corrected_errors);
+
+ /* for reconstruct, we always reschedule after a read.
+ * for resync, only after all reads
+ */
+ rdev_dec_pending(conf->mirrors[d].rdev, conf->mddev);
+ if (test_bit(R10BIO_IsRecover, &r10_bio->state) ||
+ atomic_dec_and_test(&r10_bio->remaining)) {
+ /* we have read all the blocks,
+ * do the comparison in process context in raid10d
+ */
+ reschedule_retry(r10_bio);
+ }
+}
+
+static void end_sync_request(struct r10bio *r10_bio)
+{
+ struct mddev *mddev = r10_bio->mddev;
+
+ while (atomic_dec_and_test(&r10_bio->remaining)) {
+ if (r10_bio->master_bio == NULL) {
+ /* the primary of several recovery bios */
+ sector_t s = r10_bio->sectors;
+ if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
+ test_bit(R10BIO_WriteError, &r10_bio->state))
+ reschedule_retry(r10_bio);
+ else
+ put_buf(r10_bio);
+ md_done_sync(mddev, s, 1);
+ break;
+ } else {
+ struct r10bio *r10_bio2 = (struct r10bio *)r10_bio->master_bio;
+ if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
+ test_bit(R10BIO_WriteError, &r10_bio->state))
+ reschedule_retry(r10_bio);
+ else
+ put_buf(r10_bio);
+ r10_bio = r10_bio2;
+ }
+ }
+}
+
+static void end_sync_write(struct bio *bio, int error)
+{
+ int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+ struct r10bio *r10_bio = bio->bi_private;
+ struct mddev *mddev = r10_bio->mddev;
+ struct r10conf *conf = mddev->private;
+ int d;
+ sector_t first_bad;
+ int bad_sectors;
+ int slot;
+ int repl;
+ struct md_rdev *rdev = NULL;
+
+ d = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
+ if (repl)
+ rdev = conf->mirrors[d].replacement;
+ else
+ rdev = conf->mirrors[d].rdev;
+
+ if (!uptodate) {
+ if (repl)
+ md_error(mddev, rdev);
+ else {
+ set_bit(WriteErrorSeen, &rdev->flags);
+ if (!test_and_set_bit(WantReplacement, &rdev->flags))
+ set_bit(MD_RECOVERY_NEEDED,
+ &rdev->mddev->recovery);
+ set_bit(R10BIO_WriteError, &r10_bio->state);
+ }
+ } else if (is_badblock(rdev,
+ r10_bio->devs[slot].addr,
+ r10_bio->sectors,
+ &first_bad, &bad_sectors))
+ set_bit(R10BIO_MadeGood, &r10_bio->state);
+
+ rdev_dec_pending(rdev, mddev);
+
+ end_sync_request(r10_bio);
+}
+
+/*
+ * Note: sync and recover and handled very differently for raid10
+ * This code is for resync.
+ * For resync, we read through virtual addresses and read all blocks.
+ * If there is any error, we schedule a write. The lowest numbered
+ * drive is authoritative.
+ * However requests come for physical address, so we need to map.
+ * For every physical address there are raid_disks/copies virtual addresses,
+ * which is always are least one, but is not necessarly an integer.
+ * This means that a physical address can span multiple chunks, so we may
+ * have to submit multiple io requests for a single sync request.
+ */
+/*
+ * We check if all blocks are in-sync and only write to blocks that
+ * aren't in sync
+ */
+static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
+{
+ struct r10conf *conf = mddev->private;
+ int i, first;
+ struct bio *tbio, *fbio;
+ int vcnt;
+
+ atomic_set(&r10_bio->remaining, 1);
+
+ /* find the first device with a block */
+ for (i=0; i<conf->copies; i++)
+ if (test_bit(BIO_UPTODATE, &r10_bio->devs[i].bio->bi_flags))
+ break;
+
+ if (i == conf->copies)
+ goto done;
+
+ first = i;
+ fbio = r10_bio->devs[i].bio;
+
+ vcnt = (r10_bio->sectors + (PAGE_SIZE >> 9) - 1) >> (PAGE_SHIFT - 9);
+ /* now find blocks with errors */
+ for (i=0 ; i < conf->copies ; i++) {
+ int j, d;
+
+ tbio = r10_bio->devs[i].bio;
+
+ if (tbio->bi_end_io != end_sync_read)
+ continue;
+ if (i == first)
+ continue;
+ if (test_bit(BIO_UPTODATE, &r10_bio->devs[i].bio->bi_flags)) {
+ /* We know that the bi_io_vec layout is the same for
+ * both 'first' and 'i', so we just compare them.
+ * All vec entries are PAGE_SIZE;
+ */
+ int sectors = r10_bio->sectors;
+ for (j = 0; j < vcnt; j++) {
+ int len = PAGE_SIZE;
+ if (sectors < (len / 512))
+ len = sectors * 512;
+ if (memcmp(page_address(fbio->bi_io_vec[j].bv_page),
+ page_address(tbio->bi_io_vec[j].bv_page),
+ len))
+ break;
+ sectors -= len/512;
+ }
+ if (j == vcnt)
+ continue;
+ atomic64_add(r10_bio->sectors, &mddev->resync_mismatches);
+ if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
+ /* Don't fix anything. */
+ continue;
+ }
+ /* Ok, we need to write this bio, either to correct an
+ * inconsistency or to correct an unreadable block.
+ * First we need to fixup bv_offset, bv_len and
+ * bi_vecs, as the read request might have corrupted these
+ */
+ bio_reset(tbio);
+
+ tbio->bi_vcnt = vcnt;
+ tbio->bi_iter.bi_size = r10_bio->sectors << 9;
+ tbio->bi_rw = WRITE;
+ tbio->bi_private = r10_bio;
+ tbio->bi_iter.bi_sector = r10_bio->devs[i].addr;
+
+ for (j=0; j < vcnt ; j++) {
+ tbio->bi_io_vec[j].bv_offset = 0;
+ tbio->bi_io_vec[j].bv_len = PAGE_SIZE;
+
+ memcpy(page_address(tbio->bi_io_vec[j].bv_page),
+ page_address(fbio->bi_io_vec[j].bv_page),
+ PAGE_SIZE);
+ }
+ tbio->bi_end_io = end_sync_write;
+
+ d = r10_bio->devs[i].devnum;
+ atomic_inc(&conf->mirrors[d].rdev->nr_pending);
+ atomic_inc(&r10_bio->remaining);
+ md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(tbio));
+
+ tbio->bi_iter.bi_sector += conf->mirrors[d].rdev->data_offset;
+ tbio->bi_bdev = conf->mirrors[d].rdev->bdev;
+ generic_make_request(tbio);
+ }
+
+ /* Now write out to any replacement devices
+ * that are active
+ */
+ for (i = 0; i < conf->copies; i++) {
+ int j, d;
+
+ tbio = r10_bio->devs[i].repl_bio;
+ if (!tbio || !tbio->bi_end_io)
+ continue;
+ if (r10_bio->devs[i].bio->bi_end_io != end_sync_write
+ && r10_bio->devs[i].bio != fbio)
+ for (j = 0; j < vcnt; j++)
+ memcpy(page_address(tbio->bi_io_vec[j].bv_page),
+ page_address(fbio->bi_io_vec[j].bv_page),
+ PAGE_SIZE);
+ d = r10_bio->devs[i].devnum;
+ atomic_inc(&r10_bio->remaining);
+ md_sync_acct(conf->mirrors[d].replacement->bdev,
+ bio_sectors(tbio));
+ generic_make_request(tbio);
+ }
+
+done:
+ if (atomic_dec_and_test(&r10_bio->remaining)) {
+ md_done_sync(mddev, r10_bio->sectors, 1);
+ put_buf(r10_bio);
+ }
+}
+
+/*
+ * Now for the recovery code.
+ * Recovery happens across physical sectors.
+ * We recover all non-is_sync drives by finding the virtual address of
+ * each, and then choose a working drive that also has that virt address.
+ * There is a separate r10_bio for each non-in_sync drive.
+ * Only the first two slots are in use. The first for reading,
+ * The second for writing.
+ *
+ */
+static void fix_recovery_read_error(struct r10bio *r10_bio)
+{
+ /* We got a read error during recovery.
+ * We repeat the read in smaller page-sized sections.
+ * If a read succeeds, write it to the new device or record
+ * a bad block if we cannot.
+ * If a read fails, record a bad block on both old and
+ * new devices.
+ */
+ struct mddev *mddev = r10_bio->mddev;
+ struct r10conf *conf = mddev->private;
+ struct bio *bio = r10_bio->devs[0].bio;
+ sector_t sect = 0;
+ int sectors = r10_bio->sectors;
+ int idx = 0;
+ int dr = r10_bio->devs[0].devnum;
+ int dw = r10_bio->devs[1].devnum;
+
+ while (sectors) {
+ int s = sectors;
+ struct md_rdev *rdev;
+ sector_t addr;
+ int ok;
+
+ if (s > (PAGE_SIZE>>9))
+ s = PAGE_SIZE >> 9;
+
+ rdev = conf->mirrors[dr].rdev;
+ addr = r10_bio->devs[0].addr + sect,
+ ok = sync_page_io(rdev,
+ addr,
+ s << 9,
+ bio->bi_io_vec[idx].bv_page,
+ READ, false);
+ if (ok) {
+ rdev = conf->mirrors[dw].rdev;
+ addr = r10_bio->devs[1].addr + sect;
+ ok = sync_page_io(rdev,
+ addr,
+ s << 9,
+ bio->bi_io_vec[idx].bv_page,
+ WRITE, false);
+ if (!ok) {
+ set_bit(WriteErrorSeen, &rdev->flags);
+ if (!test_and_set_bit(WantReplacement,
+ &rdev->flags))
+ set_bit(MD_RECOVERY_NEEDED,
+ &rdev->mddev->recovery);
+ }
+ }
+ if (!ok) {
+ /* We don't worry if we cannot set a bad block -
+ * it really is bad so there is no loss in not
+ * recording it yet
+ */
+ rdev_set_badblocks(rdev, addr, s, 0);
+
+ if (rdev != conf->mirrors[dw].rdev) {
+ /* need bad block on destination too */
+ struct md_rdev *rdev2 = conf->mirrors[dw].rdev;
+ addr = r10_bio->devs[1].addr + sect;
+ ok = rdev_set_badblocks(rdev2, addr, s, 0);
+ if (!ok) {
+ /* just abort the recovery */
+ printk(KERN_NOTICE
+ "md/raid10:%s: recovery aborted"
+ " due to read error\n",
+ mdname(mddev));
+
+ conf->mirrors[dw].recovery_disabled
+ = mddev->recovery_disabled;
+ set_bit(MD_RECOVERY_INTR,
+ &mddev->recovery);
+ break;
+ }
+ }
+ }
+
+ sectors -= s;
+ sect += s;
+ idx++;
+ }
+}
+
+static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio)
+{
+ struct r10conf *conf = mddev->private;
+ int d;
+ struct bio *wbio, *wbio2;
+
+ if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) {
+ fix_recovery_read_error(r10_bio);
+ end_sync_request(r10_bio);
+ return;
+ }
+
+ /*
+ * share the pages with the first bio
+ * and submit the write request
+ */
+ d = r10_bio->devs[1].devnum;
+ wbio = r10_bio->devs[1].bio;
+ wbio2 = r10_bio->devs[1].repl_bio;
+ /* Need to test wbio2->bi_end_io before we call
+ * generic_make_request as if the former is NULL,
+ * the latter is free to free wbio2.
+ */
+ if (wbio2 && !wbio2->bi_end_io)
+ wbio2 = NULL;
+ if (wbio->bi_end_io) {
+ atomic_inc(&conf->mirrors[d].rdev->nr_pending);
+ md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(wbio));
+ generic_make_request(wbio);
+ }
+ if (wbio2) {
+ atomic_inc(&conf->mirrors[d].replacement->nr_pending);
+ md_sync_acct(conf->mirrors[d].replacement->bdev,
+ bio_sectors(wbio2));
+ generic_make_request(wbio2);
+ }
+}
+
+/*
+ * Used by fix_read_error() to decay the per rdev read_errors.
+ * We halve the read error count for every hour that has elapsed
+ * since the last recorded read error.
+ *
+ */
+static void check_decay_read_errors(struct mddev *mddev, struct md_rdev *rdev)
+{
+ struct timespec cur_time_mon;
+ unsigned long hours_since_last;
+ unsigned int read_errors = atomic_read(&rdev->read_errors);
+
+ ktime_get_ts(&cur_time_mon);
+
+ if (rdev->last_read_error.tv_sec == 0 &&
+ rdev->last_read_error.tv_nsec == 0) {
+ /* first time we've seen a read error */
+ rdev->last_read_error = cur_time_mon;
+ return;
+ }
+
+ hours_since_last = (cur_time_mon.tv_sec -
+ rdev->last_read_error.tv_sec) / 3600;
+
+ rdev->last_read_error = cur_time_mon;
+
+ /*
+ * if hours_since_last is > the number of bits in read_errors
+ * just set read errors to 0. We do this to avoid
+ * overflowing the shift of read_errors by hours_since_last.
+ */
+ if (hours_since_last >= 8 * sizeof(read_errors))
+ atomic_set(&rdev->read_errors, 0);
+ else
+ atomic_set(&rdev->read_errors, read_errors >> hours_since_last);
+}
+
+static int r10_sync_page_io(struct md_rdev *rdev, sector_t sector,
+ int sectors, struct page *page, int rw)
+{
+ sector_t first_bad;
+ int bad_sectors;
+
+ if (is_badblock(rdev, sector, sectors, &first_bad, &bad_sectors)
+ && (rw == READ || test_bit(WriteErrorSeen, &rdev->flags)))
+ return -1;
+ if (sync_page_io(rdev, sector, sectors << 9, page, rw, false))
+ /* success */
+ return 1;
+ if (rw == WRITE) {
+ set_bit(WriteErrorSeen, &rdev->flags);
+ if (!test_and_set_bit(WantReplacement, &rdev->flags))
+ set_bit(MD_RECOVERY_NEEDED,
+ &rdev->mddev->recovery);
+ }
+ /* need to record an error - either for the block or the device */
+ if (!rdev_set_badblocks(rdev, sector, sectors, 0))
+ md_error(rdev->mddev, rdev);
+ return 0;
+}
+
+/*
+ * This is a kernel thread which:
+ *
+ * 1. Retries failed read operations on working mirrors.
+ * 2. Updates the raid superblock when problems encounter.
+ * 3. Performs writes following reads for array synchronising.
+ */
+
+static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10bio *r10_bio)
+{
+ int sect = 0; /* Offset from r10_bio->sector */
+ int sectors = r10_bio->sectors;
+ struct md_rdev*rdev;
+ int max_read_errors = atomic_read(&mddev->max_corr_read_errors);
+ int d = r10_bio->devs[r10_bio->read_slot].devnum;
+
+ /* still own a reference to this rdev, so it cannot
+ * have been cleared recently.
+ */
+ rdev = conf->mirrors[d].rdev;
+
+ if (test_bit(Faulty, &rdev->flags))
+ /* drive has already been failed, just ignore any
+ more fix_read_error() attempts */
+ return;
+
+ check_decay_read_errors(mddev, rdev);
+ atomic_inc(&rdev->read_errors);
+ if (atomic_read(&rdev->read_errors) > max_read_errors) {
+ char b[BDEVNAME_SIZE];
+ bdevname(rdev->bdev, b);
+
+ printk(KERN_NOTICE
+ "md/raid10:%s: %s: Raid device exceeded "
+ "read_error threshold [cur %d:max %d]\n",
+ mdname(mddev), b,
+ atomic_read(&rdev->read_errors), max_read_errors);
+ printk(KERN_NOTICE
+ "md/raid10:%s: %s: Failing raid device\n",
+ mdname(mddev), b);
+ md_error(mddev, conf->mirrors[d].rdev);
+ r10_bio->devs[r10_bio->read_slot].bio = IO_BLOCKED;
+ return;
+ }
+
+ while(sectors) {
+ int s = sectors;
+ int sl = r10_bio->read_slot;
+ int success = 0;
+ int start;
+
+ if (s > (PAGE_SIZE>>9))
+ s = PAGE_SIZE >> 9;
+
+ rcu_read_lock();
+ do {
+ sector_t first_bad;
+ int bad_sectors;
+
+ d = r10_bio->devs[sl].devnum;
+ rdev = rcu_dereference(conf->mirrors[d].rdev);
+ if (rdev &&
+ !test_bit(Unmerged, &rdev->flags) &&
+ test_bit(In_sync, &rdev->flags) &&
+ is_badblock(rdev, r10_bio->devs[sl].addr + sect, s,
+ &first_bad, &bad_sectors) == 0) {
+ atomic_inc(&rdev->nr_pending);
+ rcu_read_unlock();
+ success = sync_page_io(rdev,
+ r10_bio->devs[sl].addr +
+ sect,
+ s<<9,
+ conf->tmppage, READ, false);
+ rdev_dec_pending(rdev, mddev);
+ rcu_read_lock();
+ if (success)
+ break;
+ }
+ sl++;
+ if (sl == conf->copies)
+ sl = 0;
+ } while (!success && sl != r10_bio->read_slot);
+ rcu_read_unlock();
+
+ if (!success) {
+ /* Cannot read from anywhere, just mark the block
+ * as bad on the first device to discourage future
+ * reads.
+ */
+ int dn = r10_bio->devs[r10_bio->read_slot].devnum;
+ rdev = conf->mirrors[dn].rdev;
+
+ if (!rdev_set_badblocks(
+ rdev,
+ r10_bio->devs[r10_bio->read_slot].addr
+ + sect,
+ s, 0)) {
+ md_error(mddev, rdev);
+ r10_bio->devs[r10_bio->read_slot].bio
+ = IO_BLOCKED;
+ }
+ break;
+ }
+
+ start = sl;
+ /* write it back and re-read */
+ rcu_read_lock();
+ while (sl != r10_bio->read_slot) {
+ char b[BDEVNAME_SIZE];
+
+ if (sl==0)
+ sl = conf->copies;
+ sl--;
+ d = r10_bio->devs[sl].devnum;
+ rdev = rcu_dereference(conf->mirrors[d].rdev);
+ if (!rdev ||
+ test_bit(Unmerged, &rdev->flags) ||
+ !test_bit(In_sync, &rdev->flags))
+ continue;
+
+ atomic_inc(&rdev->nr_pending);
+ rcu_read_unlock();
+ if (r10_sync_page_io(rdev,
+ r10_bio->devs[sl].addr +
+ sect,
+ s, conf->tmppage, WRITE)
+ == 0) {
+ /* Well, this device is dead */
+ printk(KERN_NOTICE
+ "md/raid10:%s: read correction "
+ "write failed"
+ " (%d sectors at %llu on %s)\n",
+ mdname(mddev), s,
+ (unsigned long long)(
+ sect +
+ choose_data_offset(r10_bio,
+ rdev)),
+ bdevname(rdev->bdev, b));
+ printk(KERN_NOTICE "md/raid10:%s: %s: failing "
+ "drive\n",
+ mdname(mddev),
+ bdevname(rdev->bdev, b));
+ }
+ rdev_dec_pending(rdev, mddev);
+ rcu_read_lock();
+ }
+ sl = start;
+ while (sl != r10_bio->read_slot) {
+ char b[BDEVNAME_SIZE];
+
+ if (sl==0)
+ sl = conf->copies;
+ sl--;
+ d = r10_bio->devs[sl].devnum;
+ rdev = rcu_dereference(conf->mirrors[d].rdev);
+ if (!rdev ||
+ !test_bit(In_sync, &rdev->flags))
+ continue;
+
+ atomic_inc(&rdev->nr_pending);
+ rcu_read_unlock();
+ switch (r10_sync_page_io(rdev,
+ r10_bio->devs[sl].addr +
+ sect,
+ s, conf->tmppage,
+ READ)) {
+ case 0:
+ /* Well, this device is dead */
+ printk(KERN_NOTICE
+ "md/raid10:%s: unable to read back "
+ "corrected sectors"
+ " (%d sectors at %llu on %s)\n",
+ mdname(mddev), s,
+ (unsigned long long)(
+ sect +
+ choose_data_offset(r10_bio, rdev)),
+ bdevname(rdev->bdev, b));
+ printk(KERN_NOTICE "md/raid10:%s: %s: failing "
+ "drive\n",
+ mdname(mddev),
+ bdevname(rdev->bdev, b));
+ break;
+ case 1:
+ printk(KERN_INFO
+ "md/raid10:%s: read error corrected"
+ " (%d sectors at %llu on %s)\n",
+ mdname(mddev), s,
+ (unsigned long long)(
+ sect +
+ choose_data_offset(r10_bio, rdev)),
+ bdevname(rdev->bdev, b));
+ atomic_add(s, &rdev->corrected_errors);
+ }
+
+ rdev_dec_pending(rdev, mddev);
+ rcu_read_lock();
+ }
+ rcu_read_unlock();
+
+ sectors -= s;
+ sect += s;
+ }
+}
+
+static int narrow_write_error(struct r10bio *r10_bio, int i)
+{
+ struct bio *bio = r10_bio->master_bio;
+ struct mddev *mddev = r10_bio->mddev;
+ struct r10conf *conf = mddev->private;
+ struct md_rdev *rdev = conf->mirrors[r10_bio->devs[i].devnum].rdev;
+ /* bio has the data to be written to slot 'i' where
+ * we just recently had a write error.
+ * We repeatedly clone the bio and trim down to one block,
+ * then try the write. Where the write fails we record
+ * a bad block.
+ * It is conceivable that the bio doesn't exactly align with
+ * blocks. We must handle this.
+ *
+ * We currently own a reference to the rdev.
+ */
+
+ int block_sectors;
+ sector_t sector;
+ int sectors;
+ int sect_to_write = r10_bio->sectors;
+ int ok = 1;
+
+ if (rdev->badblocks.shift < 0)
+ return 0;
+
+ block_sectors = roundup(1 << rdev->badblocks.shift,
+ bdev_logical_block_size(rdev->bdev) >> 9);
+ sector = r10_bio->sector;
+ sectors = ((r10_bio->sector + block_sectors)
+ & ~(sector_t)(block_sectors - 1))
+ - sector;
+
+ while (sect_to_write) {
+ struct bio *wbio;
+ if (sectors > sect_to_write)
+ sectors = sect_to_write;
+ /* Write at 'sector' for 'sectors' */
+ wbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
+ bio_trim(wbio, sector - bio->bi_iter.bi_sector, sectors);
+ wbio->bi_iter.bi_sector = (r10_bio->devs[i].addr+
+ choose_data_offset(r10_bio, rdev) +
+ (sector - r10_bio->sector));
+ wbio->bi_bdev = rdev->bdev;
+ if (submit_bio_wait(WRITE, wbio) == 0)
+ /* Failure! */
+ ok = rdev_set_badblocks(rdev, sector,
+ sectors, 0)
+ && ok;
+
+ bio_put(wbio);
+ sect_to_write -= sectors;
+ sector += sectors;
+ sectors = block_sectors;
+ }
+ return ok;
+}
+
+static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
+{
+ int slot = r10_bio->read_slot;
+ struct bio *bio;
+ struct r10conf *conf = mddev->private;
+ struct md_rdev *rdev = r10_bio->devs[slot].rdev;
+ char b[BDEVNAME_SIZE];
+ unsigned long do_sync;
+ int max_sectors;
+
+ /* we got a read error. Maybe the drive is bad. Maybe just
+ * the block and we can fix it.
+ * We freeze all other IO, and try reading the block from
+ * other devices. When we find one, we re-write
+ * and check it that fixes the read error.
+ * This is all done synchronously while the array is
+ * frozen.
+ */
+ bio = r10_bio->devs[slot].bio;
+ bdevname(bio->bi_bdev, b);
+ bio_put(bio);
+ r10_bio->devs[slot].bio = NULL;
+
+ if (mddev->ro == 0) {
+ freeze_array(conf, 1);
+ fix_read_error(conf, mddev, r10_bio);
+ unfreeze_array(conf);
+ } else
+ r10_bio->devs[slot].bio = IO_BLOCKED;
+
+ rdev_dec_pending(rdev, mddev);
+
+read_more:
+ rdev = read_balance(conf, r10_bio, &max_sectors);
+ if (rdev == NULL) {
+ printk(KERN_ALERT "md/raid10:%s: %s: unrecoverable I/O"
+ " read error for block %llu\n",
+ mdname(mddev), b,
+ (unsigned long long)r10_bio->sector);
+ raid_end_bio_io(r10_bio);
+ return;
+ }
+
+ do_sync = (r10_bio->master_bio->bi_rw & REQ_SYNC);
+ slot = r10_bio->read_slot;
+ printk_ratelimited(
+ KERN_ERR
+ "md/raid10:%s: %s: redirecting "
+ "sector %llu to another mirror\n",
+ mdname(mddev),
+ bdevname(rdev->bdev, b),
+ (unsigned long long)r10_bio->sector);
+ bio = bio_clone_mddev(r10_bio->master_bio,
+ GFP_NOIO, mddev);
+ bio_trim(bio, r10_bio->sector - bio->bi_iter.bi_sector, max_sectors);
+ r10_bio->devs[slot].bio = bio;
+ r10_bio->devs[slot].rdev = rdev;
+ bio->bi_iter.bi_sector = r10_bio->devs[slot].addr
+ + choose_data_offset(r10_bio, rdev);
+ bio->bi_bdev = rdev->bdev;
+ bio->bi_rw = READ | do_sync;
+ bio->bi_private = r10_bio;
+ bio->bi_end_io = raid10_end_read_request;
+ if (max_sectors < r10_bio->sectors) {
+ /* Drat - have to split this up more */
+ struct bio *mbio = r10_bio->master_bio;
+ int sectors_handled =
+ r10_bio->sector + max_sectors
+ - mbio->bi_iter.bi_sector;
+ r10_bio->sectors = max_sectors;
+ spin_lock_irq(&conf->device_lock);
+ if (mbio->bi_phys_segments == 0)
+ mbio->bi_phys_segments = 2;
+ else
+ mbio->bi_phys_segments++;
+ spin_unlock_irq(&conf->device_lock);
+ generic_make_request(bio);
+
+ r10_bio = mempool_alloc(conf->r10bio_pool,
+ GFP_NOIO);
+ r10_bio->master_bio = mbio;
+ r10_bio->sectors = bio_sectors(mbio) - sectors_handled;
+ r10_bio->state = 0;
+ set_bit(R10BIO_ReadError,
+ &r10_bio->state);
+ r10_bio->mddev = mddev;
+ r10_bio->sector = mbio->bi_iter.bi_sector
+ + sectors_handled;
+
+ goto read_more;
+ } else
+ generic_make_request(bio);
+}
+
+static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
+{
+ /* Some sort of write request has finished and it
+ * succeeded in writing where we thought there was a
+ * bad block. So forget the bad block.
+ * Or possibly if failed and we need to record
+ * a bad block.
+ */
+ int m;
+ struct md_rdev *rdev;
+
+ if (test_bit(R10BIO_IsSync, &r10_bio->state) ||
+ test_bit(R10BIO_IsRecover, &r10_bio->state)) {
+ for (m = 0; m < conf->copies; m++) {
+ int dev = r10_bio->devs[m].devnum;
+ rdev = conf->mirrors[dev].rdev;
+ if (r10_bio->devs[m].bio == NULL)
+ continue;
+ if (test_bit(BIO_UPTODATE,
+ &r10_bio->devs[m].bio->bi_flags)) {
+ rdev_clear_badblocks(
+ rdev,
+ r10_bio->devs[m].addr,
+ r10_bio->sectors, 0);
+ } else {
+ if (!rdev_set_badblocks(
+ rdev,
+ r10_bio->devs[m].addr,
+ r10_bio->sectors, 0))
+ md_error(conf->mddev, rdev);
+ }
+ rdev = conf->mirrors[dev].replacement;
+ if (r10_bio->devs[m].repl_bio == NULL)
+ continue;
+ if (test_bit(BIO_UPTODATE,
+ &r10_bio->devs[m].repl_bio->bi_flags)) {
+ rdev_clear_badblocks(
+ rdev,
+ r10_bio->devs[m].addr,
+ r10_bio->sectors, 0);
+ } else {
+ if (!rdev_set_badblocks(
+ rdev,
+ r10_bio->devs[m].addr,
+ r10_bio->sectors, 0))
+ md_error(conf->mddev, rdev);
+ }
+ }
+ put_buf(r10_bio);
+ } else {
+ for (m = 0; m < conf->copies; m++) {
+ int dev = r10_bio->devs[m].devnum;
+ struct bio *bio = r10_bio->devs[m].bio;
+ rdev = conf->mirrors[dev].rdev;
+ if (bio == IO_MADE_GOOD) {
+ rdev_clear_badblocks(
+ rdev,
+ r10_bio->devs[m].addr,
+ r10_bio->sectors, 0);
+ rdev_dec_pending(rdev, conf->mddev);
+ } else if (bio != NULL &&
+ !test_bit(BIO_UPTODATE, &bio->bi_flags)) {
+ if (!narrow_write_error(r10_bio, m)) {
+ md_error(conf->mddev, rdev);
+ set_bit(R10BIO_Degraded,
+ &r10_bio->state);
+ }
+ rdev_dec_pending(rdev, conf->mddev);
+ }
+ bio = r10_bio->devs[m].repl_bio;
+ rdev = conf->mirrors[dev].replacement;
+ if (rdev && bio == IO_MADE_GOOD) {
+ rdev_clear_badblocks(
+ rdev,
+ r10_bio->devs[m].addr,
+ r10_bio->sectors, 0);
+ rdev_dec_pending(rdev, conf->mddev);
+ }
+ }
+ if (test_bit(R10BIO_WriteError,
+ &r10_bio->state))
+ close_write(r10_bio);
+ raid_end_bio_io(r10_bio);
+ }
+}
+
+static void raid10d(struct md_thread *thread)
+{
+ struct mddev *mddev = thread->mddev;
+ struct r10bio *r10_bio;
+ unsigned long flags;
+ struct r10conf *conf = mddev->private;
+ struct list_head *head = &conf->retry_list;
+ struct blk_plug plug;
+
+ md_check_recovery(mddev);
+
+ blk_start_plug(&plug);
+ for (;;) {
+
+ flush_pending_writes(conf);
+
+ spin_lock_irqsave(&conf->device_lock, flags);
+ if (list_empty(head)) {
+ spin_unlock_irqrestore(&conf->device_lock, flags);
+ break;
+ }
+ r10_bio = list_entry(head->prev, struct r10bio, retry_list);
+ list_del(head->prev);
+ conf->nr_queued--;
+ spin_unlock_irqrestore(&conf->device_lock, flags);
+
+ mddev = r10_bio->mddev;
+ conf = mddev->private;
+ if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
+ test_bit(R10BIO_WriteError, &r10_bio->state))
+ handle_write_completed(conf, r10_bio);
+ else if (test_bit(R10BIO_IsReshape, &r10_bio->state))
+ reshape_request_write(mddev, r10_bio);
+ else if (test_bit(R10BIO_IsSync, &r10_bio->state))
+ sync_request_write(mddev, r10_bio);
+ else if (test_bit(R10BIO_IsRecover, &r10_bio->state))
+ recovery_request_write(mddev, r10_bio);
+ else if (test_bit(R10BIO_ReadError, &r10_bio->state))
+ handle_read_error(mddev, r10_bio);
+ else {
+ /* just a partial read to be scheduled from a
+ * separate context
+ */
+ int slot = r10_bio->read_slot;
+ generic_make_request(r10_bio->devs[slot].bio);
+ }
+
+ cond_resched();
+ if (mddev->flags & ~(1<<MD_CHANGE_PENDING))
+ md_check_recovery(mddev);
+ }
+ blk_finish_plug(&plug);
+}
+
+static int init_resync(struct r10conf *conf)
+{
+ int buffs;
+ int i;
+
+ buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE;
+ BUG_ON(conf->r10buf_pool);
+ conf->have_replacement = 0;
+ for (i = 0; i < conf->geo.raid_disks; i++)
+ if (conf->mirrors[i].replacement)
+ conf->have_replacement = 1;
+ conf->r10buf_pool = mempool_create(buffs, r10buf_pool_alloc, r10buf_pool_free, conf);
+ if (!conf->r10buf_pool)
+ return -ENOMEM;
+ conf->next_resync = 0;
+ return 0;
+}
+
+/*
+ * perform a "sync" on one "block"
+ *
+ * We need to make sure that no normal I/O request - particularly write
+ * requests - conflict with active sync requests.
+ *
+ * This is achieved by tracking pending requests and a 'barrier' concept
+ * that can be installed to exclude normal IO requests.
+ *
+ * Resync and recovery are handled very differently.
+ * We differentiate by looking at MD_RECOVERY_SYNC in mddev->recovery.
+ *
+ * For resync, we iterate over virtual addresses, read all copies,
+ * and update if there are differences. If only one copy is live,
+ * skip it.
+ * For recovery, we iterate over physical addresses, read a good
+ * value for each non-in_sync drive, and over-write.
+ *
+ * So, for recovery we may have several outstanding complex requests for a
+ * given address, one for each out-of-sync device. We model this by allocating
+ * a number of r10_bio structures, one for each out-of-sync device.
+ * As we setup these structures, we collect all bio's together into a list
+ * which we then process collectively to add pages, and then process again
+ * to pass to generic_make_request.
+ *
+ * The r10_bio structures are linked using a borrowed master_bio pointer.
+ * This link is counted in ->remaining. When the r10_bio that points to NULL
+ * has its remaining count decremented to 0, the whole complex operation
+ * is complete.
+ *
+ */
+
+static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
+ int *skipped)
+{
+ struct r10conf *conf = mddev->private;
+ struct r10bio *r10_bio;
+ struct bio *biolist = NULL, *bio;
+ sector_t max_sector, nr_sectors;
+ int i;
+ int max_sync;
+ sector_t sync_blocks;
+ sector_t sectors_skipped = 0;
+ int chunks_skipped = 0;
+ sector_t chunk_mask = conf->geo.chunk_mask;
+
+ if (!conf->r10buf_pool)
+ if (init_resync(conf))
+ return 0;
+
+ /*
+ * Allow skipping a full rebuild for incremental assembly
+ * of a clean array, like RAID1 does.
+ */
+ if (mddev->bitmap == NULL &&
+ mddev->recovery_cp == MaxSector &&
+ mddev->reshape_position == MaxSector &&
+ !test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
+ !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
+ !test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
+ conf->fullsync == 0) {
+ *skipped = 1;
+ return mddev->dev_sectors - sector_nr;
+ }
+
+ skipped:
+ max_sector = mddev->dev_sectors;
+ if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
+ test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
+ max_sector = mddev->resync_max_sectors;
+ if (sector_nr >= max_sector) {
+ /* If we aborted, we need to abort the
+ * sync on the 'current' bitmap chucks (there can
+ * be several when recovering multiple devices).
+ * as we may have started syncing it but not finished.
+ * We can find the current address in
+ * mddev->curr_resync, but for recovery,
+ * we need to convert that to several
+ * virtual addresses.
+ */
+ if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
+ end_reshape(conf);
+ close_sync(conf);
+ return 0;
+ }
+
+ if (mddev->curr_resync < max_sector) { /* aborted */
+ if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
+ bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
+ &sync_blocks, 1);
+ else for (i = 0; i < conf->geo.raid_disks; i++) {
+ sector_t sect =
+ raid10_find_virt(conf, mddev->curr_resync, i);
+ bitmap_end_sync(mddev->bitmap, sect,
+ &sync_blocks, 1);
+ }
+ } else {
+ /* completed sync */
+ if ((!mddev->bitmap || conf->fullsync)
+ && conf->have_replacement
+ && test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
+ /* Completed a full sync so the replacements
+ * are now fully recovered.
+ */
+ for (i = 0; i < conf->geo.raid_disks; i++)
+ if (conf->mirrors[i].replacement)
+ conf->mirrors[i].replacement
+ ->recovery_offset
+ = MaxSector;
+ }
+ conf->fullsync = 0;
+ }
+ bitmap_close_sync(mddev->bitmap);
+ close_sync(conf);
+ *skipped = 1;
+ return sectors_skipped;
+ }
+
+ if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
+ return reshape_request(mddev, sector_nr, skipped);
+
+ if (chunks_skipped >= conf->geo.raid_disks) {
+ /* if there has been nothing to do on any drive,
+ * then there is nothing to do at all..
+ */
+ *skipped = 1;
+ return (max_sector - sector_nr) + sectors_skipped;
+ }
+
+ if (max_sector > mddev->resync_max)
+ max_sector = mddev->resync_max; /* Don't do IO beyond here */
+
+ /* make sure whole request will fit in a chunk - if chunks
+ * are meaningful
+ */
+ if (conf->geo.near_copies < conf->geo.raid_disks &&
+ max_sector > (sector_nr | chunk_mask))
+ max_sector = (sector_nr | chunk_mask) + 1;
+
+ /* Again, very different code for resync and recovery.
+ * Both must result in an r10bio with a list of bios that
+ * have bi_end_io, bi_sector, bi_bdev set,
+ * and bi_private set to the r10bio.
+ * For recovery, we may actually create several r10bios
+ * with 2 bios in each, that correspond to the bios in the main one.
+ * In this case, the subordinate r10bios link back through a
+ * borrowed master_bio pointer, and the counter in the master
+ * includes a ref from each subordinate.
+ */
+ /* First, we decide what to do and set ->bi_end_io
+ * To end_sync_read if we want to read, and
+ * end_sync_write if we will want to write.
+ */
+
+ max_sync = RESYNC_PAGES << (PAGE_SHIFT-9);
+ if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
+ /* recovery... the complicated one */
+ int j;
+ r10_bio = NULL;
+
+ for (i = 0 ; i < conf->geo.raid_disks; i++) {
+ int still_degraded;
+ struct r10bio *rb2;
+ sector_t sect;
+ int must_sync;
+ int any_working;
+ struct raid10_info *mirror = &conf->mirrors[i];
+
+ if ((mirror->rdev == NULL ||
+ test_bit(In_sync, &mirror->rdev->flags))
+ &&
+ (mirror->replacement == NULL ||
+ test_bit(Faulty,
+ &mirror->replacement->flags)))
+ continue;
+
+ still_degraded = 0;
+ /* want to reconstruct this device */
+ rb2 = r10_bio;
+ sect = raid10_find_virt(conf, sector_nr, i);
+ if (sect >= mddev->resync_max_sectors) {
+ /* last stripe is not complete - don't
+ * try to recover this sector.
+ */
+ continue;
+ }
+ /* Unless we are doing a full sync, or a replacement
+ * we only need to recover the block if it is set in
+ * the bitmap
+ */
+ must_sync = bitmap_start_sync(mddev->bitmap, sect,
+ &sync_blocks, 1);
+ if (sync_blocks < max_sync)
+ max_sync = sync_blocks;
+ if (!must_sync &&
+ mirror->replacement == NULL &&
+ !conf->fullsync) {
+ /* yep, skip the sync_blocks here, but don't assume
+ * that there will never be anything to do here
+ */
+ chunks_skipped = -1;
+ continue;
+ }
+
+ r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
+ r10_bio->state = 0;
+ raise_barrier(conf, rb2 != NULL);
+ atomic_set(&r10_bio->remaining, 0);
+
+ r10_bio->master_bio = (struct bio*)rb2;
+ if (rb2)
+ atomic_inc(&rb2->remaining);
+ r10_bio->mddev = mddev;
+ set_bit(R10BIO_IsRecover, &r10_bio->state);
+ r10_bio->sector = sect;
+
+ raid10_find_phys(conf, r10_bio);
+
+ /* Need to check if the array will still be
+ * degraded
+ */
+ for (j = 0; j < conf->geo.raid_disks; j++)
+ if (conf->mirrors[j].rdev == NULL ||
+ test_bit(Faulty, &conf->mirrors[j].rdev->flags)) {
+ still_degraded = 1;
+ break;
+ }
+
+ must_sync = bitmap_start_sync(mddev->bitmap, sect,
+ &sync_blocks, still_degraded);
+
+ any_working = 0;
+ for (j=0; j<conf->copies;j++) {
+ int k;
+ int d = r10_bio->devs[j].devnum;
+ sector_t from_addr, to_addr;
+ struct md_rdev *rdev;
+ sector_t sector, first_bad;
+ int bad_sectors;
+ if (!conf->mirrors[d].rdev ||
+ !test_bit(In_sync, &conf->mirrors[d].rdev->flags))
+ continue;
+ /* This is where we read from */
+ any_working = 1;
+ rdev = conf->mirrors[d].rdev;
+ sector = r10_bio->devs[j].addr;
+
+ if (is_badblock(rdev, sector, max_sync,
+ &first_bad, &bad_sectors)) {
+ if (first_bad > sector)
+ max_sync = first_bad - sector;
+ else {
+ bad_sectors -= (sector
+ - first_bad);
+ if (max_sync > bad_sectors)
+ max_sync = bad_sectors;
+ continue;
+ }
+ }
+ bio = r10_bio->devs[0].bio;
+ bio_reset(bio);
+ bio->bi_next = biolist;
+ biolist = bio;
+ bio->bi_private = r10_bio;
+ bio->bi_end_io = end_sync_read;
+ bio->bi_rw = READ;
+ from_addr = r10_bio->devs[j].addr;
+ bio->bi_iter.bi_sector = from_addr +
+ rdev->data_offset;
+ bio->bi_bdev = rdev->bdev;
+ atomic_inc(&rdev->nr_pending);
+ /* and we write to 'i' (if not in_sync) */
+
+ for (k=0; k<conf->copies; k++)
+ if (r10_bio->devs[k].devnum == i)
+ break;
+ BUG_ON(k == conf->copies);
+ to_addr = r10_bio->devs[k].addr;
+ r10_bio->devs[0].devnum = d;
+ r10_bio->devs[0].addr = from_addr;
+ r10_bio->devs[1].devnum = i;
+ r10_bio->devs[1].addr = to_addr;
+
+ rdev = mirror->rdev;
+ if (!test_bit(In_sync, &rdev->flags)) {
+ bio = r10_bio->devs[1].bio;
+ bio_reset(bio);
+ bio->bi_next = biolist;
+ biolist = bio;
+ bio->bi_private = r10_bio;
+ bio->bi_end_io = end_sync_write;
+ bio->bi_rw = WRITE;
+ bio->bi_iter.bi_sector = to_addr
+ + rdev->data_offset;
+ bio->bi_bdev = rdev->bdev;
+ atomic_inc(&r10_bio->remaining);
+ } else
+ r10_bio->devs[1].bio->bi_end_io = NULL;
+
+ /* and maybe write to replacement */
+ bio = r10_bio->devs[1].repl_bio;
+ if (bio)
+ bio->bi_end_io = NULL;
+ rdev = mirror->replacement;
+ /* Note: if rdev != NULL, then bio
+ * cannot be NULL as r10buf_pool_alloc will
+ * have allocated it.
+ * So the second test here is pointless.
+ * But it keeps semantic-checkers happy, and
+ * this comment keeps human reviewers
+ * happy.
+ */
+ if (rdev == NULL || bio == NULL ||
+ test_bit(Faulty, &rdev->flags))
+ break;
+ bio_reset(bio);
+ bio->bi_next = biolist;
+ biolist = bio;
+ bio->bi_private = r10_bio;
+ bio->bi_end_io = end_sync_write;
+ bio->bi_rw = WRITE;
+ bio->bi_iter.bi_sector = to_addr +
+ rdev->data_offset;
+ bio->bi_bdev = rdev->bdev;
+ atomic_inc(&r10_bio->remaining);
+ break;
+ }
+ if (j == conf->copies) {
+ /* Cannot recover, so abort the recovery or
+ * record a bad block */
+ if (any_working) {
+ /* problem is that there are bad blocks
+ * on other device(s)
+ */
+ int k;
+ for (k = 0; k < conf->copies; k++)
+ if (r10_bio->devs[k].devnum == i)
+ break;
+ if (!test_bit(In_sync,
+ &mirror->rdev->flags)
+ && !rdev_set_badblocks(
+ mirror->rdev,
+ r10_bio->devs[k].addr,
+ max_sync, 0))
+ any_working = 0;
+ if (mirror->replacement &&
+ !rdev_set_badblocks(
+ mirror->replacement,
+ r10_bio->devs[k].addr,
+ max_sync, 0))
+ any_working = 0;
+ }
+ if (!any_working) {
+ if (!test_and_set_bit(MD_RECOVERY_INTR,
+ &mddev->recovery))
+ printk(KERN_INFO "md/raid10:%s: insufficient "
+ "working devices for recovery.\n",
+ mdname(mddev));
+ mirror->recovery_disabled
+ = mddev->recovery_disabled;
+ }
+ put_buf(r10_bio);
+ if (rb2)
+ atomic_dec(&rb2->remaining);
+ r10_bio = rb2;
+ break;
+ }
+ }
+ if (biolist == NULL) {
+ while (r10_bio) {
+ struct r10bio *rb2 = r10_bio;
+ r10_bio = (struct r10bio*) rb2->master_bio;
+ rb2->master_bio = NULL;
+ put_buf(rb2);
+ }
+ goto giveup;
+ }
+ } else {
+ /* resync. Schedule a read for every block at this virt offset */
+ int count = 0;
+
+ bitmap_cond_end_sync(mddev->bitmap, sector_nr);
+
+ if (!bitmap_start_sync(mddev->bitmap, sector_nr,
+ &sync_blocks, mddev->degraded) &&
+ !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED,
+ &mddev->recovery)) {
+ /* We can skip this block */
+ *skipped = 1;
+ return sync_blocks + sectors_skipped;
+ }
+ if (sync_blocks < max_sync)
+ max_sync = sync_blocks;
+ r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
+ r10_bio->state = 0;
+
+ r10_bio->mddev = mddev;
+ atomic_set(&r10_bio->remaining, 0);
+ raise_barrier(conf, 0);
+ conf->next_resync = sector_nr;
+
+ r10_bio->master_bio = NULL;
+ r10_bio->sector = sector_nr;
+ set_bit(R10BIO_IsSync, &r10_bio->state);
+ raid10_find_phys(conf, r10_bio);
+ r10_bio->sectors = (sector_nr | chunk_mask) - sector_nr + 1;
+
+ for (i = 0; i < conf->copies; i++) {
+ int d = r10_bio->devs[i].devnum;
+ sector_t first_bad, sector;
+ int bad_sectors;
+
+ if (r10_bio->devs[i].repl_bio)
+ r10_bio->devs[i].repl_bio->bi_end_io = NULL;
+
+ bio = r10_bio->devs[i].bio;
+ bio_reset(bio);
+ clear_bit(BIO_UPTODATE, &bio->bi_flags);
+ if (conf->mirrors[d].rdev == NULL ||
+ test_bit(Faulty, &conf->mirrors[d].rdev->flags))
+ continue;
+ sector = r10_bio->devs[i].addr;
+ if (is_badblock(conf->mirrors[d].rdev,
+ sector, max_sync,
+ &first_bad, &bad_sectors)) {
+ if (first_bad > sector)
+ max_sync = first_bad - sector;
+ else {
+ bad_sectors -= (sector - first_bad);
+ if (max_sync > bad_sectors)
+ max_sync = bad_sectors;
+ continue;
+ }
+ }
+ atomic_inc(&conf->mirrors[d].rdev->nr_pending);
+ atomic_inc(&r10_bio->remaining);
+ bio->bi_next = biolist;
+ biolist = bio;
+ bio->bi_private = r10_bio;
+ bio->bi_end_io = end_sync_read;
+ bio->bi_rw = READ;
+ bio->bi_iter.bi_sector = sector +
+ conf->mirrors[d].rdev->data_offset;
+ bio->bi_bdev = conf->mirrors[d].rdev->bdev;
+ count++;
+
+ if (conf->mirrors[d].replacement == NULL ||
+ test_bit(Faulty,
+ &conf->mirrors[d].replacement->flags))
+ continue;
+
+ /* Need to set up for writing to the replacement */
+ bio = r10_bio->devs[i].repl_bio;
+ bio_reset(bio);
+ clear_bit(BIO_UPTODATE, &bio->bi_flags);
+
+ sector = r10_bio->devs[i].addr;
+ atomic_inc(&conf->mirrors[d].rdev->nr_pending);
+ bio->bi_next = biolist;
+ biolist = bio;
+ bio->bi_private = r10_bio;
+ bio->bi_end_io = end_sync_write;
+ bio->bi_rw = WRITE;
+ bio->bi_iter.bi_sector = sector +
+ conf->mirrors[d].replacement->data_offset;
+ bio->bi_bdev = conf->mirrors[d].replacement->bdev;
+ count++;
+ }
+
+ if (count < 2) {
+ for (i=0; i<conf->copies; i++) {
+ int d = r10_bio->devs[i].devnum;
+ if (r10_bio->devs[i].bio->bi_end_io)
+ rdev_dec_pending(conf->mirrors[d].rdev,
+ mddev);
+ if (r10_bio->devs[i].repl_bio &&
+ r10_bio->devs[i].repl_bio->bi_end_io)
+ rdev_dec_pending(
+ conf->mirrors[d].replacement,
+ mddev);
+ }
+ put_buf(r10_bio);
+ biolist = NULL;
+ goto giveup;
+ }
+ }
+
+ nr_sectors = 0;
+ if (sector_nr + max_sync < max_sector)
+ max_sector = sector_nr + max_sync;
+ do {
+ struct page *page;
+ int len = PAGE_SIZE;
+ if (sector_nr + (len>>9) > max_sector)
+ len = (max_sector - sector_nr) << 9;
+ if (len == 0)
+ break;
+ for (bio= biolist ; bio ; bio=bio->bi_next) {
+ struct bio *bio2;
+ page = bio->bi_io_vec[bio->bi_vcnt].bv_page;
+ if (bio_add_page(bio, page, len, 0))
+ continue;
+
+ /* stop here */
+ bio->bi_io_vec[bio->bi_vcnt].bv_page = page;
+ for (bio2 = biolist;
+ bio2 && bio2 != bio;
+ bio2 = bio2->bi_next) {
+ /* remove last page from this bio */
+ bio2->bi_vcnt--;
+ bio2->bi_iter.bi_size -= len;
+ __clear_bit(BIO_SEG_VALID, &bio2->bi_flags);
+ }
+ goto bio_full;
+ }
+ nr_sectors += len>>9;
+ sector_nr += len>>9;
+ } while (biolist->bi_vcnt < RESYNC_PAGES);
+ bio_full:
+ r10_bio->sectors = nr_sectors;
+
+ while (biolist) {
+ bio = biolist;
+ biolist = biolist->bi_next;
+
+ bio->bi_next = NULL;
+ r10_bio = bio->bi_private;
+ r10_bio->sectors = nr_sectors;
+
+ if (bio->bi_end_io == end_sync_read) {
+ md_sync_acct(bio->bi_bdev, nr_sectors);
+ set_bit(BIO_UPTODATE, &bio->bi_flags);
+ generic_make_request(bio);
+ }
+ }
+
+ if (sectors_skipped)
+ /* pretend they weren't skipped, it makes
+ * no important difference in this case
+ */
+ md_done_sync(mddev, sectors_skipped, 1);
+
+ return sectors_skipped + nr_sectors;
+ giveup:
+ /* There is nowhere to write, so all non-sync
+ * drives must be failed or in resync, all drives
+ * have a bad block, so try the next chunk...
+ */
+ if (sector_nr + max_sync < max_sector)
+ max_sector = sector_nr + max_sync;
+
+ sectors_skipped += (max_sector - sector_nr);
+ chunks_skipped ++;
+ sector_nr = max_sector;
+ goto skipped;
+}
+
+static sector_t
+raid10_size(struct mddev *mddev, sector_t sectors, int raid_disks)
+{
+ sector_t size;
+ struct r10conf *conf = mddev->private;
+
+ if (!raid_disks)
+ raid_disks = min(conf->geo.raid_disks,
+ conf->prev.raid_disks);
+ if (!sectors)
+ sectors = conf->dev_sectors;
+
+ size = sectors >> conf->geo.chunk_shift;
+ sector_div(size, conf->geo.far_copies);
+ size = size * raid_disks;
+ sector_div(size, conf->geo.near_copies);
+
+ return size << conf->geo.chunk_shift;
+}
+
+static void calc_sectors(struct r10conf *conf, sector_t size)
+{
+ /* Calculate the number of sectors-per-device that will
+ * actually be used, and set conf->dev_sectors and
+ * conf->stride
+ */
+
+ size = size >> conf->geo.chunk_shift;
+ sector_div(size, conf->geo.far_copies);
+ size = size * conf->geo.raid_disks;
+ sector_div(size, conf->geo.near_copies);
+ /* 'size' is now the number of chunks in the array */
+ /* calculate "used chunks per device" */
+ size = size * conf->copies;
+
+ /* We need to round up when dividing by raid_disks to
+ * get the stride size.
+ */
+ size = DIV_ROUND_UP_SECTOR_T(size, conf->geo.raid_disks);
+
+ conf->dev_sectors = size << conf->geo.chunk_shift;
+
+ if (conf->geo.far_offset)
+ conf->geo.stride = 1 << conf->geo.chunk_shift;
+ else {
+ sector_div(size, conf->geo.far_copies);
+ conf->geo.stride = size << conf->geo.chunk_shift;
+ }
+}
+
+enum geo_type {geo_new, geo_old, geo_start};
+static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new)
+{
+ int nc, fc, fo;
+ int layout, chunk, disks;
+ switch (new) {
+ case geo_old:
+ layout = mddev->layout;
+ chunk = mddev->chunk_sectors;
+ disks = mddev->raid_disks - mddev->delta_disks;
+ break;
+ case geo_new:
+ layout = mddev->new_layout;
+ chunk = mddev->new_chunk_sectors;
+ disks = mddev->raid_disks;
+ break;
+ default: /* avoid 'may be unused' warnings */
+ case geo_start: /* new when starting reshape - raid_disks not
+ * updated yet. */
+ layout = mddev->new_layout;
+ chunk = mddev->new_chunk_sectors;
+ disks = mddev->raid_disks + mddev->delta_disks;
+ break;
+ }
+ if (layout >> 18)
+ return -1;
+ if (chunk < (PAGE_SIZE >> 9) ||
+ !is_power_of_2(chunk))
+ return -2;
+ nc = layout & 255;
+ fc = (layout >> 8) & 255;
+ fo = layout & (1<<16);
+ geo->raid_disks = disks;
+ geo->near_copies = nc;
+ geo->far_copies = fc;
+ geo->far_offset = fo;
+ geo->far_set_size = (layout & (1<<17)) ? disks / fc : disks;
+ geo->chunk_mask = chunk - 1;
+ geo->chunk_shift = ffz(~chunk);
+ return nc*fc;
+}
+
+static struct r10conf *setup_conf(struct mddev *mddev)
+{
+ struct r10conf *conf = NULL;
+ int err = -EINVAL;
+ struct geom geo;
+ int copies;
+
+ copies = setup_geo(&geo, mddev, geo_new);
+
+ if (copies == -2) {
+ printk(KERN_ERR "md/raid10:%s: chunk size must be "
+ "at least PAGE_SIZE(%ld) and be a power of 2.\n",
+ mdname(mddev), PAGE_SIZE);
+ goto out;
+ }
+
+ if (copies < 2 || copies > mddev->raid_disks) {
+ printk(KERN_ERR "md/raid10:%s: unsupported raid10 layout: 0x%8x\n",
+ mdname(mddev), mddev->new_layout);
+ goto out;
+ }
+
+ err = -ENOMEM;
+ conf = kzalloc(sizeof(struct r10conf), GFP_KERNEL);
+ if (!conf)
+ goto out;
+
+ /* FIXME calc properly */
+ conf->mirrors = kzalloc(sizeof(struct raid10_info)*(mddev->raid_disks +
+ max(0,-mddev->delta_disks)),
+ GFP_KERNEL);
+ if (!conf->mirrors)
+ goto out;
+
+ conf->tmppage = alloc_page(GFP_KERNEL);
+ if (!conf->tmppage)
+ goto out;
+
+ conf->geo = geo;
+ conf->copies = copies;
+ conf->r10bio_pool = mempool_create(NR_RAID10_BIOS, r10bio_pool_alloc,
+ r10bio_pool_free, conf);
+ if (!conf->r10bio_pool)
+ goto out;
+
+ calc_sectors(conf, mddev->dev_sectors);
+ if (mddev->reshape_position == MaxSector) {
+ conf->prev = conf->geo;
+ conf->reshape_progress = MaxSector;
+ } else {
+ if (setup_geo(&conf->prev, mddev, geo_old) != conf->copies) {
+ err = -EINVAL;
+ goto out;
+ }
+ conf->reshape_progress = mddev->reshape_position;
+ if (conf->prev.far_offset)
+ conf->prev.stride = 1 << conf->prev.chunk_shift;
+ else
+ /* far_copies must be 1 */
+ conf->prev.stride = conf->dev_sectors;
+ }
+ spin_lock_init(&conf->device_lock);
+ INIT_LIST_HEAD(&conf->retry_list);
+
+ spin_lock_init(&conf->resync_lock);
+ init_waitqueue_head(&conf->wait_barrier);
+
+ conf->thread = md_register_thread(raid10d, mddev, "raid10");
+ if (!conf->thread)
+ goto out;
+
+ conf->mddev = mddev;
+ return conf;
+
+ out:
+ if (err == -ENOMEM)
+ printk(KERN_ERR "md/raid10:%s: couldn't allocate memory.\n",
+ mdname(mddev));
+ if (conf) {
+ if (conf->r10bio_pool)
+ mempool_destroy(conf->r10bio_pool);
+ kfree(conf->mirrors);
+ safe_put_page(conf->tmppage);
+ kfree(conf);
+ }
+ return ERR_PTR(err);
+}
+
+static int run(struct mddev *mddev)
+{
+ struct r10conf *conf;
+ int i, disk_idx, chunk_size;
+ struct raid10_info *disk;
+ struct md_rdev *rdev;
+ sector_t size;
+ sector_t min_offset_diff = 0;
+ int first = 1;
+ bool discard_supported = false;
+
+ if (mddev->private == NULL) {
+ conf = setup_conf(mddev);
+ if (IS_ERR(conf))
+ return PTR_ERR(conf);
+ mddev->private = conf;
+ }
+ conf = mddev->private;
+ if (!conf)
+ goto out;
+
+ mddev->thread = conf->thread;
+ conf->thread = NULL;
+
+ chunk_size = mddev->chunk_sectors << 9;
+ if (mddev->queue) {
+ blk_queue_max_discard_sectors(mddev->queue,
+ mddev->chunk_sectors);
+ blk_queue_max_write_same_sectors(mddev->queue, 0);
+ blk_queue_io_min(mddev->queue, chunk_size);
+ if (conf->geo.raid_disks % conf->geo.near_copies)
+ blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks);
+ else
+ blk_queue_io_opt(mddev->queue, chunk_size *
+ (conf->geo.raid_disks / conf->geo.near_copies));
+ }
+
+ rdev_for_each(rdev, mddev) {
+ long long diff;
+ struct request_queue *q;
+
+ disk_idx = rdev->raid_disk;
+ if (disk_idx < 0)
+ continue;
+ if (disk_idx >= conf->geo.raid_disks &&
+ disk_idx >= conf->prev.raid_disks)
+ continue;
+ disk = conf->mirrors + disk_idx;
+
+ if (test_bit(Replacement, &rdev->flags)) {
+ if (disk->replacement)
+ goto out_free_conf;
+ disk->replacement = rdev;
+ } else {
+ if (disk->rdev)
+ goto out_free_conf;
+ disk->rdev = rdev;
+ }
+ q = bdev_get_queue(rdev->bdev);
+ if (q->merge_bvec_fn)
+ mddev->merge_check_needed = 1;
+ diff = (rdev->new_data_offset - rdev->data_offset);
+ if (!mddev->reshape_backwards)
+ diff = -diff;
+ if (diff < 0)
+ diff = 0;
+ if (first || diff < min_offset_diff)
+ min_offset_diff = diff;
+
+ if (mddev->gendisk)
+ disk_stack_limits(mddev->gendisk, rdev->bdev,
+ rdev->data_offset << 9);
+
+ disk->head_position = 0;
+
+ if (blk_queue_discard(bdev_get_queue(rdev->bdev)))
+ discard_supported = true;
+ }
+
+ if (mddev->queue) {
+ if (discard_supported)
+ queue_flag_set_unlocked(QUEUE_FLAG_DISCARD,
+ mddev->queue);
+ else
+ queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD,
+ mddev->queue);
+ }
+ /* need to check that every block has at least one working mirror */
+ if (!enough(conf, -1)) {
+ printk(KERN_ERR "md/raid10:%s: not enough operational mirrors.\n",
+ mdname(mddev));
+ goto out_free_conf;
+ }
+
+ if (conf->reshape_progress != MaxSector) {
+ /* must ensure that shape change is supported */
+ if (conf->geo.far_copies != 1 &&
+ conf->geo.far_offset == 0)
+ goto out_free_conf;
+ if (conf->prev.far_copies != 1 &&
+ conf->prev.far_offset == 0)
+ goto out_free_conf;
+ }
+
+ mddev->degraded = 0;
+ for (i = 0;
+ i < conf->geo.raid_disks
+ || i < conf->prev.raid_disks;
+ i++) {
+
+ disk = conf->mirrors + i;
+
+ if (!disk->rdev && disk->replacement) {
+ /* The replacement is all we have - use it */
+ disk->rdev = disk->replacement;
+ disk->replacement = NULL;
+ clear_bit(Replacement, &disk->rdev->flags);
+ }
+
+ if (!disk->rdev ||
+ !test_bit(In_sync, &disk->rdev->flags)) {
+ disk->head_position = 0;
+ mddev->degraded++;
+ if (disk->rdev &&
+ disk->rdev->saved_raid_disk < 0)
+ conf->fullsync = 1;
+ }
+ disk->recovery_disabled = mddev->recovery_disabled - 1;
+ }
+
+ if (mddev->recovery_cp != MaxSector)
+ printk(KERN_NOTICE "md/raid10:%s: not clean"
+ " -- starting background reconstruction\n",
+ mdname(mddev));
+ printk(KERN_INFO
+ "md/raid10:%s: active with %d out of %d devices\n",
+ mdname(mddev), conf->geo.raid_disks - mddev->degraded,
+ conf->geo.raid_disks);
+ /*
+ * Ok, everything is just fine now
+ */
+ mddev->dev_sectors = conf->dev_sectors;
+ size = raid10_size(mddev, 0, 0);
+ md_set_array_sectors(mddev, size);
+ mddev->resync_max_sectors = size;
+
+ if (mddev->queue) {
+ int stripe = conf->geo.raid_disks *
+ ((mddev->chunk_sectors << 9) / PAGE_SIZE);
+
+ /* Calculate max read-ahead size.
+ * We need to readahead at least twice a whole stripe....
+ * maybe...
+ */
+ stripe /= conf->geo.near_copies;
+ if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
+ mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
+ }
+
+ if (md_integrity_register(mddev))
+ goto out_free_conf;
+
+ if (conf->reshape_progress != MaxSector) {
+ unsigned long before_length, after_length;
+
+ before_length = ((1 << conf->prev.chunk_shift) *
+ conf->prev.far_copies);
+ after_length = ((1 << conf->geo.chunk_shift) *
+ conf->geo.far_copies);
+
+ if (max(before_length, after_length) > min_offset_diff) {
+ /* This cannot work */
+ printk("md/raid10: offset difference not enough to continue reshape\n");
+ goto out_free_conf;
+ }
+ conf->offset_diff = min_offset_diff;
+
+ conf->reshape_safe = conf->reshape_progress;
+ clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
+ clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
+ set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
+ set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
+ mddev->sync_thread = md_register_thread(md_do_sync, mddev,
+ "reshape");
+ }
+
+ return 0;
+
+out_free_conf:
+ md_unregister_thread(&mddev->thread);
+ if (conf->r10bio_pool)
+ mempool_destroy(conf->r10bio_pool);
+ safe_put_page(conf->tmppage);
+ kfree(conf->mirrors);
+ kfree(conf);
+ mddev->private = NULL;
+out:
+ return -EIO;
+}
+
+static void raid10_free(struct mddev *mddev, void *priv)
+{
+ struct r10conf *conf = priv;
+
+ if (conf->r10bio_pool)
+ mempool_destroy(conf->r10bio_pool);
+ safe_put_page(conf->tmppage);
+ kfree(conf->mirrors);
+ kfree(conf->mirrors_old);
+ kfree(conf->mirrors_new);
+ kfree(conf);
+}
+
+static void raid10_quiesce(struct mddev *mddev, int state)
+{
+ struct r10conf *conf = mddev->private;
+
+ switch(state) {
+ case 1:
+ raise_barrier(conf, 0);
+ break;
+ case 0:
+ lower_barrier(conf);
+ break;
+ }
+}
+
+static int raid10_resize(struct mddev *mddev, sector_t sectors)
+{
+ /* Resize of 'far' arrays is not supported.
+ * For 'near' and 'offset' arrays we can set the
+ * number of sectors used to be an appropriate multiple
+ * of the chunk size.
+ * For 'offset', this is far_copies*chunksize.
+ * For 'near' the multiplier is the LCM of
+ * near_copies and raid_disks.
+ * So if far_copies > 1 && !far_offset, fail.
+ * Else find LCM(raid_disks, near_copy)*far_copies and
+ * multiply by chunk_size. Then round to this number.
+ * This is mostly done by raid10_size()
+ */
+ struct r10conf *conf = mddev->private;
+ sector_t oldsize, size;
+
+ if (mddev->reshape_position != MaxSector)
+ return -EBUSY;
+
+ if (conf->geo.far_copies > 1 && !conf->geo.far_offset)
+ return -EINVAL;
+
+ oldsize = raid10_size(mddev, 0, 0);
+ size = raid10_size(mddev, sectors, 0);
+ if (mddev->external_size &&
+ mddev->array_sectors > size)
+ return -EINVAL;
+ if (mddev->bitmap) {
+ int ret = bitmap_resize(mddev->bitmap, size, 0, 0);
+ if (ret)
+ return ret;
+ }
+ md_set_array_sectors(mddev, size);
+ set_capacity(mddev->gendisk, mddev->array_sectors);
+ revalidate_disk(mddev->gendisk);
+ if (sectors > mddev->dev_sectors &&
+ mddev->recovery_cp > oldsize) {
+ mddev->recovery_cp = oldsize;
+ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+ }
+ calc_sectors(conf, sectors);
+ mddev->dev_sectors = conf->dev_sectors;
+ mddev->resync_max_sectors = size;
+ return 0;
+}
+
+static void *raid10_takeover_raid0(struct mddev *mddev, sector_t size, int devs)
+{
+ struct md_rdev *rdev;
+ struct r10conf *conf;
+
+ if (mddev->degraded > 0) {
+ printk(KERN_ERR "md/raid10:%s: Error: degraded raid0!\n",
+ mdname(mddev));
+ return ERR_PTR(-EINVAL);
+ }
+ sector_div(size, devs);
+
+ /* Set new parameters */
+ mddev->new_level = 10;
+ /* new layout: far_copies = 1, near_copies = 2 */
+ mddev->new_layout = (1<<8) + 2;
+ mddev->new_chunk_sectors = mddev->chunk_sectors;
+ mddev->delta_disks = mddev->raid_disks;
+ mddev->raid_disks *= 2;
+ /* make sure it will be not marked as dirty */
+ mddev->recovery_cp = MaxSector;
+ mddev->dev_sectors = size;
+
+ conf = setup_conf(mddev);
+ if (!IS_ERR(conf)) {
+ rdev_for_each(rdev, mddev)
+ if (rdev->raid_disk >= 0) {
+ rdev->new_raid_disk = rdev->raid_disk * 2;
+ rdev->sectors = size;
+ }
+ conf->barrier = 1;
+ }
+
+ return conf;
+}
+
+static void *raid10_takeover(struct mddev *mddev)
+{
+ struct r0conf *raid0_conf;
+
+ /* raid10 can take over:
+ * raid0 - providing it has only two drives
+ */
+ if (mddev->level == 0) {
+ /* for raid0 takeover only one zone is supported */
+ raid0_conf = mddev->private;
+ if (raid0_conf->nr_strip_zones > 1) {
+ printk(KERN_ERR "md/raid10:%s: cannot takeover raid 0"
+ " with more than one zone.\n",
+ mdname(mddev));
+ return ERR_PTR(-EINVAL);
+ }
+ return raid10_takeover_raid0(mddev,
+ raid0_conf->strip_zone->zone_end,
+ raid0_conf->strip_zone->nb_dev);
+ }
+ return ERR_PTR(-EINVAL);
+}
+
+static int raid10_check_reshape(struct mddev *mddev)
+{
+ /* Called when there is a request to change
+ * - layout (to ->new_layout)
+ * - chunk size (to ->new_chunk_sectors)
+ * - raid_disks (by delta_disks)
+ * or when trying to restart a reshape that was ongoing.
+ *
+ * We need to validate the request and possibly allocate
+ * space if that might be an issue later.
+ *
+ * Currently we reject any reshape of a 'far' mode array,
+ * allow chunk size to change if new is generally acceptable,
+ * allow raid_disks to increase, and allow
+ * a switch between 'near' mode and 'offset' mode.
+ */
+ struct r10conf *conf = mddev->private;
+ struct geom geo;
+
+ if (conf->geo.far_copies != 1 && !conf->geo.far_offset)
+ return -EINVAL;
+
+ if (setup_geo(&geo, mddev, geo_start) != conf->copies)
+ /* mustn't change number of copies */
+ return -EINVAL;
+ if (geo.far_copies > 1 && !geo.far_offset)
+ /* Cannot switch to 'far' mode */
+ return -EINVAL;
+
+ if (mddev->array_sectors & geo.chunk_mask)
+ /* not factor of array size */
+ return -EINVAL;
+
+ if (!enough(conf, -1))
+ return -EINVAL;
+
+ kfree(conf->mirrors_new);
+ conf->mirrors_new = NULL;
+ if (mddev->delta_disks > 0) {
+ /* allocate new 'mirrors' list */
+ conf->mirrors_new = kzalloc(
+ sizeof(struct raid10_info)
+ *(mddev->raid_disks +
+ mddev->delta_disks),
+ GFP_KERNEL);
+ if (!conf->mirrors_new)
+ return -ENOMEM;
+ }
+ return 0;
+}
+
+/*
+ * Need to check if array has failed when deciding whether to:
+ * - start an array
+ * - remove non-faulty devices
+ * - add a spare
+ * - allow a reshape
+ * This determination is simple when no reshape is happening.
+ * However if there is a reshape, we need to carefully check
+ * both the before and after sections.
+ * This is because some failed devices may only affect one
+ * of the two sections, and some non-in_sync devices may
+ * be insync in the section most affected by failed devices.
+ */
+static int calc_degraded(struct r10conf *conf)
+{
+ int degraded, degraded2;
+ int i;
+
+ rcu_read_lock();
+ degraded = 0;
+ /* 'prev' section first */
+ for (i = 0; i < conf->prev.raid_disks; i++) {
+ struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
+ if (!rdev || test_bit(Faulty, &rdev->flags))
+ degraded++;
+ else if (!test_bit(In_sync, &rdev->flags))
+ /* When we can reduce the number of devices in
+ * an array, this might not contribute to
+ * 'degraded'. It does now.
+ */
+ degraded++;
+ }
+ rcu_read_unlock();
+ if (conf->geo.raid_disks == conf->prev.raid_disks)
+ return degraded;
+ rcu_read_lock();
+ degraded2 = 0;
+ for (i = 0; i < conf->geo.raid_disks; i++) {
+ struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
+ if (!rdev || test_bit(Faulty, &rdev->flags))
+ degraded2++;
+ else if (!test_bit(In_sync, &rdev->flags)) {
+ /* If reshape is increasing the number of devices,
+ * this section has already been recovered, so
+ * it doesn't contribute to degraded.
+ * else it does.
+ */
+ if (conf->geo.raid_disks <= conf->prev.raid_disks)
+ degraded2++;
+ }
+ }
+ rcu_read_unlock();
+ if (degraded2 > degraded)
+ return degraded2;
+ return degraded;
+}
+
+static int raid10_start_reshape(struct mddev *mddev)
+{
+ /* A 'reshape' has been requested. This commits
+ * the various 'new' fields and sets MD_RECOVER_RESHAPE
+ * This also checks if there are enough spares and adds them
+ * to the array.
+ * We currently require enough spares to make the final
+ * array non-degraded. We also require that the difference
+ * between old and new data_offset - on each device - is
+ * enough that we never risk over-writing.
+ */
+
+ unsigned long before_length, after_length;
+ sector_t min_offset_diff = 0;
+ int first = 1;
+ struct geom new;
+ struct r10conf *conf = mddev->private;
+ struct md_rdev *rdev;
+ int spares = 0;
+ int ret;
+
+ if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
+ return -EBUSY;
+
+ if (setup_geo(&new, mddev, geo_start) != conf->copies)
+ return -EINVAL;
+
+ before_length = ((1 << conf->prev.chunk_shift) *
+ conf->prev.far_copies);
+ after_length = ((1 << conf->geo.chunk_shift) *
+ conf->geo.far_copies);
+
+ rdev_for_each(rdev, mddev) {
+ if (!test_bit(In_sync, &rdev->flags)
+ && !test_bit(Faulty, &rdev->flags))
+ spares++;
+ if (rdev->raid_disk >= 0) {
+ long long diff = (rdev->new_data_offset
+ - rdev->data_offset);
+ if (!mddev->reshape_backwards)
+ diff = -diff;
+ if (diff < 0)
+ diff = 0;
+ if (first || diff < min_offset_diff)
+ min_offset_diff = diff;
+ }
+ }
+
+ if (max(before_length, after_length) > min_offset_diff)
+ return -EINVAL;
+
+ if (spares < mddev->delta_disks)
+ return -EINVAL;
+
+ conf->offset_diff = min_offset_diff;
+ spin_lock_irq(&conf->device_lock);
+ if (conf->mirrors_new) {
+ memcpy(conf->mirrors_new, conf->mirrors,
+ sizeof(struct raid10_info)*conf->prev.raid_disks);
+ smp_mb();
+ kfree(conf->mirrors_old);
+ conf->mirrors_old = conf->mirrors;
+ conf->mirrors = conf->mirrors_new;
+ conf->mirrors_new = NULL;
+ }
+ setup_geo(&conf->geo, mddev, geo_start);
+ smp_mb();
+ if (mddev->reshape_backwards) {
+ sector_t size = raid10_size(mddev, 0, 0);
+ if (size < mddev->array_sectors) {
+ spin_unlock_irq(&conf->device_lock);
+ printk(KERN_ERR "md/raid10:%s: array size must be reduce before number of disks\n",
+ mdname(mddev));
+ return -EINVAL;
+ }
+ mddev->resync_max_sectors = size;
+ conf->reshape_progress = size;
+ } else
+ conf->reshape_progress = 0;
+ spin_unlock_irq(&conf->device_lock);
+
+ if (mddev->delta_disks && mddev->bitmap) {
+ ret = bitmap_resize(mddev->bitmap,
+ raid10_size(mddev, 0,
+ conf->geo.raid_disks),
+ 0, 0);
+ if (ret)
+ goto abort;
+ }
+ if (mddev->delta_disks > 0) {
+ rdev_for_each(rdev, mddev)
+ if (rdev->raid_disk < 0 &&
+ !test_bit(Faulty, &rdev->flags)) {
+ if (raid10_add_disk(mddev, rdev) == 0) {
+ if (rdev->raid_disk >=
+ conf->prev.raid_disks)
+ set_bit(In_sync, &rdev->flags);
+ else
+ rdev->recovery_offset = 0;
+
+ if (sysfs_link_rdev(mddev, rdev))
+ /* Failure here is OK */;
+ }
+ } else if (rdev->raid_disk >= conf->prev.raid_disks
+ && !test_bit(Faulty, &rdev->flags)) {
+ /* This is a spare that was manually added */
+ set_bit(In_sync, &rdev->flags);
+ }
+ }
+ /* When a reshape changes the number of devices,
+ * ->degraded is measured against the larger of the
+ * pre and post numbers.
+ */
+ spin_lock_irq(&conf->device_lock);
+ mddev->degraded = calc_degraded(conf);
+ spin_unlock_irq(&conf->device_lock);
+ mddev->raid_disks = conf->geo.raid_disks;
+ mddev->reshape_position = conf->reshape_progress;
+ set_bit(MD_CHANGE_DEVS, &mddev->flags);
+
+ clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
+ clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
+ clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
+ set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
+ set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
+
+ mddev->sync_thread = md_register_thread(md_do_sync, mddev,
+ "reshape");
+ if (!mddev->sync_thread) {
+ ret = -EAGAIN;
+ goto abort;
+ }
+ conf->reshape_checkpoint = jiffies;
+ md_wakeup_thread(mddev->sync_thread);
+ md_new_event(mddev);
+ return 0;
+
+abort:
+ mddev->recovery = 0;
+ spin_lock_irq(&conf->device_lock);
+ conf->geo = conf->prev;
+ mddev->raid_disks = conf->geo.raid_disks;
+ rdev_for_each(rdev, mddev)
+ rdev->new_data_offset = rdev->data_offset;
+ smp_wmb();
+ conf->reshape_progress = MaxSector;
+ mddev->reshape_position = MaxSector;
+ spin_unlock_irq(&conf->device_lock);
+ return ret;
+}
+
+/* Calculate the last device-address that could contain
+ * any block from the chunk that includes the array-address 's'
+ * and report the next address.
+ * i.e. the address returned will be chunk-aligned and after
+ * any data that is in the chunk containing 's'.
+ */
+static sector_t last_dev_address(sector_t s, struct geom *geo)
+{
+ s = (s | geo->chunk_mask) + 1;
+ s >>= geo->chunk_shift;
+ s *= geo->near_copies;
+ s = DIV_ROUND_UP_SECTOR_T(s, geo->raid_disks);
+ s *= geo->far_copies;
+ s <<= geo->chunk_shift;
+ return s;
+}
+
+/* Calculate the first device-address that could contain
+ * any block from the chunk that includes the array-address 's'.
+ * This too will be the start of a chunk
+ */
+static sector_t first_dev_address(sector_t s, struct geom *geo)
+{
+ s >>= geo->chunk_shift;
+ s *= geo->near_copies;
+ sector_div(s, geo->raid_disks);
+ s *= geo->far_copies;
+ s <<= geo->chunk_shift;
+ return s;
+}
+
+static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
+ int *skipped)
+{
+ /* We simply copy at most one chunk (smallest of old and new)
+ * at a time, possibly less if that exceeds RESYNC_PAGES,
+ * or we hit a bad block or something.
+ * This might mean we pause for normal IO in the middle of
+ * a chunk, but that is not a problem was mddev->reshape_position
+ * can record any location.
+ *
+ * If we will want to write to a location that isn't
+ * yet recorded as 'safe' (i.e. in metadata on disk) then
+ * we need to flush all reshape requests and update the metadata.
+ *
+ * When reshaping forwards (e.g. to more devices), we interpret
+ * 'safe' as the earliest block which might not have been copied
+ * down yet. We divide this by previous stripe size and multiply
+ * by previous stripe length to get lowest device offset that we
+ * cannot write to yet.
+ * We interpret 'sector_nr' as an address that we want to write to.
+ * From this we use last_device_address() to find where we might
+ * write to, and first_device_address on the 'safe' position.
+ * If this 'next' write position is after the 'safe' position,
+ * we must update the metadata to increase the 'safe' position.
+ *
+ * When reshaping backwards, we round in the opposite direction
+ * and perform the reverse test: next write position must not be
+ * less than current safe position.
+ *
+ * In all this the minimum difference in data offsets
+ * (conf->offset_diff - always positive) allows a bit of slack,
+ * so next can be after 'safe', but not by more than offset_disk
+ *
+ * We need to prepare all the bios here before we start any IO
+ * to ensure the size we choose is acceptable to all devices.
+ * The means one for each copy for write-out and an extra one for
+ * read-in.
+ * We store the read-in bio in ->master_bio and the others in
+ * ->devs[x].bio and ->devs[x].repl_bio.
+ */
+ struct r10conf *conf = mddev->private;
+ struct r10bio *r10_bio;
+ sector_t next, safe, last;
+ int max_sectors;
+ int nr_sectors;
+ int s;
+ struct md_rdev *rdev;
+ int need_flush = 0;
+ struct bio *blist;
+ struct bio *bio, *read_bio;
+ int sectors_done = 0;
+
+ if (sector_nr == 0) {
+ /* If restarting in the middle, skip the initial sectors */
+ if (mddev->reshape_backwards &&
+ conf->reshape_progress < raid10_size(mddev, 0, 0)) {
+ sector_nr = (raid10_size(mddev, 0, 0)
+ - conf->reshape_progress);
+ } else if (!mddev->reshape_backwards &&
+ conf->reshape_progress > 0)
+ sector_nr = conf->reshape_progress;
+ if (sector_nr) {
+ mddev->curr_resync_completed = sector_nr;
+ sysfs_notify(&mddev->kobj, NULL, "sync_completed");
+ *skipped = 1;
+ return sector_nr;
+ }
+ }
+
+ /* We don't use sector_nr to track where we are up to
+ * as that doesn't work well for ->reshape_backwards.
+ * So just use ->reshape_progress.
+ */
+ if (mddev->reshape_backwards) {
+ /* 'next' is the earliest device address that we might
+ * write to for this chunk in the new layout
+ */
+ next = first_dev_address(conf->reshape_progress - 1,
+ &conf->geo);
+
+ /* 'safe' is the last device address that we might read from
+ * in the old layout after a restart
+ */
+ safe = last_dev_address(conf->reshape_safe - 1,
+ &conf->prev);
+
+ if (next + conf->offset_diff < safe)
+ need_flush = 1;
+
+ last = conf->reshape_progress - 1;
+ sector_nr = last & ~(sector_t)(conf->geo.chunk_mask
+ & conf->prev.chunk_mask);
+ if (sector_nr + RESYNC_BLOCK_SIZE/512 < last)
+ sector_nr = last + 1 - RESYNC_BLOCK_SIZE/512;
+ } else {
+ /* 'next' is after the last device address that we
+ * might write to for this chunk in the new layout
+ */
+ next = last_dev_address(conf->reshape_progress, &conf->geo);
+
+ /* 'safe' is the earliest device address that we might
+ * read from in the old layout after a restart
+ */
+ safe = first_dev_address(conf->reshape_safe, &conf->prev);
+
+ /* Need to update metadata if 'next' might be beyond 'safe'
+ * as that would possibly corrupt data
+ */
+ if (next > safe + conf->offset_diff)
+ need_flush = 1;
+
+ sector_nr = conf->reshape_progress;
+ last = sector_nr | (conf->geo.chunk_mask
+ & conf->prev.chunk_mask);
+
+ if (sector_nr + RESYNC_BLOCK_SIZE/512 <= last)
+ last = sector_nr + RESYNC_BLOCK_SIZE/512 - 1;
+ }
+
+ if (need_flush ||
+ time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) {
+ /* Need to update reshape_position in metadata */
+ wait_barrier(conf);
+ mddev->reshape_position = conf->reshape_progress;
+ if (mddev->reshape_backwards)
+ mddev->curr_resync_completed = raid10_size(mddev, 0, 0)
+ - conf->reshape_progress;
+ else
+ mddev->curr_resync_completed = conf->reshape_progress;
+ conf->reshape_checkpoint = jiffies;
+ set_bit(MD_CHANGE_DEVS, &mddev->flags);
+ md_wakeup_thread(mddev->thread);
+ wait_event(mddev->sb_wait, mddev->flags == 0 ||
+ test_bit(MD_RECOVERY_INTR, &mddev->recovery));
+ if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
+ allow_barrier(conf);
+ return sectors_done;
+ }
+ conf->reshape_safe = mddev->reshape_position;
+ allow_barrier(conf);
+ }
+
+read_more:
+ /* Now schedule reads for blocks from sector_nr to last */
+ r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
+ r10_bio->state = 0;
+ raise_barrier(conf, sectors_done != 0);
+ atomic_set(&r10_bio->remaining, 0);
+ r10_bio->mddev = mddev;
+ r10_bio->sector = sector_nr;
+ set_bit(R10BIO_IsReshape, &r10_bio->state);
+ r10_bio->sectors = last - sector_nr + 1;
+ rdev = read_balance(conf, r10_bio, &max_sectors);
+ BUG_ON(!test_bit(R10BIO_Previous, &r10_bio->state));
+
+ if (!rdev) {
+ /* Cannot read from here, so need to record bad blocks
+ * on all the target devices.
+ */
+ // FIXME
+ mempool_free(r10_bio, conf->r10buf_pool);
+ set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+ return sectors_done;
+ }
+
+ read_bio = bio_alloc_mddev(GFP_KERNEL, RESYNC_PAGES, mddev);
+
+ read_bio->bi_bdev = rdev->bdev;
+ read_bio->bi_iter.bi_sector = (r10_bio->devs[r10_bio->read_slot].addr
+ + rdev->data_offset);
+ read_bio->bi_private = r10_bio;
+ read_bio->bi_end_io = end_sync_read;
+ read_bio->bi_rw = READ;
+ read_bio->bi_flags &= (~0UL << BIO_RESET_BITS);
+ __set_bit(BIO_UPTODATE, &read_bio->bi_flags);
+ read_bio->bi_vcnt = 0;
+ read_bio->bi_iter.bi_size = 0;
+ r10_bio->master_bio = read_bio;
+ r10_bio->read_slot = r10_bio->devs[r10_bio->read_slot].devnum;
+
+ /* Now find the locations in the new layout */
+ __raid10_find_phys(&conf->geo, r10_bio);
+
+ blist = read_bio;
+ read_bio->bi_next = NULL;
+
+ for (s = 0; s < conf->copies*2; s++) {
+ struct bio *b;
+ int d = r10_bio->devs[s/2].devnum;
+ struct md_rdev *rdev2;
+ if (s&1) {
+ rdev2 = conf->mirrors[d].replacement;
+ b = r10_bio->devs[s/2].repl_bio;
+ } else {
+ rdev2 = conf->mirrors[d].rdev;
+ b = r10_bio->devs[s/2].bio;
+ }
+ if (!rdev2 || test_bit(Faulty, &rdev2->flags))
+ continue;
+
+ bio_reset(b);
+ b->bi_bdev = rdev2->bdev;
+ b->bi_iter.bi_sector = r10_bio->devs[s/2].addr +
+ rdev2->new_data_offset;
+ b->bi_private = r10_bio;
+ b->bi_end_io = end_reshape_write;
+ b->bi_rw = WRITE;
+ b->bi_next = blist;
+ blist = b;
+ }
+
+ /* Now add as many pages as possible to all of these bios. */
+
+ nr_sectors = 0;
+ for (s = 0 ; s < max_sectors; s += PAGE_SIZE >> 9) {
+ struct page *page = r10_bio->devs[0].bio->bi_io_vec[s/(PAGE_SIZE>>9)].bv_page;
+ int len = (max_sectors - s) << 9;
+ if (len > PAGE_SIZE)
+ len = PAGE_SIZE;
+ for (bio = blist; bio ; bio = bio->bi_next) {
+ struct bio *bio2;
+ if (bio_add_page(bio, page, len, 0))
+ continue;
+
+ /* Didn't fit, must stop */
+ for (bio2 = blist;
+ bio2 && bio2 != bio;
+ bio2 = bio2->bi_next) {
+ /* Remove last page from this bio */
+ bio2->bi_vcnt--;
+ bio2->bi_iter.bi_size -= len;
+ __clear_bit(BIO_SEG_VALID, &bio2->bi_flags);
+ }
+ goto bio_full;
+ }
+ sector_nr += len >> 9;
+ nr_sectors += len >> 9;
+ }
+bio_full:
+ r10_bio->sectors = nr_sectors;
+
+ /* Now submit the read */
+ md_sync_acct(read_bio->bi_bdev, r10_bio->sectors);
+ atomic_inc(&r10_bio->remaining);
+ read_bio->bi_next = NULL;
+ generic_make_request(read_bio);
+ sector_nr += nr_sectors;
+ sectors_done += nr_sectors;
+ if (sector_nr <= last)
+ goto read_more;
+
+ /* Now that we have done the whole section we can
+ * update reshape_progress
+ */
+ if (mddev->reshape_backwards)
+ conf->reshape_progress -= sectors_done;
+ else
+ conf->reshape_progress += sectors_done;
+
+ return sectors_done;
+}
+
+static void end_reshape_request(struct r10bio *r10_bio);
+static int handle_reshape_read_error(struct mddev *mddev,
+ struct r10bio *r10_bio);
+static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio)
+{
+ /* Reshape read completed. Hopefully we have a block
+ * to write out.
+ * If we got a read error then we do sync 1-page reads from
+ * elsewhere until we find the data - or give up.
+ */
+ struct r10conf *conf = mddev->private;
+ int s;
+
+ if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
+ if (handle_reshape_read_error(mddev, r10_bio) < 0) {
+ /* Reshape has been aborted */
+ md_done_sync(mddev, r10_bio->sectors, 0);
+ return;
+ }
+
+ /* We definitely have the data in the pages, schedule the
+ * writes.
+ */
+ atomic_set(&r10_bio->remaining, 1);
+ for (s = 0; s < conf->copies*2; s++) {
+ struct bio *b;
+ int d = r10_bio->devs[s/2].devnum;
+ struct md_rdev *rdev;
+ if (s&1) {
+ rdev = conf->mirrors[d].replacement;
+ b = r10_bio->devs[s/2].repl_bio;
+ } else {
+ rdev = conf->mirrors[d].rdev;
+ b = r10_bio->devs[s/2].bio;
+ }
+ if (!rdev || test_bit(Faulty, &rdev->flags))
+ continue;
+ atomic_inc(&rdev->nr_pending);
+ md_sync_acct(b->bi_bdev, r10_bio->sectors);
+ atomic_inc(&r10_bio->remaining);
+ b->bi_next = NULL;
+ generic_make_request(b);
+ }
+ end_reshape_request(r10_bio);
+}
+
+static void end_reshape(struct r10conf *conf)
+{
+ if (test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery))
+ return;
+
+ spin_lock_irq(&conf->device_lock);
+ conf->prev = conf->geo;
+ md_finish_reshape(conf->mddev);
+ smp_wmb();
+ conf->reshape_progress = MaxSector;
+ spin_unlock_irq(&conf->device_lock);
+
+ /* read-ahead size must cover two whole stripes, which is
+ * 2 * (datadisks) * chunksize where 'n' is the number of raid devices
+ */
+ if (conf->mddev->queue) {
+ int stripe = conf->geo.raid_disks *
+ ((conf->mddev->chunk_sectors << 9) / PAGE_SIZE);
+ stripe /= conf->geo.near_copies;
+ if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
+ conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
+ }
+ conf->fullsync = 0;
+}
+
+static int handle_reshape_read_error(struct mddev *mddev,
+ struct r10bio *r10_bio)
+{
+ /* Use sync reads to get the blocks from somewhere else */
+ int sectors = r10_bio->sectors;
+ struct r10conf *conf = mddev->private;
+ struct {
+ struct r10bio r10_bio;
+ struct r10dev devs[conf->copies];
+ } on_stack;
+ struct r10bio *r10b = &on_stack.r10_bio;
+ int slot = 0;
+ int idx = 0;
+ struct bio_vec *bvec = r10_bio->master_bio->bi_io_vec;
+
+ r10b->sector = r10_bio->sector;
+ __raid10_find_phys(&conf->prev, r10b);
+
+ while (sectors) {
+ int s = sectors;
+ int success = 0;
+ int first_slot = slot;
+
+ if (s > (PAGE_SIZE >> 9))
+ s = PAGE_SIZE >> 9;
+
+ while (!success) {
+ int d = r10b->devs[slot].devnum;
+ struct md_rdev *rdev = conf->mirrors[d].rdev;
+ sector_t addr;
+ if (rdev == NULL ||
+ test_bit(Faulty, &rdev->flags) ||
+ !test_bit(In_sync, &rdev->flags))
+ goto failed;
+
+ addr = r10b->devs[slot].addr + idx * PAGE_SIZE;
+ success = sync_page_io(rdev,
+ addr,
+ s << 9,
+ bvec[idx].bv_page,
+ READ, false);
+ if (success)
+ break;
+ failed:
+ slot++;
+ if (slot >= conf->copies)
+ slot = 0;
+ if (slot == first_slot)
+ break;
+ }
+ if (!success) {
+ /* couldn't read this block, must give up */
+ set_bit(MD_RECOVERY_INTR,
+ &mddev->recovery);
+ return -EIO;
+ }
+ sectors -= s;
+ idx++;
+ }
+ return 0;
+}
+
+static void end_reshape_write(struct bio *bio, int error)
+{
+ int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+ struct r10bio *r10_bio = bio->bi_private;
+ struct mddev *mddev = r10_bio->mddev;
+ struct r10conf *conf = mddev->private;
+ int d;
+ int slot;
+ int repl;
+ struct md_rdev *rdev = NULL;
+
+ d = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
+ if (repl)
+ rdev = conf->mirrors[d].replacement;
+ if (!rdev) {
+ smp_mb();
+ rdev = conf->mirrors[d].rdev;
+ }
+
+ if (!uptodate) {
+ /* FIXME should record badblock */
+ md_error(mddev, rdev);
+ }
+
+ rdev_dec_pending(rdev, mddev);
+ end_reshape_request(r10_bio);
+}
+
+static void end_reshape_request(struct r10bio *r10_bio)
+{
+ if (!atomic_dec_and_test(&r10_bio->remaining))
+ return;
+ md_done_sync(r10_bio->mddev, r10_bio->sectors, 1);
+ bio_put(r10_bio->master_bio);
+ put_buf(r10_bio);
+}
+
+static void raid10_finish_reshape(struct mddev *mddev)
+{
+ struct r10conf *conf = mddev->private;
+
+ if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
+ return;
+
+ if (mddev->delta_disks > 0) {
+ sector_t size = raid10_size(mddev, 0, 0);
+ md_set_array_sectors(mddev, size);
+ if (mddev->recovery_cp > mddev->resync_max_sectors) {
+ mddev->recovery_cp = mddev->resync_max_sectors;
+ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+ }
+ mddev->resync_max_sectors = size;
+ set_capacity(mddev->gendisk, mddev->array_sectors);
+ revalidate_disk(mddev->gendisk);
+ } else {
+ int d;
+ for (d = conf->geo.raid_disks ;
+ d < conf->geo.raid_disks - mddev->delta_disks;
+ d++) {
+ struct md_rdev *rdev = conf->mirrors[d].rdev;
+ if (rdev)
+ clear_bit(In_sync, &rdev->flags);
+ rdev = conf->mirrors[d].replacement;
+ if (rdev)
+ clear_bit(In_sync, &rdev->flags);
+ }
+ }
+ mddev->layout = mddev->new_layout;
+ mddev->chunk_sectors = 1 << conf->geo.chunk_shift;
+ mddev->reshape_position = MaxSector;
+ mddev->delta_disks = 0;
+ mddev->reshape_backwards = 0;
+}
+
+static struct md_personality raid10_personality =
+{
+ .name = "raid10",
+ .level = 10,
+ .owner = THIS_MODULE,
+ .make_request = make_request,
+ .run = run,
+ .free = raid10_free,
+ .status = status,
+ .error_handler = error,
+ .hot_add_disk = raid10_add_disk,
+ .hot_remove_disk= raid10_remove_disk,
+ .spare_active = raid10_spare_active,
+ .sync_request = sync_request,
+ .quiesce = raid10_quiesce,
+ .size = raid10_size,
+ .resize = raid10_resize,
+ .takeover = raid10_takeover,
+ .check_reshape = raid10_check_reshape,
+ .start_reshape = raid10_start_reshape,
+ .finish_reshape = raid10_finish_reshape,
+ .congested = raid10_congested,
+ .mergeable_bvec = raid10_mergeable_bvec,
+};
+
+static int __init raid_init(void)
+{
+ return register_md_personality(&raid10_personality);
+}
+
+static void raid_exit(void)
+{
+ unregister_md_personality(&raid10_personality);
+}
+
+module_init(raid_init);
+module_exit(raid_exit);
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("RAID10 (striped mirror) personality for MD");
+MODULE_ALIAS("md-personality-9"); /* RAID10 */
+MODULE_ALIAS("md-raid10");
+MODULE_ALIAS("md-level-10");
+
+module_param(max_queued_requests, int, S_IRUGO|S_IWUSR);
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
new file mode 100644
index 000000000..5ee6473dd
--- /dev/null
+++ b/drivers/md/raid10.h
@@ -0,0 +1,153 @@
+#ifndef _RAID10_H
+#define _RAID10_H
+
+struct raid10_info {
+ struct md_rdev *rdev, *replacement;
+ sector_t head_position;
+ int recovery_disabled; /* matches
+ * mddev->recovery_disabled
+ * when we shouldn't try
+ * recovering this device.
+ */
+};
+
+struct r10conf {
+ struct mddev *mddev;
+ struct raid10_info *mirrors;
+ struct raid10_info *mirrors_new, *mirrors_old;
+ spinlock_t device_lock;
+
+ /* geometry */
+ struct geom {
+ int raid_disks;
+ int near_copies; /* number of copies laid out
+ * raid0 style */
+ int far_copies; /* number of copies laid out
+ * at large strides across drives
+ */
+ int far_offset; /* far_copies are offset by 1
+ * stripe instead of many
+ */
+ sector_t stride; /* distance between far copies.
+ * This is size / far_copies unless
+ * far_offset, in which case it is
+ * 1 stripe.
+ */
+ int far_set_size; /* The number of devices in a set,
+ * where a 'set' are devices that
+ * contain far/offset copies of
+ * each other.
+ */
+ int chunk_shift; /* shift from chunks to sectors */
+ sector_t chunk_mask;
+ } prev, geo;
+ int copies; /* near_copies * far_copies.
+ * must be <= raid_disks
+ */
+
+ sector_t dev_sectors; /* temp copy of
+ * mddev->dev_sectors */
+ sector_t reshape_progress;
+ sector_t reshape_safe;
+ unsigned long reshape_checkpoint;
+ sector_t offset_diff;
+
+ struct list_head retry_list;
+ /* queue pending writes and submit them on unplug */
+ struct bio_list pending_bio_list;
+ int pending_count;
+
+ spinlock_t resync_lock;
+ int nr_pending;
+ int nr_waiting;
+ int nr_queued;
+ int barrier;
+ sector_t next_resync;
+ int fullsync; /* set to 1 if a full sync is needed,
+ * (fresh device added).
+ * Cleared when a sync completes.
+ */
+ int have_replacement; /* There is at least one
+ * replacement device.
+ */
+ wait_queue_head_t wait_barrier;
+
+ mempool_t *r10bio_pool;
+ mempool_t *r10buf_pool;
+ struct page *tmppage;
+
+ /* When taking over an array from a different personality, we store
+ * the new thread here until we fully activate the array.
+ */
+ struct md_thread *thread;
+};
+
+/*
+ * this is our 'private' RAID10 bio.
+ *
+ * it contains information about what kind of IO operations were started
+ * for this RAID10 operation, and about their status:
+ */
+
+struct r10bio {
+ atomic_t remaining; /* 'have we finished' count,
+ * used from IRQ handlers
+ */
+ sector_t sector; /* virtual sector number */
+ int sectors;
+ unsigned long state;
+ struct mddev *mddev;
+ /*
+ * original bio going to /dev/mdx
+ */
+ struct bio *master_bio;
+ /*
+ * if the IO is in READ direction, then this is where we read
+ */
+ int read_slot;
+
+ struct list_head retry_list;
+ /*
+ * if the IO is in WRITE direction, then multiple bios are used,
+ * one for each copy.
+ * When resyncing we also use one for each copy.
+ * When reconstructing, we use 2 bios, one for read, one for write.
+ * We choose the number when they are allocated.
+ * We sometimes need an extra bio to write to the replacement.
+ */
+ struct r10dev {
+ struct bio *bio;
+ union {
+ struct bio *repl_bio; /* used for resync and
+ * writes */
+ struct md_rdev *rdev; /* used for reads
+ * (read_slot >= 0) */
+ };
+ sector_t addr;
+ int devnum;
+ } devs[0];
+};
+
+/* bits for r10bio.state */
+enum r10bio_state {
+ R10BIO_Uptodate,
+ R10BIO_IsSync,
+ R10BIO_IsRecover,
+ R10BIO_IsReshape,
+ R10BIO_Degraded,
+/* Set ReadError on bios that experience a read error
+ * so that raid10d knows what to do with them.
+ */
+ R10BIO_ReadError,
+/* If a write for this request means we can clear some
+ * known-bad-block records, we set this flag.
+ */
+ R10BIO_MadeGood,
+ R10BIO_WriteError,
+/* During a reshape we might be performing IO on the
+ * 'previous' part of the array, in which case this
+ * flag is set
+ */
+ R10BIO_Previous,
+};
+#endif
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
new file mode 100644
index 000000000..b6793d2e0
--- /dev/null
+++ b/drivers/md/raid5.c
@@ -0,0 +1,7833 @@
+/*
+ * raid5.c : Multiple Devices driver for Linux
+ * Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
+ * Copyright (C) 1999, 2000 Ingo Molnar
+ * Copyright (C) 2002, 2003 H. Peter Anvin
+ *
+ * RAID-4/5/6 management functions.
+ * Thanks to Penguin Computing for making the RAID-6 development possible
+ * by donating a test server!
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * You should have received a copy of the GNU General Public License
+ * (for example /usr/src/linux/COPYING); if not, write to the Free
+ * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/*
+ * BITMAP UNPLUGGING:
+ *
+ * The sequencing for updating the bitmap reliably is a little
+ * subtle (and I got it wrong the first time) so it deserves some
+ * explanation.
+ *
+ * We group bitmap updates into batches. Each batch has a number.
+ * We may write out several batches at once, but that isn't very important.
+ * conf->seq_write is the number of the last batch successfully written.
+ * conf->seq_flush is the number of the last batch that was closed to
+ * new additions.
+ * When we discover that we will need to write to any block in a stripe
+ * (in add_stripe_bio) we update the in-memory bitmap and record in sh->bm_seq
+ * the number of the batch it will be in. This is seq_flush+1.
+ * When we are ready to do a write, if that batch hasn't been written yet,
+ * we plug the array and queue the stripe for later.
+ * When an unplug happens, we increment bm_flush, thus closing the current
+ * batch.
+ * When we notice that bm_flush > bm_write, we write out all pending updates
+ * to the bitmap, and advance bm_write to where bm_flush was.
+ * This may occasionally write a bit out twice, but is sure never to
+ * miss any bits.
+ */
+
+#include <linux/blkdev.h>
+#include <linux/kthread.h>
+#include <linux/raid/pq.h>
+#include <linux/async_tx.h>
+#include <linux/module.h>
+#include <linux/async.h>
+#include <linux/seq_file.h>
+#include <linux/cpu.h>
+#include <linux/slab.h>
+#include <linux/ratelimit.h>
+#include <linux/nodemask.h>
+#include <linux/flex_array.h>
+#include <trace/events/block.h>
+
+#include "md.h"
+#include "raid5.h"
+#include "raid0.h"
+#include "bitmap.h"
+
+#define cpu_to_group(cpu) cpu_to_node(cpu)
+#define ANY_GROUP NUMA_NO_NODE
+
+static bool devices_handle_discard_safely = false;
+module_param(devices_handle_discard_safely, bool, 0644);
+MODULE_PARM_DESC(devices_handle_discard_safely,
+ "Set to Y if all devices in each array reliably return zeroes on reads from discarded regions");
+static struct workqueue_struct *raid5_wq;
+/*
+ * Stripe cache
+ */
+
+#define NR_STRIPES 256
+#define STRIPE_SIZE PAGE_SIZE
+#define STRIPE_SHIFT (PAGE_SHIFT - 9)
+#define STRIPE_SECTORS (STRIPE_SIZE>>9)
+#define IO_THRESHOLD 1
+#define BYPASS_THRESHOLD 1
+#define NR_HASH (PAGE_SIZE / sizeof(struct hlist_head))
+#define HASH_MASK (NR_HASH - 1)
+#define MAX_STRIPE_BATCH 8
+
+static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect)
+{
+ int hash = (sect >> STRIPE_SHIFT) & HASH_MASK;
+ return &conf->stripe_hashtbl[hash];
+}
+
+static inline int stripe_hash_locks_hash(sector_t sect)
+{
+ return (sect >> STRIPE_SHIFT) & STRIPE_HASH_LOCKS_MASK;
+}
+
+static inline void lock_device_hash_lock(struct r5conf *conf, int hash)
+{
+ spin_lock_irq(conf->hash_locks + hash);
+ spin_lock(&conf->device_lock);
+}
+
+static inline void unlock_device_hash_lock(struct r5conf *conf, int hash)
+{
+ spin_unlock(&conf->device_lock);
+ spin_unlock_irq(conf->hash_locks + hash);
+}
+
+static inline void lock_all_device_hash_locks_irq(struct r5conf *conf)
+{
+ int i;
+ local_irq_disable();
+ spin_lock(conf->hash_locks);
+ for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++)
+ spin_lock_nest_lock(conf->hash_locks + i, conf->hash_locks);
+ spin_lock(&conf->device_lock);
+}
+
+static inline void unlock_all_device_hash_locks_irq(struct r5conf *conf)
+{
+ int i;
+ spin_unlock(&conf->device_lock);
+ for (i = NR_STRIPE_HASH_LOCKS; i; i--)
+ spin_unlock(conf->hash_locks + i - 1);
+ local_irq_enable();
+}
+
+/* bio's attached to a stripe+device for I/O are linked together in bi_sector
+ * order without overlap. There may be several bio's per stripe+device, and
+ * a bio could span several devices.
+ * When walking this list for a particular stripe+device, we must never proceed
+ * beyond a bio that extends past this device, as the next bio might no longer
+ * be valid.
+ * This function is used to determine the 'next' bio in the list, given the sector
+ * of the current stripe+device
+ */
+static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector)
+{
+ int sectors = bio_sectors(bio);
+ if (bio->bi_iter.bi_sector + sectors < sector + STRIPE_SECTORS)
+ return bio->bi_next;
+ else
+ return NULL;
+}
+
+/*
+ * We maintain a biased count of active stripes in the bottom 16 bits of
+ * bi_phys_segments, and a count of processed stripes in the upper 16 bits
+ */
+static inline int raid5_bi_processed_stripes(struct bio *bio)
+{
+ atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
+ return (atomic_read(segments) >> 16) & 0xffff;
+}
+
+static inline int raid5_dec_bi_active_stripes(struct bio *bio)
+{
+ atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
+ return atomic_sub_return(1, segments) & 0xffff;
+}
+
+static inline void raid5_inc_bi_active_stripes(struct bio *bio)
+{
+ atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
+ atomic_inc(segments);
+}
+
+static inline void raid5_set_bi_processed_stripes(struct bio *bio,
+ unsigned int cnt)
+{
+ atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
+ int old, new;
+
+ do {
+ old = atomic_read(segments);
+ new = (old & 0xffff) | (cnt << 16);
+ } while (atomic_cmpxchg(segments, old, new) != old);
+}
+
+static inline void raid5_set_bi_stripes(struct bio *bio, unsigned int cnt)
+{
+ atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
+ atomic_set(segments, cnt);
+}
+
+/* Find first data disk in a raid6 stripe */
+static inline int raid6_d0(struct stripe_head *sh)
+{
+ if (sh->ddf_layout)
+ /* ddf always start from first device */
+ return 0;
+ /* md starts just after Q block */
+ if (sh->qd_idx == sh->disks - 1)
+ return 0;
+ else
+ return sh->qd_idx + 1;
+}
+static inline int raid6_next_disk(int disk, int raid_disks)
+{
+ disk++;
+ return (disk < raid_disks) ? disk : 0;
+}
+
+/* When walking through the disks in a raid5, starting at raid6_d0,
+ * We need to map each disk to a 'slot', where the data disks are slot
+ * 0 .. raid_disks-3, the parity disk is raid_disks-2 and the Q disk
+ * is raid_disks-1. This help does that mapping.
+ */
+static int raid6_idx_to_slot(int idx, struct stripe_head *sh,
+ int *count, int syndrome_disks)
+{
+ int slot = *count;
+
+ if (sh->ddf_layout)
+ (*count)++;
+ if (idx == sh->pd_idx)
+ return syndrome_disks;
+ if (idx == sh->qd_idx)
+ return syndrome_disks + 1;
+ if (!sh->ddf_layout)
+ (*count)++;
+ return slot;
+}
+
+static void return_io(struct bio *return_bi)
+{
+ struct bio *bi = return_bi;
+ while (bi) {
+
+ return_bi = bi->bi_next;
+ bi->bi_next = NULL;
+ bi->bi_iter.bi_size = 0;
+ trace_block_bio_complete(bdev_get_queue(bi->bi_bdev),
+ bi, 0);
+ bio_endio(bi, 0);
+ bi = return_bi;
+ }
+}
+
+static void print_raid5_conf (struct r5conf *conf);
+
+static int stripe_operations_active(struct stripe_head *sh)
+{
+ return sh->check_state || sh->reconstruct_state ||
+ test_bit(STRIPE_BIOFILL_RUN, &sh->state) ||
+ test_bit(STRIPE_COMPUTE_RUN, &sh->state);
+}
+
+static void raid5_wakeup_stripe_thread(struct stripe_head *sh)
+{
+ struct r5conf *conf = sh->raid_conf;
+ struct r5worker_group *group;
+ int thread_cnt;
+ int i, cpu = sh->cpu;
+
+ if (!cpu_online(cpu)) {
+ cpu = cpumask_any(cpu_online_mask);
+ sh->cpu = cpu;
+ }
+
+ if (list_empty(&sh->lru)) {
+ struct r5worker_group *group;
+ group = conf->worker_groups + cpu_to_group(cpu);
+ list_add_tail(&sh->lru, &group->handle_list);
+ group->stripes_cnt++;
+ sh->group = group;
+ }
+
+ if (conf->worker_cnt_per_group == 0) {
+ md_wakeup_thread(conf->mddev->thread);
+ return;
+ }
+
+ group = conf->worker_groups + cpu_to_group(sh->cpu);
+
+ group->workers[0].working = true;
+ /* at least one worker should run to avoid race */
+ queue_work_on(sh->cpu, raid5_wq, &group->workers[0].work);
+
+ thread_cnt = group->stripes_cnt / MAX_STRIPE_BATCH - 1;
+ /* wakeup more workers */
+ for (i = 1; i < conf->worker_cnt_per_group && thread_cnt > 0; i++) {
+ if (group->workers[i].working == false) {
+ group->workers[i].working = true;
+ queue_work_on(sh->cpu, raid5_wq,
+ &group->workers[i].work);
+ thread_cnt--;
+ }
+ }
+}
+
+static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
+ struct list_head *temp_inactive_list)
+{
+ BUG_ON(!list_empty(&sh->lru));
+ BUG_ON(atomic_read(&conf->active_stripes)==0);
+ if (test_bit(STRIPE_HANDLE, &sh->state)) {
+ if (test_bit(STRIPE_DELAYED, &sh->state) &&
+ !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
+ list_add_tail(&sh->lru, &conf->delayed_list);
+ else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
+ sh->bm_seq - conf->seq_write > 0)
+ list_add_tail(&sh->lru, &conf->bitmap_list);
+ else {
+ clear_bit(STRIPE_DELAYED, &sh->state);
+ clear_bit(STRIPE_BIT_DELAY, &sh->state);
+ if (conf->worker_cnt_per_group == 0) {
+ list_add_tail(&sh->lru, &conf->handle_list);
+ } else {
+ raid5_wakeup_stripe_thread(sh);
+ return;
+ }
+ }
+ md_wakeup_thread(conf->mddev->thread);
+ } else {
+ BUG_ON(stripe_operations_active(sh));
+ if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
+ if (atomic_dec_return(&conf->preread_active_stripes)
+ < IO_THRESHOLD)
+ md_wakeup_thread(conf->mddev->thread);
+ atomic_dec(&conf->active_stripes);
+ if (!test_bit(STRIPE_EXPANDING, &sh->state))
+ list_add_tail(&sh->lru, temp_inactive_list);
+ }
+}
+
+static void __release_stripe(struct r5conf *conf, struct stripe_head *sh,
+ struct list_head *temp_inactive_list)
+{
+ if (atomic_dec_and_test(&sh->count))
+ do_release_stripe(conf, sh, temp_inactive_list);
+}
+
+/*
+ * @hash could be NR_STRIPE_HASH_LOCKS, then we have a list of inactive_list
+ *
+ * Be careful: Only one task can add/delete stripes from temp_inactive_list at
+ * given time. Adding stripes only takes device lock, while deleting stripes
+ * only takes hash lock.
+ */
+static void release_inactive_stripe_list(struct r5conf *conf,
+ struct list_head *temp_inactive_list,
+ int hash)
+{
+ int size;
+ bool do_wakeup = false;
+ unsigned long flags;
+
+ if (hash == NR_STRIPE_HASH_LOCKS) {
+ size = NR_STRIPE_HASH_LOCKS;
+ hash = NR_STRIPE_HASH_LOCKS - 1;
+ } else
+ size = 1;
+ while (size) {
+ struct list_head *list = &temp_inactive_list[size - 1];
+
+ /*
+ * We don't hold any lock here yet, get_active_stripe() might
+ * remove stripes from the list
+ */
+ if (!list_empty_careful(list)) {
+ spin_lock_irqsave(conf->hash_locks + hash, flags);
+ if (list_empty(conf->inactive_list + hash) &&
+ !list_empty(list))
+ atomic_dec(&conf->empty_inactive_list_nr);
+ list_splice_tail_init(list, conf->inactive_list + hash);
+ do_wakeup = true;
+ spin_unlock_irqrestore(conf->hash_locks + hash, flags);
+ }
+ size--;
+ hash--;
+ }
+
+ if (do_wakeup) {
+ wake_up(&conf->wait_for_stripe);
+ if (conf->retry_read_aligned)
+ md_wakeup_thread(conf->mddev->thread);
+ }
+}
+
+/* should hold conf->device_lock already */
+static int release_stripe_list(struct r5conf *conf,
+ struct list_head *temp_inactive_list)
+{
+ struct stripe_head *sh;
+ int count = 0;
+ struct llist_node *head;
+
+ head = llist_del_all(&conf->released_stripes);
+ head = llist_reverse_order(head);
+ while (head) {
+ int hash;
+
+ sh = llist_entry(head, struct stripe_head, release_list);
+ head = llist_next(head);
+ /* sh could be readded after STRIPE_ON_RELEASE_LIST is cleard */
+ smp_mb();
+ clear_bit(STRIPE_ON_RELEASE_LIST, &sh->state);
+ /*
+ * Don't worry the bit is set here, because if the bit is set
+ * again, the count is always > 1. This is true for
+ * STRIPE_ON_UNPLUG_LIST bit too.
+ */
+ hash = sh->hash_lock_index;
+ __release_stripe(conf, sh, &temp_inactive_list[hash]);
+ count++;
+ }
+
+ return count;
+}
+
+static void release_stripe(struct stripe_head *sh)
+{
+ struct r5conf *conf = sh->raid_conf;
+ unsigned long flags;
+ struct list_head list;
+ int hash;
+ bool wakeup;
+
+ /* Avoid release_list until the last reference.
+ */
+ if (atomic_add_unless(&sh->count, -1, 1))
+ return;
+
+ if (unlikely(!conf->mddev->thread) ||
+ test_and_set_bit(STRIPE_ON_RELEASE_LIST, &sh->state))
+ goto slow_path;
+ wakeup = llist_add(&sh->release_list, &conf->released_stripes);
+ if (wakeup)
+ md_wakeup_thread(conf->mddev->thread);
+ return;
+slow_path:
+ local_irq_save(flags);
+ /* we are ok here if STRIPE_ON_RELEASE_LIST is set or not */
+ if (atomic_dec_and_lock(&sh->count, &conf->device_lock)) {
+ INIT_LIST_HEAD(&list);
+ hash = sh->hash_lock_index;
+ do_release_stripe(conf, sh, &list);
+ spin_unlock(&conf->device_lock);
+ release_inactive_stripe_list(conf, &list, hash);
+ }
+ local_irq_restore(flags);
+}
+
+static inline void remove_hash(struct stripe_head *sh)
+{
+ pr_debug("remove_hash(), stripe %llu\n",
+ (unsigned long long)sh->sector);
+
+ hlist_del_init(&sh->hash);
+}
+
+static inline void insert_hash(struct r5conf *conf, struct stripe_head *sh)
+{
+ struct hlist_head *hp = stripe_hash(conf, sh->sector);
+
+ pr_debug("insert_hash(), stripe %llu\n",
+ (unsigned long long)sh->sector);
+
+ hlist_add_head(&sh->hash, hp);
+}
+
+/* find an idle stripe, make sure it is unhashed, and return it. */
+static struct stripe_head *get_free_stripe(struct r5conf *conf, int hash)
+{
+ struct stripe_head *sh = NULL;
+ struct list_head *first;
+
+ if (list_empty(conf->inactive_list + hash))
+ goto out;
+ first = (conf->inactive_list + hash)->next;
+ sh = list_entry(first, struct stripe_head, lru);
+ list_del_init(first);
+ remove_hash(sh);
+ atomic_inc(&conf->active_stripes);
+ BUG_ON(hash != sh->hash_lock_index);
+ if (list_empty(conf->inactive_list + hash))
+ atomic_inc(&conf->empty_inactive_list_nr);
+out:
+ return sh;
+}
+
+static void shrink_buffers(struct stripe_head *sh)
+{
+ struct page *p;
+ int i;
+ int num = sh->raid_conf->pool_size;
+
+ for (i = 0; i < num ; i++) {
+ WARN_ON(sh->dev[i].page != sh->dev[i].orig_page);
+ p = sh->dev[i].page;
+ if (!p)
+ continue;
+ sh->dev[i].page = NULL;
+ put_page(p);
+ }
+}
+
+static int grow_buffers(struct stripe_head *sh, gfp_t gfp)
+{
+ int i;
+ int num = sh->raid_conf->pool_size;
+
+ for (i = 0; i < num; i++) {
+ struct page *page;
+
+ if (!(page = alloc_page(gfp))) {
+ return 1;
+ }
+ sh->dev[i].page = page;
+ sh->dev[i].orig_page = page;
+ }
+ return 0;
+}
+
+static void raid5_build_block(struct stripe_head *sh, int i, int previous);
+static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous,
+ struct stripe_head *sh);
+
+static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
+{
+ struct r5conf *conf = sh->raid_conf;
+ int i, seq;
+
+ BUG_ON(atomic_read(&sh->count) != 0);
+ BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
+ BUG_ON(stripe_operations_active(sh));
+ BUG_ON(sh->batch_head);
+
+ pr_debug("init_stripe called, stripe %llu\n",
+ (unsigned long long)sector);
+retry:
+ seq = read_seqcount_begin(&conf->gen_lock);
+ sh->generation = conf->generation - previous;
+ sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks;
+ sh->sector = sector;
+ stripe_set_idx(sector, conf, previous, sh);
+ sh->state = 0;
+
+ for (i = sh->disks; i--; ) {
+ struct r5dev *dev = &sh->dev[i];
+
+ if (dev->toread || dev->read || dev->towrite || dev->written ||
+ test_bit(R5_LOCKED, &dev->flags)) {
+ printk(KERN_ERR "sector=%llx i=%d %p %p %p %p %d\n",
+ (unsigned long long)sh->sector, i, dev->toread,
+ dev->read, dev->towrite, dev->written,
+ test_bit(R5_LOCKED, &dev->flags));
+ WARN_ON(1);
+ }
+ dev->flags = 0;
+ raid5_build_block(sh, i, previous);
+ }
+ if (read_seqcount_retry(&conf->gen_lock, seq))
+ goto retry;
+ sh->overwrite_disks = 0;
+ insert_hash(conf, sh);
+ sh->cpu = smp_processor_id();
+ set_bit(STRIPE_BATCH_READY, &sh->state);
+}
+
+static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector,
+ short generation)
+{
+ struct stripe_head *sh;
+
+ pr_debug("__find_stripe, sector %llu\n", (unsigned long long)sector);
+ hlist_for_each_entry(sh, stripe_hash(conf, sector), hash)
+ if (sh->sector == sector && sh->generation == generation)
+ return sh;
+ pr_debug("__stripe %llu not in cache\n", (unsigned long long)sector);
+ return NULL;
+}
+
+/*
+ * Need to check if array has failed when deciding whether to:
+ * - start an array
+ * - remove non-faulty devices
+ * - add a spare
+ * - allow a reshape
+ * This determination is simple when no reshape is happening.
+ * However if there is a reshape, we need to carefully check
+ * both the before and after sections.
+ * This is because some failed devices may only affect one
+ * of the two sections, and some non-in_sync devices may
+ * be insync in the section most affected by failed devices.
+ */
+static int calc_degraded(struct r5conf *conf)
+{
+ int degraded, degraded2;
+ int i;
+
+ rcu_read_lock();
+ degraded = 0;
+ for (i = 0; i < conf->previous_raid_disks; i++) {
+ struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
+ if (rdev && test_bit(Faulty, &rdev->flags))
+ rdev = rcu_dereference(conf->disks[i].replacement);
+ if (!rdev || test_bit(Faulty, &rdev->flags))
+ degraded++;
+ else if (test_bit(In_sync, &rdev->flags))
+ ;
+ else
+ /* not in-sync or faulty.
+ * If the reshape increases the number of devices,
+ * this is being recovered by the reshape, so
+ * this 'previous' section is not in_sync.
+ * If the number of devices is being reduced however,
+ * the device can only be part of the array if
+ * we are reverting a reshape, so this section will
+ * be in-sync.
+ */
+ if (conf->raid_disks >= conf->previous_raid_disks)
+ degraded++;
+ }
+ rcu_read_unlock();
+ if (conf->raid_disks == conf->previous_raid_disks)
+ return degraded;
+ rcu_read_lock();
+ degraded2 = 0;
+ for (i = 0; i < conf->raid_disks; i++) {
+ struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
+ if (rdev && test_bit(Faulty, &rdev->flags))
+ rdev = rcu_dereference(conf->disks[i].replacement);
+ if (!rdev || test_bit(Faulty, &rdev->flags))
+ degraded2++;
+ else if (test_bit(In_sync, &rdev->flags))
+ ;
+ else
+ /* not in-sync or faulty.
+ * If reshape increases the number of devices, this
+ * section has already been recovered, else it
+ * almost certainly hasn't.
+ */
+ if (conf->raid_disks <= conf->previous_raid_disks)
+ degraded2++;
+ }
+ rcu_read_unlock();
+ if (degraded2 > degraded)
+ return degraded2;
+ return degraded;
+}
+
+static int has_failed(struct r5conf *conf)
+{
+ int degraded;
+
+ if (conf->mddev->reshape_position == MaxSector)
+ return conf->mddev->degraded > conf->max_degraded;
+
+ degraded = calc_degraded(conf);
+ if (degraded > conf->max_degraded)
+ return 1;
+ return 0;
+}
+
+static struct stripe_head *
+get_active_stripe(struct r5conf *conf, sector_t sector,
+ int previous, int noblock, int noquiesce)
+{
+ struct stripe_head *sh;
+ int hash = stripe_hash_locks_hash(sector);
+
+ pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector);
+
+ spin_lock_irq(conf->hash_locks + hash);
+
+ do {
+ wait_event_lock_irq(conf->wait_for_stripe,
+ conf->quiesce == 0 || noquiesce,
+ *(conf->hash_locks + hash));
+ sh = __find_stripe(conf, sector, conf->generation - previous);
+ if (!sh) {
+ if (!test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) {
+ sh = get_free_stripe(conf, hash);
+ if (!sh && llist_empty(&conf->released_stripes) &&
+ !test_bit(R5_DID_ALLOC, &conf->cache_state))
+ set_bit(R5_ALLOC_MORE,
+ &conf->cache_state);
+ }
+ if (noblock && sh == NULL)
+ break;
+ if (!sh) {
+ set_bit(R5_INACTIVE_BLOCKED,
+ &conf->cache_state);
+ wait_event_lock_irq(
+ conf->wait_for_stripe,
+ !list_empty(conf->inactive_list + hash) &&
+ (atomic_read(&conf->active_stripes)
+ < (conf->max_nr_stripes * 3 / 4)
+ || !test_bit(R5_INACTIVE_BLOCKED,
+ &conf->cache_state)),
+ *(conf->hash_locks + hash));
+ clear_bit(R5_INACTIVE_BLOCKED,
+ &conf->cache_state);
+ } else {
+ init_stripe(sh, sector, previous);
+ atomic_inc(&sh->count);
+ }
+ } else if (!atomic_inc_not_zero(&sh->count)) {
+ spin_lock(&conf->device_lock);
+ if (!atomic_read(&sh->count)) {
+ if (!test_bit(STRIPE_HANDLE, &sh->state))
+ atomic_inc(&conf->active_stripes);
+ BUG_ON(list_empty(&sh->lru) &&
+ !test_bit(STRIPE_EXPANDING, &sh->state));
+ list_del_init(&sh->lru);
+ if (sh->group) {
+ sh->group->stripes_cnt--;
+ sh->group = NULL;
+ }
+ }
+ atomic_inc(&sh->count);
+ spin_unlock(&conf->device_lock);
+ }
+ } while (sh == NULL);
+
+ spin_unlock_irq(conf->hash_locks + hash);
+ return sh;
+}
+
+static bool is_full_stripe_write(struct stripe_head *sh)
+{
+ BUG_ON(sh->overwrite_disks > (sh->disks - sh->raid_conf->max_degraded));
+ return sh->overwrite_disks == (sh->disks - sh->raid_conf->max_degraded);
+}
+
+static void lock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
+{
+ local_irq_disable();
+ if (sh1 > sh2) {
+ spin_lock(&sh2->stripe_lock);
+ spin_lock_nested(&sh1->stripe_lock, 1);
+ } else {
+ spin_lock(&sh1->stripe_lock);
+ spin_lock_nested(&sh2->stripe_lock, 1);
+ }
+}
+
+static void unlock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
+{
+ spin_unlock(&sh1->stripe_lock);
+ spin_unlock(&sh2->stripe_lock);
+ local_irq_enable();
+}
+
+/* Only freshly new full stripe normal write stripe can be added to a batch list */
+static bool stripe_can_batch(struct stripe_head *sh)
+{
+ return test_bit(STRIPE_BATCH_READY, &sh->state) &&
+ !test_bit(STRIPE_BITMAP_PENDING, &sh->state) &&
+ is_full_stripe_write(sh);
+}
+
+/* we only do back search */
+static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh)
+{
+ struct stripe_head *head;
+ sector_t head_sector, tmp_sec;
+ int hash;
+ int dd_idx;
+
+ if (!stripe_can_batch(sh))
+ return;
+ /* Don't cross chunks, so stripe pd_idx/qd_idx is the same */
+ tmp_sec = sh->sector;
+ if (!sector_div(tmp_sec, conf->chunk_sectors))
+ return;
+ head_sector = sh->sector - STRIPE_SECTORS;
+
+ hash = stripe_hash_locks_hash(head_sector);
+ spin_lock_irq(conf->hash_locks + hash);
+ head = __find_stripe(conf, head_sector, conf->generation);
+ if (head && !atomic_inc_not_zero(&head->count)) {
+ spin_lock(&conf->device_lock);
+ if (!atomic_read(&head->count)) {
+ if (!test_bit(STRIPE_HANDLE, &head->state))
+ atomic_inc(&conf->active_stripes);
+ BUG_ON(list_empty(&head->lru) &&
+ !test_bit(STRIPE_EXPANDING, &head->state));
+ list_del_init(&head->lru);
+ if (head->group) {
+ head->group->stripes_cnt--;
+ head->group = NULL;
+ }
+ }
+ atomic_inc(&head->count);
+ spin_unlock(&conf->device_lock);
+ }
+ spin_unlock_irq(conf->hash_locks + hash);
+
+ if (!head)
+ return;
+ if (!stripe_can_batch(head))
+ goto out;
+
+ lock_two_stripes(head, sh);
+ /* clear_batch_ready clear the flag */
+ if (!stripe_can_batch(head) || !stripe_can_batch(sh))
+ goto unlock_out;
+
+ if (sh->batch_head)
+ goto unlock_out;
+
+ dd_idx = 0;
+ while (dd_idx == sh->pd_idx || dd_idx == sh->qd_idx)
+ dd_idx++;
+ if (head->dev[dd_idx].towrite->bi_rw != sh->dev[dd_idx].towrite->bi_rw)
+ goto unlock_out;
+
+ if (head->batch_head) {
+ spin_lock(&head->batch_head->batch_lock);
+ /* This batch list is already running */
+ if (!stripe_can_batch(head)) {
+ spin_unlock(&head->batch_head->batch_lock);
+ goto unlock_out;
+ }
+
+ /*
+ * at this point, head's BATCH_READY could be cleared, but we
+ * can still add the stripe to batch list
+ */
+ list_add(&sh->batch_list, &head->batch_list);
+ spin_unlock(&head->batch_head->batch_lock);
+
+ sh->batch_head = head->batch_head;
+ } else {
+ head->batch_head = head;
+ sh->batch_head = head->batch_head;
+ spin_lock(&head->batch_lock);
+ list_add_tail(&sh->batch_list, &head->batch_list);
+ spin_unlock(&head->batch_lock);
+ }
+
+ if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
+ if (atomic_dec_return(&conf->preread_active_stripes)
+ < IO_THRESHOLD)
+ md_wakeup_thread(conf->mddev->thread);
+
+ if (test_and_clear_bit(STRIPE_BIT_DELAY, &sh->state)) {
+ int seq = sh->bm_seq;
+ if (test_bit(STRIPE_BIT_DELAY, &sh->batch_head->state) &&
+ sh->batch_head->bm_seq > seq)
+ seq = sh->batch_head->bm_seq;
+ set_bit(STRIPE_BIT_DELAY, &sh->batch_head->state);
+ sh->batch_head->bm_seq = seq;
+ }
+
+ atomic_inc(&sh->count);
+unlock_out:
+ unlock_two_stripes(head, sh);
+out:
+ release_stripe(head);
+}
+
+/* Determine if 'data_offset' or 'new_data_offset' should be used
+ * in this stripe_head.
+ */
+static int use_new_offset(struct r5conf *conf, struct stripe_head *sh)
+{
+ sector_t progress = conf->reshape_progress;
+ /* Need a memory barrier to make sure we see the value
+ * of conf->generation, or ->data_offset that was set before
+ * reshape_progress was updated.
+ */
+ smp_rmb();
+ if (progress == MaxSector)
+ return 0;
+ if (sh->generation == conf->generation - 1)
+ return 0;
+ /* We are in a reshape, and this is a new-generation stripe,
+ * so use new_data_offset.
+ */
+ return 1;
+}
+
+static void
+raid5_end_read_request(struct bio *bi, int error);
+static void
+raid5_end_write_request(struct bio *bi, int error);
+
+static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
+{
+ struct r5conf *conf = sh->raid_conf;
+ int i, disks = sh->disks;
+ struct stripe_head *head_sh = sh;
+
+ might_sleep();
+
+ for (i = disks; i--; ) {
+ int rw;
+ int replace_only = 0;
+ struct bio *bi, *rbi;
+ struct md_rdev *rdev, *rrdev = NULL;
+
+ sh = head_sh;
+ if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) {
+ if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags))
+ rw = WRITE_FUA;
+ else
+ rw = WRITE;
+ if (test_bit(R5_Discard, &sh->dev[i].flags))
+ rw |= REQ_DISCARD;
+ } else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
+ rw = READ;
+ else if (test_and_clear_bit(R5_WantReplace,
+ &sh->dev[i].flags)) {
+ rw = WRITE;
+ replace_only = 1;
+ } else
+ continue;
+ if (test_and_clear_bit(R5_SyncIO, &sh->dev[i].flags))
+ rw |= REQ_SYNC;
+
+again:
+ bi = &sh->dev[i].req;
+ rbi = &sh->dev[i].rreq; /* For writing to replacement */
+
+ rcu_read_lock();
+ rrdev = rcu_dereference(conf->disks[i].replacement);
+ smp_mb(); /* Ensure that if rrdev is NULL, rdev won't be */
+ rdev = rcu_dereference(conf->disks[i].rdev);
+ if (!rdev) {
+ rdev = rrdev;
+ rrdev = NULL;
+ }
+ if (rw & WRITE) {
+ if (replace_only)
+ rdev = NULL;
+ if (rdev == rrdev)
+ /* We raced and saw duplicates */
+ rrdev = NULL;
+ } else {
+ if (test_bit(R5_ReadRepl, &head_sh->dev[i].flags) && rrdev)
+ rdev = rrdev;
+ rrdev = NULL;
+ }
+
+ if (rdev && test_bit(Faulty, &rdev->flags))
+ rdev = NULL;
+ if (rdev)
+ atomic_inc(&rdev->nr_pending);
+ if (rrdev && test_bit(Faulty, &rrdev->flags))
+ rrdev = NULL;
+ if (rrdev)
+ atomic_inc(&rrdev->nr_pending);
+ rcu_read_unlock();
+
+ /* We have already checked bad blocks for reads. Now
+ * need to check for writes. We never accept write errors
+ * on the replacement, so we don't to check rrdev.
+ */
+ while ((rw & WRITE) && rdev &&
+ test_bit(WriteErrorSeen, &rdev->flags)) {
+ sector_t first_bad;
+ int bad_sectors;
+ int bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS,
+ &first_bad, &bad_sectors);
+ if (!bad)
+ break;
+
+ if (bad < 0) {
+ set_bit(BlockedBadBlocks, &rdev->flags);
+ if (!conf->mddev->external &&
+ conf->mddev->flags) {
+ /* It is very unlikely, but we might
+ * still need to write out the
+ * bad block log - better give it
+ * a chance*/
+ md_check_recovery(conf->mddev);
+ }
+ /*
+ * Because md_wait_for_blocked_rdev
+ * will dec nr_pending, we must
+ * increment it first.
+ */
+ atomic_inc(&rdev->nr_pending);
+ md_wait_for_blocked_rdev(rdev, conf->mddev);
+ } else {
+ /* Acknowledged bad block - skip the write */
+ rdev_dec_pending(rdev, conf->mddev);
+ rdev = NULL;
+ }
+ }
+
+ if (rdev) {
+ if (s->syncing || s->expanding || s->expanded
+ || s->replacing)
+ md_sync_acct(rdev->bdev, STRIPE_SECTORS);
+
+ set_bit(STRIPE_IO_STARTED, &sh->state);
+
+ bio_reset(bi);
+ bi->bi_bdev = rdev->bdev;
+ bi->bi_rw = rw;
+ bi->bi_end_io = (rw & WRITE)
+ ? raid5_end_write_request
+ : raid5_end_read_request;
+ bi->bi_private = sh;
+
+ pr_debug("%s: for %llu schedule op %ld on disc %d\n",
+ __func__, (unsigned long long)sh->sector,
+ bi->bi_rw, i);
+ atomic_inc(&sh->count);
+ if (sh != head_sh)
+ atomic_inc(&head_sh->count);
+ if (use_new_offset(conf, sh))
+ bi->bi_iter.bi_sector = (sh->sector
+ + rdev->new_data_offset);
+ else
+ bi->bi_iter.bi_sector = (sh->sector
+ + rdev->data_offset);
+ if (test_bit(R5_ReadNoMerge, &head_sh->dev[i].flags))
+ bi->bi_rw |= REQ_NOMERGE;
+
+ if (test_bit(R5_SkipCopy, &sh->dev[i].flags))
+ WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));
+ sh->dev[i].vec.bv_page = sh->dev[i].page;
+ bi->bi_vcnt = 1;
+ bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
+ bi->bi_io_vec[0].bv_offset = 0;
+ bi->bi_iter.bi_size = STRIPE_SIZE;
+ /*
+ * If this is discard request, set bi_vcnt 0. We don't
+ * want to confuse SCSI because SCSI will replace payload
+ */
+ if (rw & REQ_DISCARD)
+ bi->bi_vcnt = 0;
+ if (rrdev)
+ set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags);
+
+ if (conf->mddev->gendisk)
+ trace_block_bio_remap(bdev_get_queue(bi->bi_bdev),
+ bi, disk_devt(conf->mddev->gendisk),
+ sh->dev[i].sector);
+ generic_make_request(bi);
+ }
+ if (rrdev) {
+ if (s->syncing || s->expanding || s->expanded
+ || s->replacing)
+ md_sync_acct(rrdev->bdev, STRIPE_SECTORS);
+
+ set_bit(STRIPE_IO_STARTED, &sh->state);
+
+ bio_reset(rbi);
+ rbi->bi_bdev = rrdev->bdev;
+ rbi->bi_rw = rw;
+ BUG_ON(!(rw & WRITE));
+ rbi->bi_end_io = raid5_end_write_request;
+ rbi->bi_private = sh;
+
+ pr_debug("%s: for %llu schedule op %ld on "
+ "replacement disc %d\n",
+ __func__, (unsigned long long)sh->sector,
+ rbi->bi_rw, i);
+ atomic_inc(&sh->count);
+ if (sh != head_sh)
+ atomic_inc(&head_sh->count);
+ if (use_new_offset(conf, sh))
+ rbi->bi_iter.bi_sector = (sh->sector
+ + rrdev->new_data_offset);
+ else
+ rbi->bi_iter.bi_sector = (sh->sector
+ + rrdev->data_offset);
+ if (test_bit(R5_SkipCopy, &sh->dev[i].flags))
+ WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));
+ sh->dev[i].rvec.bv_page = sh->dev[i].page;
+ rbi->bi_vcnt = 1;
+ rbi->bi_io_vec[0].bv_len = STRIPE_SIZE;
+ rbi->bi_io_vec[0].bv_offset = 0;
+ rbi->bi_iter.bi_size = STRIPE_SIZE;
+ /*
+ * If this is discard request, set bi_vcnt 0. We don't
+ * want to confuse SCSI because SCSI will replace payload
+ */
+ if (rw & REQ_DISCARD)
+ rbi->bi_vcnt = 0;
+ if (conf->mddev->gendisk)
+ trace_block_bio_remap(bdev_get_queue(rbi->bi_bdev),
+ rbi, disk_devt(conf->mddev->gendisk),
+ sh->dev[i].sector);
+ generic_make_request(rbi);
+ }
+ if (!rdev && !rrdev) {
+ if (rw & WRITE)
+ set_bit(STRIPE_DEGRADED, &sh->state);
+ pr_debug("skip op %ld on disc %d for sector %llu\n",
+ bi->bi_rw, i, (unsigned long long)sh->sector);
+ clear_bit(R5_LOCKED, &sh->dev[i].flags);
+ set_bit(STRIPE_HANDLE, &sh->state);
+ }
+
+ if (!head_sh->batch_head)
+ continue;
+ sh = list_first_entry(&sh->batch_list, struct stripe_head,
+ batch_list);
+ if (sh != head_sh)
+ goto again;
+ }
+}
+
+static struct dma_async_tx_descriptor *
+async_copy_data(int frombio, struct bio *bio, struct page **page,
+ sector_t sector, struct dma_async_tx_descriptor *tx,
+ struct stripe_head *sh)
+{
+ struct bio_vec bvl;
+ struct bvec_iter iter;
+ struct page *bio_page;
+ int page_offset;
+ struct async_submit_ctl submit;
+ enum async_tx_flags flags = 0;
+
+ if (bio->bi_iter.bi_sector >= sector)
+ page_offset = (signed)(bio->bi_iter.bi_sector - sector) * 512;
+ else
+ page_offset = (signed)(sector - bio->bi_iter.bi_sector) * -512;
+
+ if (frombio)
+ flags |= ASYNC_TX_FENCE;
+ init_async_submit(&submit, flags, tx, NULL, NULL, NULL);
+
+ bio_for_each_segment(bvl, bio, iter) {
+ int len = bvl.bv_len;
+ int clen;
+ int b_offset = 0;
+
+ if (page_offset < 0) {
+ b_offset = -page_offset;
+ page_offset += b_offset;
+ len -= b_offset;
+ }
+
+ if (len > 0 && page_offset + len > STRIPE_SIZE)
+ clen = STRIPE_SIZE - page_offset;
+ else
+ clen = len;
+
+ if (clen > 0) {
+ b_offset += bvl.bv_offset;
+ bio_page = bvl.bv_page;
+ if (frombio) {
+ if (sh->raid_conf->skip_copy &&
+ b_offset == 0 && page_offset == 0 &&
+ clen == STRIPE_SIZE)
+ *page = bio_page;
+ else
+ tx = async_memcpy(*page, bio_page, page_offset,
+ b_offset, clen, &submit);
+ } else
+ tx = async_memcpy(bio_page, *page, b_offset,
+ page_offset, clen, &submit);
+ }
+ /* chain the operations */
+ submit.depend_tx = tx;
+
+ if (clen < len) /* hit end of page */
+ break;
+ page_offset += len;
+ }
+
+ return tx;
+}
+
+static void ops_complete_biofill(void *stripe_head_ref)
+{
+ struct stripe_head *sh = stripe_head_ref;
+ struct bio *return_bi = NULL;
+ int i;
+
+ pr_debug("%s: stripe %llu\n", __func__,
+ (unsigned long long)sh->sector);
+
+ /* clear completed biofills */
+ for (i = sh->disks; i--; ) {
+ struct r5dev *dev = &sh->dev[i];
+
+ /* acknowledge completion of a biofill operation */
+ /* and check if we need to reply to a read request,
+ * new R5_Wantfill requests are held off until
+ * !STRIPE_BIOFILL_RUN
+ */
+ if (test_and_clear_bit(R5_Wantfill, &dev->flags)) {
+ struct bio *rbi, *rbi2;
+
+ BUG_ON(!dev->read);
+ rbi = dev->read;
+ dev->read = NULL;
+ while (rbi && rbi->bi_iter.bi_sector <
+ dev->sector + STRIPE_SECTORS) {
+ rbi2 = r5_next_bio(rbi, dev->sector);
+ if (!raid5_dec_bi_active_stripes(rbi)) {
+ rbi->bi_next = return_bi;
+ return_bi = rbi;
+ }
+ rbi = rbi2;
+ }
+ }
+ }
+ clear_bit(STRIPE_BIOFILL_RUN, &sh->state);
+
+ return_io(return_bi);
+
+ set_bit(STRIPE_HANDLE, &sh->state);
+ release_stripe(sh);
+}
+
+static void ops_run_biofill(struct stripe_head *sh)
+{
+ struct dma_async_tx_descriptor *tx = NULL;
+ struct async_submit_ctl submit;
+ int i;
+
+ BUG_ON(sh->batch_head);
+ pr_debug("%s: stripe %llu\n", __func__,
+ (unsigned long long)sh->sector);
+
+ for (i = sh->disks; i--; ) {
+ struct r5dev *dev = &sh->dev[i];
+ if (test_bit(R5_Wantfill, &dev->flags)) {
+ struct bio *rbi;
+ spin_lock_irq(&sh->stripe_lock);
+ dev->read = rbi = dev->toread;
+ dev->toread = NULL;
+ spin_unlock_irq(&sh->stripe_lock);
+ while (rbi && rbi->bi_iter.bi_sector <
+ dev->sector + STRIPE_SECTORS) {
+ tx = async_copy_data(0, rbi, &dev->page,
+ dev->sector, tx, sh);
+ rbi = r5_next_bio(rbi, dev->sector);
+ }
+ }
+ }
+
+ atomic_inc(&sh->count);
+ init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_biofill, sh, NULL);
+ async_trigger_callback(&submit);
+}
+
+static void mark_target_uptodate(struct stripe_head *sh, int target)
+{
+ struct r5dev *tgt;
+
+ if (target < 0)
+ return;
+
+ tgt = &sh->dev[target];
+ set_bit(R5_UPTODATE, &tgt->flags);
+ BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
+ clear_bit(R5_Wantcompute, &tgt->flags);
+}
+
+static void ops_complete_compute(void *stripe_head_ref)
+{
+ struct stripe_head *sh = stripe_head_ref;
+
+ pr_debug("%s: stripe %llu\n", __func__,
+ (unsigned long long)sh->sector);
+
+ /* mark the computed target(s) as uptodate */
+ mark_target_uptodate(sh, sh->ops.target);
+ mark_target_uptodate(sh, sh->ops.target2);
+
+ clear_bit(STRIPE_COMPUTE_RUN, &sh->state);
+ if (sh->check_state == check_state_compute_run)
+ sh->check_state = check_state_compute_result;
+ set_bit(STRIPE_HANDLE, &sh->state);
+ release_stripe(sh);
+}
+
+/* return a pointer to the address conversion region of the scribble buffer */
+static addr_conv_t *to_addr_conv(struct stripe_head *sh,
+ struct raid5_percpu *percpu, int i)
+{
+ void *addr;
+
+ addr = flex_array_get(percpu->scribble, i);
+ return addr + sizeof(struct page *) * (sh->disks + 2);
+}
+
+/* return a pointer to the address conversion region of the scribble buffer */
+static struct page **to_addr_page(struct raid5_percpu *percpu, int i)
+{
+ void *addr;
+
+ addr = flex_array_get(percpu->scribble, i);
+ return addr;
+}
+
+static struct dma_async_tx_descriptor *
+ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu)
+{
+ int disks = sh->disks;
+ struct page **xor_srcs = to_addr_page(percpu, 0);
+ int target = sh->ops.target;
+ struct r5dev *tgt = &sh->dev[target];
+ struct page *xor_dest = tgt->page;
+ int count = 0;
+ struct dma_async_tx_descriptor *tx;
+ struct async_submit_ctl submit;
+ int i;
+
+ BUG_ON(sh->batch_head);
+
+ pr_debug("%s: stripe %llu block: %d\n",
+ __func__, (unsigned long long)sh->sector, target);
+ BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
+
+ for (i = disks; i--; )
+ if (i != target)
+ xor_srcs[count++] = sh->dev[i].page;
+
+ atomic_inc(&sh->count);
+
+ init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL,
+ ops_complete_compute, sh, to_addr_conv(sh, percpu, 0));
+ if (unlikely(count == 1))
+ tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
+ else
+ tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
+
+ return tx;
+}
+
+/* set_syndrome_sources - populate source buffers for gen_syndrome
+ * @srcs - (struct page *) array of size sh->disks
+ * @sh - stripe_head to parse
+ *
+ * Populates srcs in proper layout order for the stripe and returns the
+ * 'count' of sources to be used in a call to async_gen_syndrome. The P
+ * destination buffer is recorded in srcs[count] and the Q destination
+ * is recorded in srcs[count+1]].
+ */
+static int set_syndrome_sources(struct page **srcs,
+ struct stripe_head *sh,
+ int srctype)
+{
+ int disks = sh->disks;
+ int syndrome_disks = sh->ddf_layout ? disks : (disks - 2);
+ int d0_idx = raid6_d0(sh);
+ int count;
+ int i;
+
+ for (i = 0; i < disks; i++)
+ srcs[i] = NULL;
+
+ count = 0;
+ i = d0_idx;
+ do {
+ int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
+ struct r5dev *dev = &sh->dev[i];
+
+ if (i == sh->qd_idx || i == sh->pd_idx ||
+ (srctype == SYNDROME_SRC_ALL) ||
+ (srctype == SYNDROME_SRC_WANT_DRAIN &&
+ test_bit(R5_Wantdrain, &dev->flags)) ||
+ (srctype == SYNDROME_SRC_WRITTEN &&
+ dev->written))
+ srcs[slot] = sh->dev[i].page;
+ i = raid6_next_disk(i, disks);
+ } while (i != d0_idx);
+
+ return syndrome_disks;
+}
+
+static struct dma_async_tx_descriptor *
+ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu)
+{
+ int disks = sh->disks;
+ struct page **blocks = to_addr_page(percpu, 0);
+ int target;
+ int qd_idx = sh->qd_idx;
+ struct dma_async_tx_descriptor *tx;
+ struct async_submit_ctl submit;
+ struct r5dev *tgt;
+ struct page *dest;
+ int i;
+ int count;
+
+ BUG_ON(sh->batch_head);
+ if (sh->ops.target < 0)
+ target = sh->ops.target2;
+ else if (sh->ops.target2 < 0)
+ target = sh->ops.target;
+ else
+ /* we should only have one valid target */
+ BUG();
+ BUG_ON(target < 0);
+ pr_debug("%s: stripe %llu block: %d\n",
+ __func__, (unsigned long long)sh->sector, target);
+
+ tgt = &sh->dev[target];
+ BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
+ dest = tgt->page;
+
+ atomic_inc(&sh->count);
+
+ if (target == qd_idx) {
+ count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_ALL);
+ blocks[count] = NULL; /* regenerating p is not necessary */
+ BUG_ON(blocks[count+1] != dest); /* q should already be set */
+ init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
+ ops_complete_compute, sh,
+ to_addr_conv(sh, percpu, 0));
+ tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
+ } else {
+ /* Compute any data- or p-drive using XOR */
+ count = 0;
+ for (i = disks; i-- ; ) {
+ if (i == target || i == qd_idx)
+ continue;
+ blocks[count++] = sh->dev[i].page;
+ }
+
+ init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
+ NULL, ops_complete_compute, sh,
+ to_addr_conv(sh, percpu, 0));
+ tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, &submit);
+ }
+
+ return tx;
+}
+
+static struct dma_async_tx_descriptor *
+ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu)
+{
+ int i, count, disks = sh->disks;
+ int syndrome_disks = sh->ddf_layout ? disks : disks-2;
+ int d0_idx = raid6_d0(sh);
+ int faila = -1, failb = -1;
+ int target = sh->ops.target;
+ int target2 = sh->ops.target2;
+ struct r5dev *tgt = &sh->dev[target];
+ struct r5dev *tgt2 = &sh->dev[target2];
+ struct dma_async_tx_descriptor *tx;
+ struct page **blocks = to_addr_page(percpu, 0);
+ struct async_submit_ctl submit;
+
+ BUG_ON(sh->batch_head);
+ pr_debug("%s: stripe %llu block1: %d block2: %d\n",
+ __func__, (unsigned long long)sh->sector, target, target2);
+ BUG_ON(target < 0 || target2 < 0);
+ BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
+ BUG_ON(!test_bit(R5_Wantcompute, &tgt2->flags));
+
+ /* we need to open-code set_syndrome_sources to handle the
+ * slot number conversion for 'faila' and 'failb'
+ */
+ for (i = 0; i < disks ; i++)
+ blocks[i] = NULL;
+ count = 0;
+ i = d0_idx;
+ do {
+ int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
+
+ blocks[slot] = sh->dev[i].page;
+
+ if (i == target)
+ faila = slot;
+ if (i == target2)
+ failb = slot;
+ i = raid6_next_disk(i, disks);
+ } while (i != d0_idx);
+
+ BUG_ON(faila == failb);
+ if (failb < faila)
+ swap(faila, failb);
+ pr_debug("%s: stripe: %llu faila: %d failb: %d\n",
+ __func__, (unsigned long long)sh->sector, faila, failb);
+
+ atomic_inc(&sh->count);
+
+ if (failb == syndrome_disks+1) {
+ /* Q disk is one of the missing disks */
+ if (faila == syndrome_disks) {
+ /* Missing P+Q, just recompute */
+ init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
+ ops_complete_compute, sh,
+ to_addr_conv(sh, percpu, 0));
+ return async_gen_syndrome(blocks, 0, syndrome_disks+2,
+ STRIPE_SIZE, &submit);
+ } else {
+ struct page *dest;
+ int data_target;
+ int qd_idx = sh->qd_idx;
+
+ /* Missing D+Q: recompute D from P, then recompute Q */
+ if (target == qd_idx)
+ data_target = target2;
+ else
+ data_target = target;
+
+ count = 0;
+ for (i = disks; i-- ; ) {
+ if (i == data_target || i == qd_idx)
+ continue;
+ blocks[count++] = sh->dev[i].page;
+ }
+ dest = sh->dev[data_target].page;
+ init_async_submit(&submit,
+ ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
+ NULL, NULL, NULL,
+ to_addr_conv(sh, percpu, 0));
+ tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE,
+ &submit);
+
+ count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_ALL);
+ init_async_submit(&submit, ASYNC_TX_FENCE, tx,
+ ops_complete_compute, sh,
+ to_addr_conv(sh, percpu, 0));
+ return async_gen_syndrome(blocks, 0, count+2,
+ STRIPE_SIZE, &submit);
+ }
+ } else {
+ init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
+ ops_complete_compute, sh,
+ to_addr_conv(sh, percpu, 0));
+ if (failb == syndrome_disks) {
+ /* We're missing D+P. */
+ return async_raid6_datap_recov(syndrome_disks+2,
+ STRIPE_SIZE, faila,
+ blocks, &submit);
+ } else {
+ /* We're missing D+D. */
+ return async_raid6_2data_recov(syndrome_disks+2,
+ STRIPE_SIZE, faila, failb,
+ blocks, &submit);
+ }
+ }
+}
+
+static void ops_complete_prexor(void *stripe_head_ref)
+{
+ struct stripe_head *sh = stripe_head_ref;
+
+ pr_debug("%s: stripe %llu\n", __func__,
+ (unsigned long long)sh->sector);
+}
+
+static struct dma_async_tx_descriptor *
+ops_run_prexor5(struct stripe_head *sh, struct raid5_percpu *percpu,
+ struct dma_async_tx_descriptor *tx)
+{
+ int disks = sh->disks;
+ struct page **xor_srcs = to_addr_page(percpu, 0);
+ int count = 0, pd_idx = sh->pd_idx, i;
+ struct async_submit_ctl submit;
+
+ /* existing parity data subtracted */
+ struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
+
+ BUG_ON(sh->batch_head);
+ pr_debug("%s: stripe %llu\n", __func__,
+ (unsigned long long)sh->sector);
+
+ for (i = disks; i--; ) {
+ struct r5dev *dev = &sh->dev[i];
+ /* Only process blocks that are known to be uptodate */
+ if (test_bit(R5_Wantdrain, &dev->flags))
+ xor_srcs[count++] = dev->page;
+ }
+
+ init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
+ ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0));
+ tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
+
+ return tx;
+}
+
+static struct dma_async_tx_descriptor *
+ops_run_prexor6(struct stripe_head *sh, struct raid5_percpu *percpu,
+ struct dma_async_tx_descriptor *tx)
+{
+ struct page **blocks = to_addr_page(percpu, 0);
+ int count;
+ struct async_submit_ctl submit;
+
+ pr_debug("%s: stripe %llu\n", __func__,
+ (unsigned long long)sh->sector);
+
+ count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_WANT_DRAIN);
+
+ init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_PQ_XOR_DST, tx,
+ ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0));
+ tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
+
+ return tx;
+}
+
+static struct dma_async_tx_descriptor *
+ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
+{
+ int disks = sh->disks;
+ int i;
+ struct stripe_head *head_sh = sh;
+
+ pr_debug("%s: stripe %llu\n", __func__,
+ (unsigned long long)sh->sector);
+
+ for (i = disks; i--; ) {
+ struct r5dev *dev;
+ struct bio *chosen;
+
+ sh = head_sh;
+ if (test_and_clear_bit(R5_Wantdrain, &head_sh->dev[i].flags)) {
+ struct bio *wbi;
+
+again:
+ dev = &sh->dev[i];
+ spin_lock_irq(&sh->stripe_lock);
+ chosen = dev->towrite;
+ dev->towrite = NULL;
+ sh->overwrite_disks = 0;
+ BUG_ON(dev->written);
+ wbi = dev->written = chosen;
+ spin_unlock_irq(&sh->stripe_lock);
+ WARN_ON(dev->page != dev->orig_page);
+
+ while (wbi && wbi->bi_iter.bi_sector <
+ dev->sector + STRIPE_SECTORS) {
+ if (wbi->bi_rw & REQ_FUA)
+ set_bit(R5_WantFUA, &dev->flags);
+ if (wbi->bi_rw & REQ_SYNC)
+ set_bit(R5_SyncIO, &dev->flags);
+ if (wbi->bi_rw & REQ_DISCARD)
+ set_bit(R5_Discard, &dev->flags);
+ else {
+ tx = async_copy_data(1, wbi, &dev->page,
+ dev->sector, tx, sh);
+ if (dev->page != dev->orig_page) {
+ set_bit(R5_SkipCopy, &dev->flags);
+ clear_bit(R5_UPTODATE, &dev->flags);
+ clear_bit(R5_OVERWRITE, &dev->flags);
+ }
+ }
+ wbi = r5_next_bio(wbi, dev->sector);
+ }
+
+ if (head_sh->batch_head) {
+ sh = list_first_entry(&sh->batch_list,
+ struct stripe_head,
+ batch_list);
+ if (sh == head_sh)
+ continue;
+ goto again;
+ }
+ }
+ }
+
+ return tx;
+}
+
+static void ops_complete_reconstruct(void *stripe_head_ref)
+{
+ struct stripe_head *sh = stripe_head_ref;
+ int disks = sh->disks;
+ int pd_idx = sh->pd_idx;
+ int qd_idx = sh->qd_idx;
+ int i;
+ bool fua = false, sync = false, discard = false;
+
+ pr_debug("%s: stripe %llu\n", __func__,
+ (unsigned long long)sh->sector);
+
+ for (i = disks; i--; ) {
+ fua |= test_bit(R5_WantFUA, &sh->dev[i].flags);
+ sync |= test_bit(R5_SyncIO, &sh->dev[i].flags);
+ discard |= test_bit(R5_Discard, &sh->dev[i].flags);
+ }
+
+ for (i = disks; i--; ) {
+ struct r5dev *dev = &sh->dev[i];
+
+ if (dev->written || i == pd_idx || i == qd_idx) {
+ if (!discard && !test_bit(R5_SkipCopy, &dev->flags))
+ set_bit(R5_UPTODATE, &dev->flags);
+ if (fua)
+ set_bit(R5_WantFUA, &dev->flags);
+ if (sync)
+ set_bit(R5_SyncIO, &dev->flags);
+ }
+ }
+
+ if (sh->reconstruct_state == reconstruct_state_drain_run)
+ sh->reconstruct_state = reconstruct_state_drain_result;
+ else if (sh->reconstruct_state == reconstruct_state_prexor_drain_run)
+ sh->reconstruct_state = reconstruct_state_prexor_drain_result;
+ else {
+ BUG_ON(sh->reconstruct_state != reconstruct_state_run);
+ sh->reconstruct_state = reconstruct_state_result;
+ }
+
+ set_bit(STRIPE_HANDLE, &sh->state);
+ release_stripe(sh);
+}
+
+static void
+ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu,
+ struct dma_async_tx_descriptor *tx)
+{
+ int disks = sh->disks;
+ struct page **xor_srcs;
+ struct async_submit_ctl submit;
+ int count, pd_idx = sh->pd_idx, i;
+ struct page *xor_dest;
+ int prexor = 0;
+ unsigned long flags;
+ int j = 0;
+ struct stripe_head *head_sh = sh;
+ int last_stripe;
+
+ pr_debug("%s: stripe %llu\n", __func__,
+ (unsigned long long)sh->sector);
+
+ for (i = 0; i < sh->disks; i++) {
+ if (pd_idx == i)
+ continue;
+ if (!test_bit(R5_Discard, &sh->dev[i].flags))
+ break;
+ }
+ if (i >= sh->disks) {
+ atomic_inc(&sh->count);
+ set_bit(R5_Discard, &sh->dev[pd_idx].flags);
+ ops_complete_reconstruct(sh);
+ return;
+ }
+again:
+ count = 0;
+ xor_srcs = to_addr_page(percpu, j);
+ /* check if prexor is active which means only process blocks
+ * that are part of a read-modify-write (written)
+ */
+ if (head_sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
+ prexor = 1;
+ xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
+ for (i = disks; i--; ) {
+ struct r5dev *dev = &sh->dev[i];
+ if (head_sh->dev[i].written)
+ xor_srcs[count++] = dev->page;
+ }
+ } else {
+ xor_dest = sh->dev[pd_idx].page;
+ for (i = disks; i--; ) {
+ struct r5dev *dev = &sh->dev[i];
+ if (i != pd_idx)
+ xor_srcs[count++] = dev->page;
+ }
+ }
+
+ /* 1/ if we prexor'd then the dest is reused as a source
+ * 2/ if we did not prexor then we are redoing the parity
+ * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST
+ * for the synchronous xor case
+ */
+ last_stripe = !head_sh->batch_head ||
+ list_first_entry(&sh->batch_list,
+ struct stripe_head, batch_list) == head_sh;
+ if (last_stripe) {
+ flags = ASYNC_TX_ACK |
+ (prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST);
+
+ atomic_inc(&head_sh->count);
+ init_async_submit(&submit, flags, tx, ops_complete_reconstruct, head_sh,
+ to_addr_conv(sh, percpu, j));
+ } else {
+ flags = prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST;
+ init_async_submit(&submit, flags, tx, NULL, NULL,
+ to_addr_conv(sh, percpu, j));
+ }
+
+ if (unlikely(count == 1))
+ tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
+ else
+ tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
+ if (!last_stripe) {
+ j++;
+ sh = list_first_entry(&sh->batch_list, struct stripe_head,
+ batch_list);
+ goto again;
+ }
+}
+
+static void
+ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu,
+ struct dma_async_tx_descriptor *tx)
+{
+ struct async_submit_ctl submit;
+ struct page **blocks;
+ int count, i, j = 0;
+ struct stripe_head *head_sh = sh;
+ int last_stripe;
+ int synflags;
+ unsigned long txflags;
+
+ pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);
+
+ for (i = 0; i < sh->disks; i++) {
+ if (sh->pd_idx == i || sh->qd_idx == i)
+ continue;
+ if (!test_bit(R5_Discard, &sh->dev[i].flags))
+ break;
+ }
+ if (i >= sh->disks) {
+ atomic_inc(&sh->count);
+ set_bit(R5_Discard, &sh->dev[sh->pd_idx].flags);
+ set_bit(R5_Discard, &sh->dev[sh->qd_idx].flags);
+ ops_complete_reconstruct(sh);
+ return;
+ }
+
+again:
+ blocks = to_addr_page(percpu, j);
+
+ if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
+ synflags = SYNDROME_SRC_WRITTEN;
+ txflags = ASYNC_TX_ACK | ASYNC_TX_PQ_XOR_DST;
+ } else {
+ synflags = SYNDROME_SRC_ALL;
+ txflags = ASYNC_TX_ACK;
+ }
+
+ count = set_syndrome_sources(blocks, sh, synflags);
+ last_stripe = !head_sh->batch_head ||
+ list_first_entry(&sh->batch_list,
+ struct stripe_head, batch_list) == head_sh;
+
+ if (last_stripe) {
+ atomic_inc(&head_sh->count);
+ init_async_submit(&submit, txflags, tx, ops_complete_reconstruct,
+ head_sh, to_addr_conv(sh, percpu, j));
+ } else
+ init_async_submit(&submit, 0, tx, NULL, NULL,
+ to_addr_conv(sh, percpu, j));
+ tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
+ if (!last_stripe) {
+ j++;
+ sh = list_first_entry(&sh->batch_list, struct stripe_head,
+ batch_list);
+ goto again;
+ }
+}
+
+static void ops_complete_check(void *stripe_head_ref)
+{
+ struct stripe_head *sh = stripe_head_ref;
+
+ pr_debug("%s: stripe %llu\n", __func__,
+ (unsigned long long)sh->sector);
+
+ sh->check_state = check_state_check_result;
+ set_bit(STRIPE_HANDLE, &sh->state);
+ release_stripe(sh);
+}
+
+static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu)
+{
+ int disks = sh->disks;
+ int pd_idx = sh->pd_idx;
+ int qd_idx = sh->qd_idx;
+ struct page *xor_dest;
+ struct page **xor_srcs = to_addr_page(percpu, 0);
+ struct dma_async_tx_descriptor *tx;
+ struct async_submit_ctl submit;
+ int count;
+ int i;
+
+ pr_debug("%s: stripe %llu\n", __func__,
+ (unsigned long long)sh->sector);
+
+ BUG_ON(sh->batch_head);
+ count = 0;
+ xor_dest = sh->dev[pd_idx].page;
+ xor_srcs[count++] = xor_dest;
+ for (i = disks; i--; ) {
+ if (i == pd_idx || i == qd_idx)
+ continue;
+ xor_srcs[count++] = sh->dev[i].page;
+ }
+
+ init_async_submit(&submit, 0, NULL, NULL, NULL,
+ to_addr_conv(sh, percpu, 0));
+ tx = async_xor_val(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
+ &sh->ops.zero_sum_result, &submit);
+
+ atomic_inc(&sh->count);
+ init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_check, sh, NULL);
+ tx = async_trigger_callback(&submit);
+}
+
+static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu, int checkp)
+{
+ struct page **srcs = to_addr_page(percpu, 0);
+ struct async_submit_ctl submit;
+ int count;
+
+ pr_debug("%s: stripe %llu checkp: %d\n", __func__,
+ (unsigned long long)sh->sector, checkp);
+
+ BUG_ON(sh->batch_head);
+ count = set_syndrome_sources(srcs, sh, SYNDROME_SRC_ALL);
+ if (!checkp)
+ srcs[count] = NULL;
+
+ atomic_inc(&sh->count);
+ init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check,
+ sh, to_addr_conv(sh, percpu, 0));
+ async_syndrome_val(srcs, 0, count+2, STRIPE_SIZE,
+ &sh->ops.zero_sum_result, percpu->spare_page, &submit);
+}
+
+static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
+{
+ int overlap_clear = 0, i, disks = sh->disks;
+ struct dma_async_tx_descriptor *tx = NULL;
+ struct r5conf *conf = sh->raid_conf;
+ int level = conf->level;
+ struct raid5_percpu *percpu;
+ unsigned long cpu;
+
+ cpu = get_cpu();
+ percpu = per_cpu_ptr(conf->percpu, cpu);
+ if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) {
+ ops_run_biofill(sh);
+ overlap_clear++;
+ }
+
+ if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) {
+ if (level < 6)
+ tx = ops_run_compute5(sh, percpu);
+ else {
+ if (sh->ops.target2 < 0 || sh->ops.target < 0)
+ tx = ops_run_compute6_1(sh, percpu);
+ else
+ tx = ops_run_compute6_2(sh, percpu);
+ }
+ /* terminate the chain if reconstruct is not set to be run */
+ if (tx && !test_bit(STRIPE_OP_RECONSTRUCT, &ops_request))
+ async_tx_ack(tx);
+ }
+
+ if (test_bit(STRIPE_OP_PREXOR, &ops_request)) {
+ if (level < 6)
+ tx = ops_run_prexor5(sh, percpu, tx);
+ else
+ tx = ops_run_prexor6(sh, percpu, tx);
+ }
+
+ if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) {
+ tx = ops_run_biodrain(sh, tx);
+ overlap_clear++;
+ }
+
+ if (test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) {
+ if (level < 6)
+ ops_run_reconstruct5(sh, percpu, tx);
+ else
+ ops_run_reconstruct6(sh, percpu, tx);
+ }
+
+ if (test_bit(STRIPE_OP_CHECK, &ops_request)) {
+ if (sh->check_state == check_state_run)
+ ops_run_check_p(sh, percpu);
+ else if (sh->check_state == check_state_run_q)
+ ops_run_check_pq(sh, percpu, 0);
+ else if (sh->check_state == check_state_run_pq)
+ ops_run_check_pq(sh, percpu, 1);
+ else
+ BUG();
+ }
+
+ if (overlap_clear && !sh->batch_head)
+ for (i = disks; i--; ) {
+ struct r5dev *dev = &sh->dev[i];
+ if (test_and_clear_bit(R5_Overlap, &dev->flags))
+ wake_up(&sh->raid_conf->wait_for_overlap);
+ }
+ put_cpu();
+}
+
+static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp)
+{
+ struct stripe_head *sh;
+
+ sh = kmem_cache_zalloc(sc, gfp);
+ if (sh) {
+ spin_lock_init(&sh->stripe_lock);
+ spin_lock_init(&sh->batch_lock);
+ INIT_LIST_HEAD(&sh->batch_list);
+ INIT_LIST_HEAD(&sh->lru);
+ atomic_set(&sh->count, 1);
+ }
+ return sh;
+}
+static int grow_one_stripe(struct r5conf *conf, gfp_t gfp)
+{
+ struct stripe_head *sh;
+
+ sh = alloc_stripe(conf->slab_cache, gfp);
+ if (!sh)
+ return 0;
+
+ sh->raid_conf = conf;
+
+ if (grow_buffers(sh, gfp)) {
+ shrink_buffers(sh);
+ kmem_cache_free(conf->slab_cache, sh);
+ return 0;
+ }
+ sh->hash_lock_index =
+ conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS;
+ /* we just created an active stripe so... */
+ atomic_inc(&conf->active_stripes);
+
+ release_stripe(sh);
+ conf->max_nr_stripes++;
+ return 1;
+}
+
+static int grow_stripes(struct r5conf *conf, int num)
+{
+ struct kmem_cache *sc;
+ int devs = max(conf->raid_disks, conf->previous_raid_disks);
+
+ if (conf->mddev->gendisk)
+ sprintf(conf->cache_name[0],
+ "raid%d-%s", conf->level, mdname(conf->mddev));
+ else
+ sprintf(conf->cache_name[0],
+ "raid%d-%p", conf->level, conf->mddev);
+ sprintf(conf->cache_name[1], "%s-alt", conf->cache_name[0]);
+
+ conf->active_name = 0;
+ sc = kmem_cache_create(conf->cache_name[conf->active_name],
+ sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev),
+ 0, 0, NULL);
+ if (!sc)
+ return 1;
+ conf->slab_cache = sc;
+ conf->pool_size = devs;
+ while (num--)
+ if (!grow_one_stripe(conf, GFP_KERNEL))
+ return 1;
+
+ return 0;
+}
+
+/**
+ * scribble_len - return the required size of the scribble region
+ * @num - total number of disks in the array
+ *
+ * The size must be enough to contain:
+ * 1/ a struct page pointer for each device in the array +2
+ * 2/ room to convert each entry in (1) to its corresponding dma
+ * (dma_map_page()) or page (page_address()) address.
+ *
+ * Note: the +2 is for the destination buffers of the ddf/raid6 case where we
+ * calculate over all devices (not just the data blocks), using zeros in place
+ * of the P and Q blocks.
+ */
+static struct flex_array *scribble_alloc(int num, int cnt, gfp_t flags)
+{
+ struct flex_array *ret;
+ size_t len;
+
+ len = sizeof(struct page *) * (num+2) + sizeof(addr_conv_t) * (num+2);
+ ret = flex_array_alloc(len, cnt, flags);
+ if (!ret)
+ return NULL;
+ /* always prealloc all elements, so no locking is required */
+ if (flex_array_prealloc(ret, 0, cnt, flags)) {
+ flex_array_free(ret);
+ return NULL;
+ }
+ return ret;
+}
+
+static int resize_chunks(struct r5conf *conf, int new_disks, int new_sectors)
+{
+ unsigned long cpu;
+ int err = 0;
+
+ mddev_suspend(conf->mddev);
+ get_online_cpus();
+ for_each_present_cpu(cpu) {
+ struct raid5_percpu *percpu;
+ struct flex_array *scribble;
+
+ percpu = per_cpu_ptr(conf->percpu, cpu);
+ scribble = scribble_alloc(new_disks,
+ new_sectors / STRIPE_SECTORS,
+ GFP_NOIO);
+
+ if (scribble) {
+ flex_array_free(percpu->scribble);
+ percpu->scribble = scribble;
+ } else {
+ err = -ENOMEM;
+ break;
+ }
+ }
+ put_online_cpus();
+ mddev_resume(conf->mddev);
+ return err;
+}
+
+static int resize_stripes(struct r5conf *conf, int newsize)
+{
+ /* Make all the stripes able to hold 'newsize' devices.
+ * New slots in each stripe get 'page' set to a new page.
+ *
+ * This happens in stages:
+ * 1/ create a new kmem_cache and allocate the required number of
+ * stripe_heads.
+ * 2/ gather all the old stripe_heads and transfer the pages across
+ * to the new stripe_heads. This will have the side effect of
+ * freezing the array as once all stripe_heads have been collected,
+ * no IO will be possible. Old stripe heads are freed once their
+ * pages have been transferred over, and the old kmem_cache is
+ * freed when all stripes are done.
+ * 3/ reallocate conf->disks to be suitable bigger. If this fails,
+ * we simple return a failre status - no need to clean anything up.
+ * 4/ allocate new pages for the new slots in the new stripe_heads.
+ * If this fails, we don't bother trying the shrink the
+ * stripe_heads down again, we just leave them as they are.
+ * As each stripe_head is processed the new one is released into
+ * active service.
+ *
+ * Once step2 is started, we cannot afford to wait for a write,
+ * so we use GFP_NOIO allocations.
+ */
+ struct stripe_head *osh, *nsh;
+ LIST_HEAD(newstripes);
+ struct disk_info *ndisks;
+ int err;
+ struct kmem_cache *sc;
+ int i;
+ int hash, cnt;
+
+ if (newsize <= conf->pool_size)
+ return 0; /* never bother to shrink */
+
+ err = md_allow_write(conf->mddev);
+ if (err)
+ return err;
+
+ /* Step 1 */
+ sc = kmem_cache_create(conf->cache_name[1-conf->active_name],
+ sizeof(struct stripe_head)+(newsize-1)*sizeof(struct r5dev),
+ 0, 0, NULL);
+ if (!sc)
+ return -ENOMEM;
+
+ for (i = conf->max_nr_stripes; i; i--) {
+ nsh = alloc_stripe(sc, GFP_KERNEL);
+ if (!nsh)
+ break;
+
+ nsh->raid_conf = conf;
+ list_add(&nsh->lru, &newstripes);
+ }
+ if (i) {
+ /* didn't get enough, give up */
+ while (!list_empty(&newstripes)) {
+ nsh = list_entry(newstripes.next, struct stripe_head, lru);
+ list_del(&nsh->lru);
+ kmem_cache_free(sc, nsh);
+ }
+ kmem_cache_destroy(sc);
+ return -ENOMEM;
+ }
+ /* Step 2 - Must use GFP_NOIO now.
+ * OK, we have enough stripes, start collecting inactive
+ * stripes and copying them over
+ */
+ hash = 0;
+ cnt = 0;
+ list_for_each_entry(nsh, &newstripes, lru) {
+ lock_device_hash_lock(conf, hash);
+ wait_event_cmd(conf->wait_for_stripe,
+ !list_empty(conf->inactive_list + hash),
+ unlock_device_hash_lock(conf, hash),
+ lock_device_hash_lock(conf, hash));
+ osh = get_free_stripe(conf, hash);
+ unlock_device_hash_lock(conf, hash);
+
+ for(i=0; i<conf->pool_size; i++) {
+ nsh->dev[i].page = osh->dev[i].page;
+ nsh->dev[i].orig_page = osh->dev[i].page;
+ }
+ nsh->hash_lock_index = hash;
+ kmem_cache_free(conf->slab_cache, osh);
+ cnt++;
+ if (cnt >= conf->max_nr_stripes / NR_STRIPE_HASH_LOCKS +
+ !!((conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS) > hash)) {
+ hash++;
+ cnt = 0;
+ }
+ }
+ kmem_cache_destroy(conf->slab_cache);
+
+ /* Step 3.
+ * At this point, we are holding all the stripes so the array
+ * is completely stalled, so now is a good time to resize
+ * conf->disks and the scribble region
+ */
+ ndisks = kzalloc(newsize * sizeof(struct disk_info), GFP_NOIO);
+ if (ndisks) {
+ for (i=0; i<conf->raid_disks; i++)
+ ndisks[i] = conf->disks[i];
+ kfree(conf->disks);
+ conf->disks = ndisks;
+ } else
+ err = -ENOMEM;
+
+ /* Step 4, return new stripes to service */
+ while(!list_empty(&newstripes)) {
+ nsh = list_entry(newstripes.next, struct stripe_head, lru);
+ list_del_init(&nsh->lru);
+
+ for (i=conf->raid_disks; i < newsize; i++)
+ if (nsh->dev[i].page == NULL) {
+ struct page *p = alloc_page(GFP_NOIO);
+ nsh->dev[i].page = p;
+ nsh->dev[i].orig_page = p;
+ if (!p)
+ err = -ENOMEM;
+ }
+ release_stripe(nsh);
+ }
+ /* critical section pass, GFP_NOIO no longer needed */
+
+ conf->slab_cache = sc;
+ conf->active_name = 1-conf->active_name;
+ if (!err)
+ conf->pool_size = newsize;
+ return err;
+}
+
+static int drop_one_stripe(struct r5conf *conf)
+{
+ struct stripe_head *sh;
+ int hash = (conf->max_nr_stripes - 1) % NR_STRIPE_HASH_LOCKS;
+
+ spin_lock_irq(conf->hash_locks + hash);
+ sh = get_free_stripe(conf, hash);
+ spin_unlock_irq(conf->hash_locks + hash);
+ if (!sh)
+ return 0;
+ BUG_ON(atomic_read(&sh->count));
+ shrink_buffers(sh);
+ kmem_cache_free(conf->slab_cache, sh);
+ atomic_dec(&conf->active_stripes);
+ conf->max_nr_stripes--;
+ return 1;
+}
+
+static void shrink_stripes(struct r5conf *conf)
+{
+ while (conf->max_nr_stripes &&
+ drop_one_stripe(conf))
+ ;
+
+ if (conf->slab_cache)
+ kmem_cache_destroy(conf->slab_cache);
+ conf->slab_cache = NULL;
+}
+
+static void raid5_end_read_request(struct bio * bi, int error)
+{
+ struct stripe_head *sh = bi->bi_private;
+ struct r5conf *conf = sh->raid_conf;
+ int disks = sh->disks, i;
+ int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
+ char b[BDEVNAME_SIZE];
+ struct md_rdev *rdev = NULL;
+ sector_t s;
+
+ for (i=0 ; i<disks; i++)
+ if (bi == &sh->dev[i].req)
+ break;
+
+ pr_debug("end_read_request %llu/%d, count: %d, uptodate %d.\n",
+ (unsigned long long)sh->sector, i, atomic_read(&sh->count),
+ uptodate);
+ if (i == disks) {
+ BUG();
+ return;
+ }
+ if (test_bit(R5_ReadRepl, &sh->dev[i].flags))
+ /* If replacement finished while this request was outstanding,
+ * 'replacement' might be NULL already.
+ * In that case it moved down to 'rdev'.
+ * rdev is not removed until all requests are finished.
+ */
+ rdev = conf->disks[i].replacement;
+ if (!rdev)
+ rdev = conf->disks[i].rdev;
+
+ if (use_new_offset(conf, sh))
+ s = sh->sector + rdev->new_data_offset;
+ else
+ s = sh->sector + rdev->data_offset;
+ if (uptodate) {
+ set_bit(R5_UPTODATE, &sh->dev[i].flags);
+ if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
+ /* Note that this cannot happen on a
+ * replacement device. We just fail those on
+ * any error
+ */
+ printk_ratelimited(
+ KERN_INFO
+ "md/raid:%s: read error corrected"
+ " (%lu sectors at %llu on %s)\n",
+ mdname(conf->mddev), STRIPE_SECTORS,
+ (unsigned long long)s,
+ bdevname(rdev->bdev, b));
+ atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
+ clear_bit(R5_ReadError, &sh->dev[i].flags);
+ clear_bit(R5_ReWrite, &sh->dev[i].flags);
+ } else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
+ clear_bit(R5_ReadNoMerge, &sh->dev[i].flags);
+
+ if (atomic_read(&rdev->read_errors))
+ atomic_set(&rdev->read_errors, 0);
+ } else {
+ const char *bdn = bdevname(rdev->bdev, b);
+ int retry = 0;
+ int set_bad = 0;
+
+ clear_bit(R5_UPTODATE, &sh->dev[i].flags);
+ atomic_inc(&rdev->read_errors);
+ if (test_bit(R5_ReadRepl, &sh->dev[i].flags))
+ printk_ratelimited(
+ KERN_WARNING
+ "md/raid:%s: read error on replacement device "
+ "(sector %llu on %s).\n",
+ mdname(conf->mddev),
+ (unsigned long long)s,
+ bdn);
+ else if (conf->mddev->degraded >= conf->max_degraded) {
+ set_bad = 1;
+ printk_ratelimited(
+ KERN_WARNING
+ "md/raid:%s: read error not correctable "
+ "(sector %llu on %s).\n",
+ mdname(conf->mddev),
+ (unsigned long long)s,
+ bdn);
+ } else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) {
+ /* Oh, no!!! */
+ set_bad = 1;
+ printk_ratelimited(
+ KERN_WARNING
+ "md/raid:%s: read error NOT corrected!! "
+ "(sector %llu on %s).\n",
+ mdname(conf->mddev),
+ (unsigned long long)s,
+ bdn);
+ } else if (atomic_read(&rdev->read_errors)
+ > conf->max_nr_stripes)
+ printk(KERN_WARNING
+ "md/raid:%s: Too many read errors, failing device %s.\n",
+ mdname(conf->mddev), bdn);
+ else
+ retry = 1;
+ if (set_bad && test_bit(In_sync, &rdev->flags)
+ && !test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
+ retry = 1;
+ if (retry)
+ if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) {
+ set_bit(R5_ReadError, &sh->dev[i].flags);
+ clear_bit(R5_ReadNoMerge, &sh->dev[i].flags);
+ } else
+ set_bit(R5_ReadNoMerge, &sh->dev[i].flags);
+ else {
+ clear_bit(R5_ReadError, &sh->dev[i].flags);
+ clear_bit(R5_ReWrite, &sh->dev[i].flags);
+ if (!(set_bad
+ && test_bit(In_sync, &rdev->flags)
+ && rdev_set_badblocks(
+ rdev, sh->sector, STRIPE_SECTORS, 0)))
+ md_error(conf->mddev, rdev);
+ }
+ }
+ rdev_dec_pending(rdev, conf->mddev);
+ clear_bit(R5_LOCKED, &sh->dev[i].flags);
+ set_bit(STRIPE_HANDLE, &sh->state);
+ release_stripe(sh);
+}
+
+static void raid5_end_write_request(struct bio *bi, int error)
+{
+ struct stripe_head *sh = bi->bi_private;
+ struct r5conf *conf = sh->raid_conf;
+ int disks = sh->disks, i;
+ struct md_rdev *uninitialized_var(rdev);
+ int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
+ sector_t first_bad;
+ int bad_sectors;
+ int replacement = 0;
+
+ for (i = 0 ; i < disks; i++) {
+ if (bi == &sh->dev[i].req) {
+ rdev = conf->disks[i].rdev;
+ break;
+ }
+ if (bi == &sh->dev[i].rreq) {
+ rdev = conf->disks[i].replacement;
+ if (rdev)
+ replacement = 1;
+ else
+ /* rdev was removed and 'replacement'
+ * replaced it. rdev is not removed
+ * until all requests are finished.
+ */
+ rdev = conf->disks[i].rdev;
+ break;
+ }
+ }
+ pr_debug("end_write_request %llu/%d, count %d, uptodate: %d.\n",
+ (unsigned long long)sh->sector, i, atomic_read(&sh->count),
+ uptodate);
+ if (i == disks) {
+ BUG();
+ return;
+ }
+
+ if (replacement) {
+ if (!uptodate)
+ md_error(conf->mddev, rdev);
+ else if (is_badblock(rdev, sh->sector,
+ STRIPE_SECTORS,
+ &first_bad, &bad_sectors))
+ set_bit(R5_MadeGoodRepl, &sh->dev[i].flags);
+ } else {
+ if (!uptodate) {
+ set_bit(STRIPE_DEGRADED, &sh->state);
+ set_bit(WriteErrorSeen, &rdev->flags);
+ set_bit(R5_WriteError, &sh->dev[i].flags);
+ if (!test_and_set_bit(WantReplacement, &rdev->flags))
+ set_bit(MD_RECOVERY_NEEDED,
+ &rdev->mddev->recovery);
+ } else if (is_badblock(rdev, sh->sector,
+ STRIPE_SECTORS,
+ &first_bad, &bad_sectors)) {
+ set_bit(R5_MadeGood, &sh->dev[i].flags);
+ if (test_bit(R5_ReadError, &sh->dev[i].flags))
+ /* That was a successful write so make
+ * sure it looks like we already did
+ * a re-write.
+ */
+ set_bit(R5_ReWrite, &sh->dev[i].flags);
+ }
+ }
+ rdev_dec_pending(rdev, conf->mddev);
+
+ if (sh->batch_head && !uptodate && !replacement)
+ set_bit(STRIPE_BATCH_ERR, &sh->batch_head->state);
+
+ if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags))
+ clear_bit(R5_LOCKED, &sh->dev[i].flags);
+ set_bit(STRIPE_HANDLE, &sh->state);
+ release_stripe(sh);
+
+ if (sh->batch_head && sh != sh->batch_head)
+ release_stripe(sh->batch_head);
+}
+
+static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous);
+
+static void raid5_build_block(struct stripe_head *sh, int i, int previous)
+{
+ struct r5dev *dev = &sh->dev[i];
+
+ bio_init(&dev->req);
+ dev->req.bi_io_vec = &dev->vec;
+ dev->req.bi_max_vecs = 1;
+ dev->req.bi_private = sh;
+
+ bio_init(&dev->rreq);
+ dev->rreq.bi_io_vec = &dev->rvec;
+ dev->rreq.bi_max_vecs = 1;
+ dev->rreq.bi_private = sh;
+
+ dev->flags = 0;
+ dev->sector = compute_blocknr(sh, i, previous);
+}
+
+static void error(struct mddev *mddev, struct md_rdev *rdev)
+{
+ char b[BDEVNAME_SIZE];
+ struct r5conf *conf = mddev->private;
+ unsigned long flags;
+ pr_debug("raid456: error called\n");
+
+ spin_lock_irqsave(&conf->device_lock, flags);
+ clear_bit(In_sync, &rdev->flags);
+ mddev->degraded = calc_degraded(conf);
+ spin_unlock_irqrestore(&conf->device_lock, flags);
+ set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+
+ set_bit(Blocked, &rdev->flags);
+ set_bit(Faulty, &rdev->flags);
+ set_bit(MD_CHANGE_DEVS, &mddev->flags);
+ printk(KERN_ALERT
+ "md/raid:%s: Disk failure on %s, disabling device.\n"
+ "md/raid:%s: Operation continuing on %d devices.\n",
+ mdname(mddev),
+ bdevname(rdev->bdev, b),
+ mdname(mddev),
+ conf->raid_disks - mddev->degraded);
+}
+
+/*
+ * Input: a 'big' sector number,
+ * Output: index of the data and parity disk, and the sector # in them.
+ */
+static sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector,
+ int previous, int *dd_idx,
+ struct stripe_head *sh)
+{
+ sector_t stripe, stripe2;
+ sector_t chunk_number;
+ unsigned int chunk_offset;
+ int pd_idx, qd_idx;
+ int ddf_layout = 0;
+ sector_t new_sector;
+ int algorithm = previous ? conf->prev_algo
+ : conf->algorithm;
+ int sectors_per_chunk = previous ? conf->prev_chunk_sectors
+ : conf->chunk_sectors;
+ int raid_disks = previous ? conf->previous_raid_disks
+ : conf->raid_disks;
+ int data_disks = raid_disks - conf->max_degraded;
+
+ /* First compute the information on this sector */
+
+ /*
+ * Compute the chunk number and the sector offset inside the chunk
+ */
+ chunk_offset = sector_div(r_sector, sectors_per_chunk);
+ chunk_number = r_sector;
+
+ /*
+ * Compute the stripe number
+ */
+ stripe = chunk_number;
+ *dd_idx = sector_div(stripe, data_disks);
+ stripe2 = stripe;
+ /*
+ * Select the parity disk based on the user selected algorithm.
+ */
+ pd_idx = qd_idx = -1;
+ switch(conf->level) {
+ case 4:
+ pd_idx = data_disks;
+ break;
+ case 5:
+ switch (algorithm) {
+ case ALGORITHM_LEFT_ASYMMETRIC:
+ pd_idx = data_disks - sector_div(stripe2, raid_disks);
+ if (*dd_idx >= pd_idx)
+ (*dd_idx)++;
+ break;
+ case ALGORITHM_RIGHT_ASYMMETRIC:
+ pd_idx = sector_div(stripe2, raid_disks);
+ if (*dd_idx >= pd_idx)
+ (*dd_idx)++;
+ break;
+ case ALGORITHM_LEFT_SYMMETRIC:
+ pd_idx = data_disks - sector_div(stripe2, raid_disks);
+ *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
+ break;
+ case ALGORITHM_RIGHT_SYMMETRIC:
+ pd_idx = sector_div(stripe2, raid_disks);
+ *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
+ break;
+ case ALGORITHM_PARITY_0:
+ pd_idx = 0;
+ (*dd_idx)++;
+ break;
+ case ALGORITHM_PARITY_N:
+ pd_idx = data_disks;
+ break;
+ default:
+ BUG();
+ }
+ break;
+ case 6:
+
+ switch (algorithm) {
+ case ALGORITHM_LEFT_ASYMMETRIC:
+ pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
+ qd_idx = pd_idx + 1;
+ if (pd_idx == raid_disks-1) {
+ (*dd_idx)++; /* Q D D D P */
+ qd_idx = 0;
+ } else if (*dd_idx >= pd_idx)
+ (*dd_idx) += 2; /* D D P Q D */
+ break;
+ case ALGORITHM_RIGHT_ASYMMETRIC:
+ pd_idx = sector_div(stripe2, raid_disks);
+ qd_idx = pd_idx + 1;
+ if (pd_idx == raid_disks-1) {
+ (*dd_idx)++; /* Q D D D P */
+ qd_idx = 0;
+ } else if (*dd_idx >= pd_idx)
+ (*dd_idx) += 2; /* D D P Q D */
+ break;
+ case ALGORITHM_LEFT_SYMMETRIC:
+ pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
+ qd_idx = (pd_idx + 1) % raid_disks;
+ *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks;
+ break;
+ case ALGORITHM_RIGHT_SYMMETRIC:
+ pd_idx = sector_div(stripe2, raid_disks);
+ qd_idx = (pd_idx + 1) % raid_disks;
+ *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks;
+ break;
+
+ case ALGORITHM_PARITY_0:
+ pd_idx = 0;
+ qd_idx = 1;
+ (*dd_idx) += 2;
+ break;
+ case ALGORITHM_PARITY_N:
+ pd_idx = data_disks;
+ qd_idx = data_disks + 1;
+ break;
+
+ case ALGORITHM_ROTATING_ZERO_RESTART:
+ /* Exactly the same as RIGHT_ASYMMETRIC, but or
+ * of blocks for computing Q is different.
+ */
+ pd_idx = sector_div(stripe2, raid_disks);
+ qd_idx = pd_idx + 1;
+ if (pd_idx == raid_disks-1) {
+ (*dd_idx)++; /* Q D D D P */
+ qd_idx = 0;
+ } else if (*dd_idx >= pd_idx)
+ (*dd_idx) += 2; /* D D P Q D */
+ ddf_layout = 1;
+ break;
+
+ case ALGORITHM_ROTATING_N_RESTART:
+ /* Same a left_asymmetric, by first stripe is
+ * D D D P Q rather than
+ * Q D D D P
+ */
+ stripe2 += 1;
+ pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
+ qd_idx = pd_idx + 1;
+ if (pd_idx == raid_disks-1) {
+ (*dd_idx)++; /* Q D D D P */
+ qd_idx = 0;
+ } else if (*dd_idx >= pd_idx)
+ (*dd_idx) += 2; /* D D P Q D */
+ ddf_layout = 1;
+ break;
+
+ case ALGORITHM_ROTATING_N_CONTINUE:
+ /* Same as left_symmetric but Q is before P */
+ pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
+ qd_idx = (pd_idx + raid_disks - 1) % raid_disks;
+ *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
+ ddf_layout = 1;
+ break;
+
+ case ALGORITHM_LEFT_ASYMMETRIC_6:
+ /* RAID5 left_asymmetric, with Q on last device */
+ pd_idx = data_disks - sector_div(stripe2, raid_disks-1);
+ if (*dd_idx >= pd_idx)
+ (*dd_idx)++;
+ qd_idx = raid_disks - 1;
+ break;
+
+ case ALGORITHM_RIGHT_ASYMMETRIC_6:
+ pd_idx = sector_div(stripe2, raid_disks-1);
+ if (*dd_idx >= pd_idx)
+ (*dd_idx)++;
+ qd_idx = raid_disks - 1;
+ break;
+
+ case ALGORITHM_LEFT_SYMMETRIC_6:
+ pd_idx = data_disks - sector_div(stripe2, raid_disks-1);
+ *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1);
+ qd_idx = raid_disks - 1;
+ break;
+
+ case ALGORITHM_RIGHT_SYMMETRIC_6:
+ pd_idx = sector_div(stripe2, raid_disks-1);
+ *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1);
+ qd_idx = raid_disks - 1;
+ break;
+
+ case ALGORITHM_PARITY_0_6:
+ pd_idx = 0;
+ (*dd_idx)++;
+ qd_idx = raid_disks - 1;
+ break;
+
+ default:
+ BUG();
+ }
+ break;
+ }
+
+ if (sh) {
+ sh->pd_idx = pd_idx;
+ sh->qd_idx = qd_idx;
+ sh->ddf_layout = ddf_layout;
+ }
+ /*
+ * Finally, compute the new sector number
+ */
+ new_sector = (sector_t)stripe * sectors_per_chunk + chunk_offset;
+ return new_sector;
+}
+
+static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous)
+{
+ struct r5conf *conf = sh->raid_conf;
+ int raid_disks = sh->disks;
+ int data_disks = raid_disks - conf->max_degraded;
+ sector_t new_sector = sh->sector, check;
+ int sectors_per_chunk = previous ? conf->prev_chunk_sectors
+ : conf->chunk_sectors;
+ int algorithm = previous ? conf->prev_algo
+ : conf->algorithm;
+ sector_t stripe;
+ int chunk_offset;
+ sector_t chunk_number;
+ int dummy1, dd_idx = i;
+ sector_t r_sector;
+ struct stripe_head sh2;
+
+ chunk_offset = sector_div(new_sector, sectors_per_chunk);
+ stripe = new_sector;
+
+ if (i == sh->pd_idx)
+ return 0;
+ switch(conf->level) {
+ case 4: break;
+ case 5:
+ switch (algorithm) {
+ case ALGORITHM_LEFT_ASYMMETRIC:
+ case ALGORITHM_RIGHT_ASYMMETRIC:
+ if (i > sh->pd_idx)
+ i--;
+ break;
+ case ALGORITHM_LEFT_SYMMETRIC:
+ case ALGORITHM_RIGHT_SYMMETRIC:
+ if (i < sh->pd_idx)
+ i += raid_disks;
+ i -= (sh->pd_idx + 1);
+ break;
+ case ALGORITHM_PARITY_0:
+ i -= 1;
+ break;
+ case ALGORITHM_PARITY_N:
+ break;
+ default:
+ BUG();
+ }
+ break;
+ case 6:
+ if (i == sh->qd_idx)
+ return 0; /* It is the Q disk */
+ switch (algorithm) {
+ case ALGORITHM_LEFT_ASYMMETRIC:
+ case ALGORITHM_RIGHT_ASYMMETRIC:
+ case ALGORITHM_ROTATING_ZERO_RESTART:
+ case ALGORITHM_ROTATING_N_RESTART:
+ if (sh->pd_idx == raid_disks-1)
+ i--; /* Q D D D P */
+ else if (i > sh->pd_idx)
+ i -= 2; /* D D P Q D */
+ break;
+ case ALGORITHM_LEFT_SYMMETRIC:
+ case ALGORITHM_RIGHT_SYMMETRIC:
+ if (sh->pd_idx == raid_disks-1)
+ i--; /* Q D D D P */
+ else {
+ /* D D P Q D */
+ if (i < sh->pd_idx)
+ i += raid_disks;
+ i -= (sh->pd_idx + 2);
+ }
+ break;
+ case ALGORITHM_PARITY_0:
+ i -= 2;
+ break;
+ case ALGORITHM_PARITY_N:
+ break;
+ case ALGORITHM_ROTATING_N_CONTINUE:
+ /* Like left_symmetric, but P is before Q */
+ if (sh->pd_idx == 0)
+ i--; /* P D D D Q */
+ else {
+ /* D D Q P D */
+ if (i < sh->pd_idx)
+ i += raid_disks;
+ i -= (sh->pd_idx + 1);
+ }
+ break;
+ case ALGORITHM_LEFT_ASYMMETRIC_6:
+ case ALGORITHM_RIGHT_ASYMMETRIC_6:
+ if (i > sh->pd_idx)
+ i--;
+ break;
+ case ALGORITHM_LEFT_SYMMETRIC_6:
+ case ALGORITHM_RIGHT_SYMMETRIC_6:
+ if (i < sh->pd_idx)
+ i += data_disks + 1;
+ i -= (sh->pd_idx + 1);
+ break;
+ case ALGORITHM_PARITY_0_6:
+ i -= 1;
+ break;
+ default:
+ BUG();
+ }
+ break;
+ }
+
+ chunk_number = stripe * data_disks + i;
+ r_sector = chunk_number * sectors_per_chunk + chunk_offset;
+
+ check = raid5_compute_sector(conf, r_sector,
+ previous, &dummy1, &sh2);
+ if (check != sh->sector || dummy1 != dd_idx || sh2.pd_idx != sh->pd_idx
+ || sh2.qd_idx != sh->qd_idx) {
+ printk(KERN_ERR "md/raid:%s: compute_blocknr: map not correct\n",
+ mdname(conf->mddev));
+ return 0;
+ }
+ return r_sector;
+}
+
+static void
+schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
+ int rcw, int expand)
+{
+ int i, pd_idx = sh->pd_idx, qd_idx = sh->qd_idx, disks = sh->disks;
+ struct r5conf *conf = sh->raid_conf;
+ int level = conf->level;
+
+ if (rcw) {
+
+ for (i = disks; i--; ) {
+ struct r5dev *dev = &sh->dev[i];
+
+ if (dev->towrite) {
+ set_bit(R5_LOCKED, &dev->flags);
+ set_bit(R5_Wantdrain, &dev->flags);
+ if (!expand)
+ clear_bit(R5_UPTODATE, &dev->flags);
+ s->locked++;
+ }
+ }
+ /* if we are not expanding this is a proper write request, and
+ * there will be bios with new data to be drained into the
+ * stripe cache
+ */
+ if (!expand) {
+ if (!s->locked)
+ /* False alarm, nothing to do */
+ return;
+ sh->reconstruct_state = reconstruct_state_drain_run;
+ set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
+ } else
+ sh->reconstruct_state = reconstruct_state_run;
+
+ set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request);
+
+ if (s->locked + conf->max_degraded == disks)
+ if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state))
+ atomic_inc(&conf->pending_full_writes);
+ } else {
+ BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) ||
+ test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags)));
+ BUG_ON(level == 6 &&
+ (!(test_bit(R5_UPTODATE, &sh->dev[qd_idx].flags) ||
+ test_bit(R5_Wantcompute, &sh->dev[qd_idx].flags))));
+
+ for (i = disks; i--; ) {
+ struct r5dev *dev = &sh->dev[i];
+ if (i == pd_idx || i == qd_idx)
+ continue;
+
+ if (dev->towrite &&
+ (test_bit(R5_UPTODATE, &dev->flags) ||
+ test_bit(R5_Wantcompute, &dev->flags))) {
+ set_bit(R5_Wantdrain, &dev->flags);
+ set_bit(R5_LOCKED, &dev->flags);
+ clear_bit(R5_UPTODATE, &dev->flags);
+ s->locked++;
+ }
+ }
+ if (!s->locked)
+ /* False alarm - nothing to do */
+ return;
+ sh->reconstruct_state = reconstruct_state_prexor_drain_run;
+ set_bit(STRIPE_OP_PREXOR, &s->ops_request);
+ set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
+ set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request);
+ }
+
+ /* keep the parity disk(s) locked while asynchronous operations
+ * are in flight
+ */
+ set_bit(R5_LOCKED, &sh->dev[pd_idx].flags);
+ clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
+ s->locked++;
+
+ if (level == 6) {
+ int qd_idx = sh->qd_idx;
+ struct r5dev *dev = &sh->dev[qd_idx];
+
+ set_bit(R5_LOCKED, &dev->flags);
+ clear_bit(R5_UPTODATE, &dev->flags);
+ s->locked++;
+ }
+
+ pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n",
+ __func__, (unsigned long long)sh->sector,
+ s->locked, s->ops_request);
+}
+
+/*
+ * Each stripe/dev can have one or more bion attached.
+ * toread/towrite point to the first in a chain.
+ * The bi_next chain must be in order.
+ */
+static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,
+ int forwrite, int previous)
+{
+ struct bio **bip;
+ struct r5conf *conf = sh->raid_conf;
+ int firstwrite=0;
+
+ pr_debug("adding bi b#%llu to stripe s#%llu\n",
+ (unsigned long long)bi->bi_iter.bi_sector,
+ (unsigned long long)sh->sector);
+
+ /*
+ * If several bio share a stripe. The bio bi_phys_segments acts as a
+ * reference count to avoid race. The reference count should already be
+ * increased before this function is called (for example, in
+ * make_request()), so other bio sharing this stripe will not free the
+ * stripe. If a stripe is owned by one stripe, the stripe lock will
+ * protect it.
+ */
+ spin_lock_irq(&sh->stripe_lock);
+ /* Don't allow new IO added to stripes in batch list */
+ if (sh->batch_head)
+ goto overlap;
+ if (forwrite) {
+ bip = &sh->dev[dd_idx].towrite;
+ if (*bip == NULL)
+ firstwrite = 1;
+ } else
+ bip = &sh->dev[dd_idx].toread;
+ while (*bip && (*bip)->bi_iter.bi_sector < bi->bi_iter.bi_sector) {
+ if (bio_end_sector(*bip) > bi->bi_iter.bi_sector)
+ goto overlap;
+ bip = & (*bip)->bi_next;
+ }
+ if (*bip && (*bip)->bi_iter.bi_sector < bio_end_sector(bi))
+ goto overlap;
+
+ if (!forwrite || previous)
+ clear_bit(STRIPE_BATCH_READY, &sh->state);
+
+ BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next);
+ if (*bip)
+ bi->bi_next = *bip;
+ *bip = bi;
+ raid5_inc_bi_active_stripes(bi);
+
+ if (forwrite) {
+ /* check if page is covered */
+ sector_t sector = sh->dev[dd_idx].sector;
+ for (bi=sh->dev[dd_idx].towrite;
+ sector < sh->dev[dd_idx].sector + STRIPE_SECTORS &&
+ bi && bi->bi_iter.bi_sector <= sector;
+ bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) {
+ if (bio_end_sector(bi) >= sector)
+ sector = bio_end_sector(bi);
+ }
+ if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS)
+ if (!test_and_set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags))
+ sh->overwrite_disks++;
+ }
+
+ pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n",
+ (unsigned long long)(*bip)->bi_iter.bi_sector,
+ (unsigned long long)sh->sector, dd_idx);
+
+ if (conf->mddev->bitmap && firstwrite) {
+ /* Cannot hold spinlock over bitmap_startwrite,
+ * but must ensure this isn't added to a batch until
+ * we have added to the bitmap and set bm_seq.
+ * So set STRIPE_BITMAP_PENDING to prevent
+ * batching.
+ * If multiple add_stripe_bio() calls race here they
+ * much all set STRIPE_BITMAP_PENDING. So only the first one
+ * to complete "bitmap_startwrite" gets to set
+ * STRIPE_BIT_DELAY. This is important as once a stripe
+ * is added to a batch, STRIPE_BIT_DELAY cannot be changed
+ * any more.
+ */
+ set_bit(STRIPE_BITMAP_PENDING, &sh->state);
+ spin_unlock_irq(&sh->stripe_lock);
+ bitmap_startwrite(conf->mddev->bitmap, sh->sector,
+ STRIPE_SECTORS, 0);
+ spin_lock_irq(&sh->stripe_lock);
+ clear_bit(STRIPE_BITMAP_PENDING, &sh->state);
+ if (!sh->batch_head) {
+ sh->bm_seq = conf->seq_flush+1;
+ set_bit(STRIPE_BIT_DELAY, &sh->state);
+ }
+ }
+ spin_unlock_irq(&sh->stripe_lock);
+
+ if (stripe_can_batch(sh))
+ stripe_add_to_batch_list(conf, sh);
+ return 1;
+
+ overlap:
+ set_bit(R5_Overlap, &sh->dev[dd_idx].flags);
+ spin_unlock_irq(&sh->stripe_lock);
+ return 0;
+}
+
+static void end_reshape(struct r5conf *conf);
+
+static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous,
+ struct stripe_head *sh)
+{
+ int sectors_per_chunk =
+ previous ? conf->prev_chunk_sectors : conf->chunk_sectors;
+ int dd_idx;
+ int chunk_offset = sector_div(stripe, sectors_per_chunk);
+ int disks = previous ? conf->previous_raid_disks : conf->raid_disks;
+
+ raid5_compute_sector(conf,
+ stripe * (disks - conf->max_degraded)
+ *sectors_per_chunk + chunk_offset,
+ previous,
+ &dd_idx, sh);
+}
+
+static void
+handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
+ struct stripe_head_state *s, int disks,
+ struct bio **return_bi)
+{
+ int i;
+ BUG_ON(sh->batch_head);
+ for (i = disks; i--; ) {
+ struct bio *bi;
+ int bitmap_end = 0;
+
+ if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
+ struct md_rdev *rdev;
+ rcu_read_lock();
+ rdev = rcu_dereference(conf->disks[i].rdev);
+ if (rdev && test_bit(In_sync, &rdev->flags))
+ atomic_inc(&rdev->nr_pending);
+ else
+ rdev = NULL;
+ rcu_read_unlock();
+ if (rdev) {
+ if (!rdev_set_badblocks(
+ rdev,
+ sh->sector,
+ STRIPE_SECTORS, 0))
+ md_error(conf->mddev, rdev);
+ rdev_dec_pending(rdev, conf->mddev);
+ }
+ }
+ spin_lock_irq(&sh->stripe_lock);
+ /* fail all writes first */
+ bi = sh->dev[i].towrite;
+ sh->dev[i].towrite = NULL;
+ sh->overwrite_disks = 0;
+ spin_unlock_irq(&sh->stripe_lock);
+ if (bi)
+ bitmap_end = 1;
+
+ if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
+ wake_up(&conf->wait_for_overlap);
+
+ while (bi && bi->bi_iter.bi_sector <
+ sh->dev[i].sector + STRIPE_SECTORS) {
+ struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
+ clear_bit(BIO_UPTODATE, &bi->bi_flags);
+ if (!raid5_dec_bi_active_stripes(bi)) {
+ md_write_end(conf->mddev);
+ bi->bi_next = *return_bi;
+ *return_bi = bi;
+ }
+ bi = nextbi;
+ }
+ if (bitmap_end)
+ bitmap_endwrite(conf->mddev->bitmap, sh->sector,
+ STRIPE_SECTORS, 0, 0);
+ bitmap_end = 0;
+ /* and fail all 'written' */
+ bi = sh->dev[i].written;
+ sh->dev[i].written = NULL;
+ if (test_and_clear_bit(R5_SkipCopy, &sh->dev[i].flags)) {
+ WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));
+ sh->dev[i].page = sh->dev[i].orig_page;
+ }
+
+ if (bi) bitmap_end = 1;
+ while (bi && bi->bi_iter.bi_sector <
+ sh->dev[i].sector + STRIPE_SECTORS) {
+ struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
+ clear_bit(BIO_UPTODATE, &bi->bi_flags);
+ if (!raid5_dec_bi_active_stripes(bi)) {
+ md_write_end(conf->mddev);
+ bi->bi_next = *return_bi;
+ *return_bi = bi;
+ }
+ bi = bi2;
+ }
+
+ /* fail any reads if this device is non-operational and
+ * the data has not reached the cache yet.
+ */
+ if (!test_bit(R5_Wantfill, &sh->dev[i].flags) &&
+ (!test_bit(R5_Insync, &sh->dev[i].flags) ||
+ test_bit(R5_ReadError, &sh->dev[i].flags))) {
+ spin_lock_irq(&sh->stripe_lock);
+ bi = sh->dev[i].toread;
+ sh->dev[i].toread = NULL;
+ spin_unlock_irq(&sh->stripe_lock);
+ if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
+ wake_up(&conf->wait_for_overlap);
+ while (bi && bi->bi_iter.bi_sector <
+ sh->dev[i].sector + STRIPE_SECTORS) {
+ struct bio *nextbi =
+ r5_next_bio(bi, sh->dev[i].sector);
+ clear_bit(BIO_UPTODATE, &bi->bi_flags);
+ if (!raid5_dec_bi_active_stripes(bi)) {
+ bi->bi_next = *return_bi;
+ *return_bi = bi;
+ }
+ bi = nextbi;
+ }
+ }
+ if (bitmap_end)
+ bitmap_endwrite(conf->mddev->bitmap, sh->sector,
+ STRIPE_SECTORS, 0, 0);
+ /* If we were in the middle of a write the parity block might
+ * still be locked - so just clear all R5_LOCKED flags
+ */
+ clear_bit(R5_LOCKED, &sh->dev[i].flags);
+ }
+
+ if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
+ if (atomic_dec_and_test(&conf->pending_full_writes))
+ md_wakeup_thread(conf->mddev->thread);
+}
+
+static void
+handle_failed_sync(struct r5conf *conf, struct stripe_head *sh,
+ struct stripe_head_state *s)
+{
+ int abort = 0;
+ int i;
+
+ BUG_ON(sh->batch_head);
+ clear_bit(STRIPE_SYNCING, &sh->state);
+ if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags))
+ wake_up(&conf->wait_for_overlap);
+ s->syncing = 0;
+ s->replacing = 0;
+ /* There is nothing more to do for sync/check/repair.
+ * Don't even need to abort as that is handled elsewhere
+ * if needed, and not always wanted e.g. if there is a known
+ * bad block here.
+ * For recover/replace we need to record a bad block on all
+ * non-sync devices, or abort the recovery
+ */
+ if (test_bit(MD_RECOVERY_RECOVER, &conf->mddev->recovery)) {
+ /* During recovery devices cannot be removed, so
+ * locking and refcounting of rdevs is not needed
+ */
+ for (i = 0; i < conf->raid_disks; i++) {
+ struct md_rdev *rdev = conf->disks[i].rdev;
+ if (rdev
+ && !test_bit(Faulty, &rdev->flags)
+ && !test_bit(In_sync, &rdev->flags)
+ && !rdev_set_badblocks(rdev, sh->sector,
+ STRIPE_SECTORS, 0))
+ abort = 1;
+ rdev = conf->disks[i].replacement;
+ if (rdev
+ && !test_bit(Faulty, &rdev->flags)
+ && !test_bit(In_sync, &rdev->flags)
+ && !rdev_set_badblocks(rdev, sh->sector,
+ STRIPE_SECTORS, 0))
+ abort = 1;
+ }
+ if (abort)
+ conf->recovery_disabled =
+ conf->mddev->recovery_disabled;
+ }
+ md_done_sync(conf->mddev, STRIPE_SECTORS, !abort);
+}
+
+static int want_replace(struct stripe_head *sh, int disk_idx)
+{
+ struct md_rdev *rdev;
+ int rv = 0;
+ /* Doing recovery so rcu locking not required */
+ rdev = sh->raid_conf->disks[disk_idx].replacement;
+ if (rdev
+ && !test_bit(Faulty, &rdev->flags)
+ && !test_bit(In_sync, &rdev->flags)
+ && (rdev->recovery_offset <= sh->sector
+ || rdev->mddev->recovery_cp <= sh->sector))
+ rv = 1;
+
+ return rv;
+}
+
+/* fetch_block - checks the given member device to see if its data needs
+ * to be read or computed to satisfy a request.
+ *
+ * Returns 1 when no more member devices need to be checked, otherwise returns
+ * 0 to tell the loop in handle_stripe_fill to continue
+ */
+
+static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s,
+ int disk_idx, int disks)
+{
+ struct r5dev *dev = &sh->dev[disk_idx];
+ struct r5dev *fdev[2] = { &sh->dev[s->failed_num[0]],
+ &sh->dev[s->failed_num[1]] };
+ int i;
+
+
+ if (test_bit(R5_LOCKED, &dev->flags) ||
+ test_bit(R5_UPTODATE, &dev->flags))
+ /* No point reading this as we already have it or have
+ * decided to get it.
+ */
+ return 0;
+
+ if (dev->toread ||
+ (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)))
+ /* We need this block to directly satisfy a request */
+ return 1;
+
+ if (s->syncing || s->expanding ||
+ (s->replacing && want_replace(sh, disk_idx)))
+ /* When syncing, or expanding we read everything.
+ * When replacing, we need the replaced block.
+ */
+ return 1;
+
+ if ((s->failed >= 1 && fdev[0]->toread) ||
+ (s->failed >= 2 && fdev[1]->toread))
+ /* If we want to read from a failed device, then
+ * we need to actually read every other device.
+ */
+ return 1;
+
+ /* Sometimes neither read-modify-write nor reconstruct-write
+ * cycles can work. In those cases we read every block we
+ * can. Then the parity-update is certain to have enough to
+ * work with.
+ * This can only be a problem when we need to write something,
+ * and some device has failed. If either of those tests
+ * fail we need look no further.
+ */
+ if (!s->failed || !s->to_write)
+ return 0;
+
+ if (test_bit(R5_Insync, &dev->flags) &&
+ !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
+ /* Pre-reads at not permitted until after short delay
+ * to gather multiple requests. However if this
+ * device is no Insync, the block could only be be computed
+ * and there is no need to delay that.
+ */
+ return 0;
+
+ for (i = 0; i < s->failed; i++) {
+ if (fdev[i]->towrite &&
+ !test_bit(R5_UPTODATE, &fdev[i]->flags) &&
+ !test_bit(R5_OVERWRITE, &fdev[i]->flags))
+ /* If we have a partial write to a failed
+ * device, then we will need to reconstruct
+ * the content of that device, so all other
+ * devices must be read.
+ */
+ return 1;
+ }
+
+ /* If we are forced to do a reconstruct-write, either because
+ * the current RAID6 implementation only supports that, or
+ * or because parity cannot be trusted and we are currently
+ * recovering it, there is extra need to be careful.
+ * If one of the devices that we would need to read, because
+ * it is not being overwritten (and maybe not written at all)
+ * is missing/faulty, then we need to read everything we can.
+ */
+ if (sh->raid_conf->level != 6 &&
+ sh->sector < sh->raid_conf->mddev->recovery_cp)
+ /* reconstruct-write isn't being forced */
+ return 0;
+ for (i = 0; i < s->failed; i++) {
+ if (s->failed_num[i] != sh->pd_idx &&
+ s->failed_num[i] != sh->qd_idx &&
+ !test_bit(R5_UPTODATE, &fdev[i]->flags) &&
+ !test_bit(R5_OVERWRITE, &fdev[i]->flags))
+ return 1;
+ }
+
+ return 0;
+}
+
+static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s,
+ int disk_idx, int disks)
+{
+ struct r5dev *dev = &sh->dev[disk_idx];
+
+ /* is the data in this block needed, and can we get it? */
+ if (need_this_block(sh, s, disk_idx, disks)) {
+ /* we would like to get this block, possibly by computing it,
+ * otherwise read it if the backing disk is insync
+ */
+ BUG_ON(test_bit(R5_Wantcompute, &dev->flags));
+ BUG_ON(test_bit(R5_Wantread, &dev->flags));
+ BUG_ON(sh->batch_head);
+ if ((s->uptodate == disks - 1) &&
+ (s->failed && (disk_idx == s->failed_num[0] ||
+ disk_idx == s->failed_num[1]))) {
+ /* have disk failed, and we're requested to fetch it;
+ * do compute it
+ */
+ pr_debug("Computing stripe %llu block %d\n",
+ (unsigned long long)sh->sector, disk_idx);
+ set_bit(STRIPE_COMPUTE_RUN, &sh->state);
+ set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
+ set_bit(R5_Wantcompute, &dev->flags);
+ sh->ops.target = disk_idx;
+ sh->ops.target2 = -1; /* no 2nd target */
+ s->req_compute = 1;
+ /* Careful: from this point on 'uptodate' is in the eye
+ * of raid_run_ops which services 'compute' operations
+ * before writes. R5_Wantcompute flags a block that will
+ * be R5_UPTODATE by the time it is needed for a
+ * subsequent operation.
+ */
+ s->uptodate++;
+ return 1;
+ } else if (s->uptodate == disks-2 && s->failed >= 2) {
+ /* Computing 2-failure is *very* expensive; only
+ * do it if failed >= 2
+ */
+ int other;
+ for (other = disks; other--; ) {
+ if (other == disk_idx)
+ continue;
+ if (!test_bit(R5_UPTODATE,
+ &sh->dev[other].flags))
+ break;
+ }
+ BUG_ON(other < 0);
+ pr_debug("Computing stripe %llu blocks %d,%d\n",
+ (unsigned long long)sh->sector,
+ disk_idx, other);
+ set_bit(STRIPE_COMPUTE_RUN, &sh->state);
+ set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
+ set_bit(R5_Wantcompute, &sh->dev[disk_idx].flags);
+ set_bit(R5_Wantcompute, &sh->dev[other].flags);
+ sh->ops.target = disk_idx;
+ sh->ops.target2 = other;
+ s->uptodate += 2;
+ s->req_compute = 1;
+ return 1;
+ } else if (test_bit(R5_Insync, &dev->flags)) {
+ set_bit(R5_LOCKED, &dev->flags);
+ set_bit(R5_Wantread, &dev->flags);
+ s->locked++;
+ pr_debug("Reading block %d (sync=%d)\n",
+ disk_idx, s->syncing);
+ }
+ }
+
+ return 0;
+}
+
+/**
+ * handle_stripe_fill - read or compute data to satisfy pending requests.
+ */
+static void handle_stripe_fill(struct stripe_head *sh,
+ struct stripe_head_state *s,
+ int disks)
+{
+ int i;
+
+ /* look for blocks to read/compute, skip this if a compute
+ * is already in flight, or if the stripe contents are in the
+ * midst of changing due to a write
+ */
+ if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state &&
+ !sh->reconstruct_state)
+ for (i = disks; i--; )
+ if (fetch_block(sh, s, i, disks))
+ break;
+ set_bit(STRIPE_HANDLE, &sh->state);
+}
+
+static void break_stripe_batch_list(struct stripe_head *head_sh,
+ unsigned long handle_flags);
+/* handle_stripe_clean_event
+ * any written block on an uptodate or failed drive can be returned.
+ * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but
+ * never LOCKED, so we don't need to test 'failed' directly.
+ */
+static void handle_stripe_clean_event(struct r5conf *conf,
+ struct stripe_head *sh, int disks, struct bio **return_bi)
+{
+ int i;
+ struct r5dev *dev;
+ int discard_pending = 0;
+ struct stripe_head *head_sh = sh;
+ bool do_endio = false;
+
+ for (i = disks; i--; )
+ if (sh->dev[i].written) {
+ dev = &sh->dev[i];
+ if (!test_bit(R5_LOCKED, &dev->flags) &&
+ (test_bit(R5_UPTODATE, &dev->flags) ||
+ test_bit(R5_Discard, &dev->flags) ||
+ test_bit(R5_SkipCopy, &dev->flags))) {
+ /* We can return any write requests */
+ struct bio *wbi, *wbi2;
+ pr_debug("Return write for disc %d\n", i);
+ if (test_and_clear_bit(R5_Discard, &dev->flags))
+ clear_bit(R5_UPTODATE, &dev->flags);
+ if (test_and_clear_bit(R5_SkipCopy, &dev->flags)) {
+ WARN_ON(test_bit(R5_UPTODATE, &dev->flags));
+ }
+ do_endio = true;
+
+returnbi:
+ dev->page = dev->orig_page;
+ wbi = dev->written;
+ dev->written = NULL;
+ while (wbi && wbi->bi_iter.bi_sector <
+ dev->sector + STRIPE_SECTORS) {
+ wbi2 = r5_next_bio(wbi, dev->sector);
+ if (!raid5_dec_bi_active_stripes(wbi)) {
+ md_write_end(conf->mddev);
+ wbi->bi_next = *return_bi;
+ *return_bi = wbi;
+ }
+ wbi = wbi2;
+ }
+ bitmap_endwrite(conf->mddev->bitmap, sh->sector,
+ STRIPE_SECTORS,
+ !test_bit(STRIPE_DEGRADED, &sh->state),
+ 0);
+ if (head_sh->batch_head) {
+ sh = list_first_entry(&sh->batch_list,
+ struct stripe_head,
+ batch_list);
+ if (sh != head_sh) {
+ dev = &sh->dev[i];
+ goto returnbi;
+ }
+ }
+ sh = head_sh;
+ dev = &sh->dev[i];
+ } else if (test_bit(R5_Discard, &dev->flags))
+ discard_pending = 1;
+ WARN_ON(test_bit(R5_SkipCopy, &dev->flags));
+ WARN_ON(dev->page != dev->orig_page);
+ }
+ if (!discard_pending &&
+ test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)) {
+ clear_bit(R5_Discard, &sh->dev[sh->pd_idx].flags);
+ clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags);
+ if (sh->qd_idx >= 0) {
+ clear_bit(R5_Discard, &sh->dev[sh->qd_idx].flags);
+ clear_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags);
+ }
+ /* now that discard is done we can proceed with any sync */
+ clear_bit(STRIPE_DISCARD, &sh->state);
+ /*
+ * SCSI discard will change some bio fields and the stripe has
+ * no updated data, so remove it from hash list and the stripe
+ * will be reinitialized
+ */
+ spin_lock_irq(&conf->device_lock);
+unhash:
+ remove_hash(sh);
+ if (head_sh->batch_head) {
+ sh = list_first_entry(&sh->batch_list,
+ struct stripe_head, batch_list);
+ if (sh != head_sh)
+ goto unhash;
+ }
+ spin_unlock_irq(&conf->device_lock);
+ sh = head_sh;
+
+ if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state))
+ set_bit(STRIPE_HANDLE, &sh->state);
+
+ }
+
+ if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
+ if (atomic_dec_and_test(&conf->pending_full_writes))
+ md_wakeup_thread(conf->mddev->thread);
+
+ if (head_sh->batch_head && do_endio)
+ break_stripe_batch_list(head_sh, STRIPE_EXPAND_SYNC_FLAGS);
+}
+
+static void handle_stripe_dirtying(struct r5conf *conf,
+ struct stripe_head *sh,
+ struct stripe_head_state *s,
+ int disks)
+{
+ int rmw = 0, rcw = 0, i;
+ sector_t recovery_cp = conf->mddev->recovery_cp;
+
+ /* Check whether resync is now happening or should start.
+ * If yes, then the array is dirty (after unclean shutdown or
+ * initial creation), so parity in some stripes might be inconsistent.
+ * In this case, we need to always do reconstruct-write, to ensure
+ * that in case of drive failure or read-error correction, we
+ * generate correct data from the parity.
+ */
+ if (conf->rmw_level == PARITY_DISABLE_RMW ||
+ (recovery_cp < MaxSector && sh->sector >= recovery_cp &&
+ s->failed == 0)) {
+ /* Calculate the real rcw later - for now make it
+ * look like rcw is cheaper
+ */
+ rcw = 1; rmw = 2;
+ pr_debug("force RCW rmw_level=%u, recovery_cp=%llu sh->sector=%llu\n",
+ conf->rmw_level, (unsigned long long)recovery_cp,
+ (unsigned long long)sh->sector);
+ } else for (i = disks; i--; ) {
+ /* would I have to read this buffer for read_modify_write */
+ struct r5dev *dev = &sh->dev[i];
+ if ((dev->towrite || i == sh->pd_idx || i == sh->qd_idx) &&
+ !test_bit(R5_LOCKED, &dev->flags) &&
+ !(test_bit(R5_UPTODATE, &dev->flags) ||
+ test_bit(R5_Wantcompute, &dev->flags))) {
+ if (test_bit(R5_Insync, &dev->flags))
+ rmw++;
+ else
+ rmw += 2*disks; /* cannot read it */
+ }
+ /* Would I have to read this buffer for reconstruct_write */
+ if (!test_bit(R5_OVERWRITE, &dev->flags) &&
+ i != sh->pd_idx && i != sh->qd_idx &&
+ !test_bit(R5_LOCKED, &dev->flags) &&
+ !(test_bit(R5_UPTODATE, &dev->flags) ||
+ test_bit(R5_Wantcompute, &dev->flags))) {
+ if (test_bit(R5_Insync, &dev->flags))
+ rcw++;
+ else
+ rcw += 2*disks;
+ }
+ }
+ pr_debug("for sector %llu, rmw=%d rcw=%d\n",
+ (unsigned long long)sh->sector, rmw, rcw);
+ set_bit(STRIPE_HANDLE, &sh->state);
+ if ((rmw < rcw || (rmw == rcw && conf->rmw_level == PARITY_ENABLE_RMW)) && rmw > 0) {
+ /* prefer read-modify-write, but need to get some data */
+ if (conf->mddev->queue)
+ blk_add_trace_msg(conf->mddev->queue,
+ "raid5 rmw %llu %d",
+ (unsigned long long)sh->sector, rmw);
+ for (i = disks; i--; ) {
+ struct r5dev *dev = &sh->dev[i];
+ if ((dev->towrite || i == sh->pd_idx || i == sh->qd_idx) &&
+ !test_bit(R5_LOCKED, &dev->flags) &&
+ !(test_bit(R5_UPTODATE, &dev->flags) ||
+ test_bit(R5_Wantcompute, &dev->flags)) &&
+ test_bit(R5_Insync, &dev->flags)) {
+ if (test_bit(STRIPE_PREREAD_ACTIVE,
+ &sh->state)) {
+ pr_debug("Read_old block %d for r-m-w\n",
+ i);
+ set_bit(R5_LOCKED, &dev->flags);
+ set_bit(R5_Wantread, &dev->flags);
+ s->locked++;
+ } else {
+ set_bit(STRIPE_DELAYED, &sh->state);
+ set_bit(STRIPE_HANDLE, &sh->state);
+ }
+ }
+ }
+ }
+ if ((rcw < rmw || (rcw == rmw && conf->rmw_level != PARITY_ENABLE_RMW)) && rcw > 0) {
+ /* want reconstruct write, but need to get some data */
+ int qread =0;
+ rcw = 0;
+ for (i = disks; i--; ) {
+ struct r5dev *dev = &sh->dev[i];
+ if (!test_bit(R5_OVERWRITE, &dev->flags) &&
+ i != sh->pd_idx && i != sh->qd_idx &&
+ !test_bit(R5_LOCKED, &dev->flags) &&
+ !(test_bit(R5_UPTODATE, &dev->flags) ||
+ test_bit(R5_Wantcompute, &dev->flags))) {
+ rcw++;
+ if (test_bit(R5_Insync, &dev->flags) &&
+ test_bit(STRIPE_PREREAD_ACTIVE,
+ &sh->state)) {
+ pr_debug("Read_old block "
+ "%d for Reconstruct\n", i);
+ set_bit(R5_LOCKED, &dev->flags);
+ set_bit(R5_Wantread, &dev->flags);
+ s->locked++;
+ qread++;
+ } else {
+ set_bit(STRIPE_DELAYED, &sh->state);
+ set_bit(STRIPE_HANDLE, &sh->state);
+ }
+ }
+ }
+ if (rcw && conf->mddev->queue)
+ blk_add_trace_msg(conf->mddev->queue, "raid5 rcw %llu %d %d %d",
+ (unsigned long long)sh->sector,
+ rcw, qread, test_bit(STRIPE_DELAYED, &sh->state));
+ }
+
+ if (rcw > disks && rmw > disks &&
+ !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
+ set_bit(STRIPE_DELAYED, &sh->state);
+
+ /* now if nothing is locked, and if we have enough data,
+ * we can start a write request
+ */
+ /* since handle_stripe can be called at any time we need to handle the
+ * case where a compute block operation has been submitted and then a
+ * subsequent call wants to start a write request. raid_run_ops only
+ * handles the case where compute block and reconstruct are requested
+ * simultaneously. If this is not the case then new writes need to be
+ * held off until the compute completes.
+ */
+ if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) &&
+ (s->locked == 0 && (rcw == 0 || rmw == 0) &&
+ !test_bit(STRIPE_BIT_DELAY, &sh->state)))
+ schedule_reconstruction(sh, s, rcw == 0, 0);
+}
+
+static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh,
+ struct stripe_head_state *s, int disks)
+{
+ struct r5dev *dev = NULL;
+
+ BUG_ON(sh->batch_head);
+ set_bit(STRIPE_HANDLE, &sh->state);
+
+ switch (sh->check_state) {
+ case check_state_idle:
+ /* start a new check operation if there are no failures */
+ if (s->failed == 0) {
+ BUG_ON(s->uptodate != disks);
+ sh->check_state = check_state_run;
+ set_bit(STRIPE_OP_CHECK, &s->ops_request);
+ clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags);
+ s->uptodate--;
+ break;
+ }
+ dev = &sh->dev[s->failed_num[0]];
+ /* fall through */
+ case check_state_compute_result:
+ sh->check_state = check_state_idle;
+ if (!dev)
+ dev = &sh->dev[sh->pd_idx];
+
+ /* check that a write has not made the stripe insync */
+ if (test_bit(STRIPE_INSYNC, &sh->state))
+ break;
+
+ /* either failed parity check, or recovery is happening */
+ BUG_ON(!test_bit(R5_UPTODATE, &dev->flags));
+ BUG_ON(s->uptodate != disks);
+
+ set_bit(R5_LOCKED, &dev->flags);
+ s->locked++;
+ set_bit(R5_Wantwrite, &dev->flags);
+
+ clear_bit(STRIPE_DEGRADED, &sh->state);
+ set_bit(STRIPE_INSYNC, &sh->state);
+ break;
+ case check_state_run:
+ break; /* we will be called again upon completion */
+ case check_state_check_result:
+ sh->check_state = check_state_idle;
+
+ /* if a failure occurred during the check operation, leave
+ * STRIPE_INSYNC not set and let the stripe be handled again
+ */
+ if (s->failed)
+ break;
+
+ /* handle a successful check operation, if parity is correct
+ * we are done. Otherwise update the mismatch count and repair
+ * parity if !MD_RECOVERY_CHECK
+ */
+ if ((sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) == 0)
+ /* parity is correct (on disc,
+ * not in buffer any more)
+ */
+ set_bit(STRIPE_INSYNC, &sh->state);
+ else {
+ atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches);
+ if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
+ /* don't try to repair!! */
+ set_bit(STRIPE_INSYNC, &sh->state);
+ else {
+ sh->check_state = check_state_compute_run;
+ set_bit(STRIPE_COMPUTE_RUN, &sh->state);
+ set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
+ set_bit(R5_Wantcompute,
+ &sh->dev[sh->pd_idx].flags);
+ sh->ops.target = sh->pd_idx;
+ sh->ops.target2 = -1;
+ s->uptodate++;
+ }
+ }
+ break;
+ case check_state_compute_run:
+ break;
+ default:
+ printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n",
+ __func__, sh->check_state,
+ (unsigned long long) sh->sector);
+ BUG();
+ }
+}
+
+static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh,
+ struct stripe_head_state *s,
+ int disks)
+{
+ int pd_idx = sh->pd_idx;
+ int qd_idx = sh->qd_idx;
+ struct r5dev *dev;
+
+ BUG_ON(sh->batch_head);
+ set_bit(STRIPE_HANDLE, &sh->state);
+
+ BUG_ON(s->failed > 2);
+
+ /* Want to check and possibly repair P and Q.
+ * However there could be one 'failed' device, in which
+ * case we can only check one of them, possibly using the
+ * other to generate missing data
+ */
+
+ switch (sh->check_state) {
+ case check_state_idle:
+ /* start a new check operation if there are < 2 failures */
+ if (s->failed == s->q_failed) {
+ /* The only possible failed device holds Q, so it
+ * makes sense to check P (If anything else were failed,
+ * we would have used P to recreate it).
+ */
+ sh->check_state = check_state_run;
+ }
+ if (!s->q_failed && s->failed < 2) {
+ /* Q is not failed, and we didn't use it to generate
+ * anything, so it makes sense to check it
+ */
+ if (sh->check_state == check_state_run)
+ sh->check_state = check_state_run_pq;
+ else
+ sh->check_state = check_state_run_q;
+ }
+
+ /* discard potentially stale zero_sum_result */
+ sh->ops.zero_sum_result = 0;
+
+ if (sh->check_state == check_state_run) {
+ /* async_xor_zero_sum destroys the contents of P */
+ clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
+ s->uptodate--;
+ }
+ if (sh->check_state >= check_state_run &&
+ sh->check_state <= check_state_run_pq) {
+ /* async_syndrome_zero_sum preserves P and Q, so
+ * no need to mark them !uptodate here
+ */
+ set_bit(STRIPE_OP_CHECK, &s->ops_request);
+ break;
+ }
+
+ /* we have 2-disk failure */
+ BUG_ON(s->failed != 2);
+ /* fall through */
+ case check_state_compute_result:
+ sh->check_state = check_state_idle;
+
+ /* check that a write has not made the stripe insync */
+ if (test_bit(STRIPE_INSYNC, &sh->state))
+ break;
+
+ /* now write out any block on a failed drive,
+ * or P or Q if they were recomputed
+ */
+ BUG_ON(s->uptodate < disks - 1); /* We don't need Q to recover */
+ if (s->failed == 2) {
+ dev = &sh->dev[s->failed_num[1]];
+ s->locked++;
+ set_bit(R5_LOCKED, &dev->flags);
+ set_bit(R5_Wantwrite, &dev->flags);
+ }
+ if (s->failed >= 1) {
+ dev = &sh->dev[s->failed_num[0]];
+ s->locked++;
+ set_bit(R5_LOCKED, &dev->flags);
+ set_bit(R5_Wantwrite, &dev->flags);
+ }
+ if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) {
+ dev = &sh->dev[pd_idx];
+ s->locked++;
+ set_bit(R5_LOCKED, &dev->flags);
+ set_bit(R5_Wantwrite, &dev->flags);
+ }
+ if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) {
+ dev = &sh->dev[qd_idx];
+ s->locked++;
+ set_bit(R5_LOCKED, &dev->flags);
+ set_bit(R5_Wantwrite, &dev->flags);
+ }
+ clear_bit(STRIPE_DEGRADED, &sh->state);
+
+ set_bit(STRIPE_INSYNC, &sh->state);
+ break;
+ case check_state_run:
+ case check_state_run_q:
+ case check_state_run_pq:
+ break; /* we will be called again upon completion */
+ case check_state_check_result:
+ sh->check_state = check_state_idle;
+
+ /* handle a successful check operation, if parity is correct
+ * we are done. Otherwise update the mismatch count and repair
+ * parity if !MD_RECOVERY_CHECK
+ */
+ if (sh->ops.zero_sum_result == 0) {
+ /* both parities are correct */
+ if (!s->failed)
+ set_bit(STRIPE_INSYNC, &sh->state);
+ else {
+ /* in contrast to the raid5 case we can validate
+ * parity, but still have a failure to write
+ * back
+ */
+ sh->check_state = check_state_compute_result;
+ /* Returning at this point means that we may go
+ * off and bring p and/or q uptodate again so
+ * we make sure to check zero_sum_result again
+ * to verify if p or q need writeback
+ */
+ }
+ } else {
+ atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches);
+ if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
+ /* don't try to repair!! */
+ set_bit(STRIPE_INSYNC, &sh->state);
+ else {
+ int *target = &sh->ops.target;
+
+ sh->ops.target = -1;
+ sh->ops.target2 = -1;
+ sh->check_state = check_state_compute_run;
+ set_bit(STRIPE_COMPUTE_RUN, &sh->state);
+ set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
+ if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) {
+ set_bit(R5_Wantcompute,
+ &sh->dev[pd_idx].flags);
+ *target = pd_idx;
+ target = &sh->ops.target2;
+ s->uptodate++;
+ }
+ if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) {
+ set_bit(R5_Wantcompute,
+ &sh->dev[qd_idx].flags);
+ *target = qd_idx;
+ s->uptodate++;
+ }
+ }
+ }
+ break;
+ case check_state_compute_run:
+ break;
+ default:
+ printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n",
+ __func__, sh->check_state,
+ (unsigned long long) sh->sector);
+ BUG();
+ }
+}
+
+static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh)
+{
+ int i;
+
+ /* We have read all the blocks in this stripe and now we need to
+ * copy some of them into a target stripe for expand.
+ */
+ struct dma_async_tx_descriptor *tx = NULL;
+ BUG_ON(sh->batch_head);
+ clear_bit(STRIPE_EXPAND_SOURCE, &sh->state);
+ for (i = 0; i < sh->disks; i++)
+ if (i != sh->pd_idx && i != sh->qd_idx) {
+ int dd_idx, j;
+ struct stripe_head *sh2;
+ struct async_submit_ctl submit;
+
+ sector_t bn = compute_blocknr(sh, i, 1);
+ sector_t s = raid5_compute_sector(conf, bn, 0,
+ &dd_idx, NULL);
+ sh2 = get_active_stripe(conf, s, 0, 1, 1);
+ if (sh2 == NULL)
+ /* so far only the early blocks of this stripe
+ * have been requested. When later blocks
+ * get requested, we will try again
+ */
+ continue;
+ if (!test_bit(STRIPE_EXPANDING, &sh2->state) ||
+ test_bit(R5_Expanded, &sh2->dev[dd_idx].flags)) {
+ /* must have already done this block */
+ release_stripe(sh2);
+ continue;
+ }
+
+ /* place all the copies on one channel */
+ init_async_submit(&submit, 0, tx, NULL, NULL, NULL);
+ tx = async_memcpy(sh2->dev[dd_idx].page,
+ sh->dev[i].page, 0, 0, STRIPE_SIZE,
+ &submit);
+
+ set_bit(R5_Expanded, &sh2->dev[dd_idx].flags);
+ set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags);
+ for (j = 0; j < conf->raid_disks; j++)
+ if (j != sh2->pd_idx &&
+ j != sh2->qd_idx &&
+ !test_bit(R5_Expanded, &sh2->dev[j].flags))
+ break;
+ if (j == conf->raid_disks) {
+ set_bit(STRIPE_EXPAND_READY, &sh2->state);
+ set_bit(STRIPE_HANDLE, &sh2->state);
+ }
+ release_stripe(sh2);
+
+ }
+ /* done submitting copies, wait for them to complete */
+ async_tx_quiesce(&tx);
+}
+
+/*
+ * handle_stripe - do things to a stripe.
+ *
+ * We lock the stripe by setting STRIPE_ACTIVE and then examine the
+ * state of various bits to see what needs to be done.
+ * Possible results:
+ * return some read requests which now have data
+ * return some write requests which are safely on storage
+ * schedule a read on some buffers
+ * schedule a write of some buffers
+ * return confirmation of parity correctness
+ *
+ */
+
+static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
+{
+ struct r5conf *conf = sh->raid_conf;
+ int disks = sh->disks;
+ struct r5dev *dev;
+ int i;
+ int do_recovery = 0;
+
+ memset(s, 0, sizeof(*s));
+
+ s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state) && !sh->batch_head;
+ s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state) && !sh->batch_head;
+ s->failed_num[0] = -1;
+ s->failed_num[1] = -1;
+
+ /* Now to look around and see what can be done */
+ rcu_read_lock();
+ for (i=disks; i--; ) {
+ struct md_rdev *rdev;
+ sector_t first_bad;
+ int bad_sectors;
+ int is_bad = 0;
+
+ dev = &sh->dev[i];
+
+ pr_debug("check %d: state 0x%lx read %p write %p written %p\n",
+ i, dev->flags,
+ dev->toread, dev->towrite, dev->written);
+ /* maybe we can reply to a read
+ *
+ * new wantfill requests are only permitted while
+ * ops_complete_biofill is guaranteed to be inactive
+ */
+ if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread &&
+ !test_bit(STRIPE_BIOFILL_RUN, &sh->state))
+ set_bit(R5_Wantfill, &dev->flags);
+
+ /* now count some things */
+ if (test_bit(R5_LOCKED, &dev->flags))
+ s->locked++;
+ if (test_bit(R5_UPTODATE, &dev->flags))
+ s->uptodate++;
+ if (test_bit(R5_Wantcompute, &dev->flags)) {
+ s->compute++;
+ BUG_ON(s->compute > 2);
+ }
+
+ if (test_bit(R5_Wantfill, &dev->flags))
+ s->to_fill++;
+ else if (dev->toread)
+ s->to_read++;
+ if (dev->towrite) {
+ s->to_write++;
+ if (!test_bit(R5_OVERWRITE, &dev->flags))
+ s->non_overwrite++;
+ }
+ if (dev->written)
+ s->written++;
+ /* Prefer to use the replacement for reads, but only
+ * if it is recovered enough and has no bad blocks.
+ */
+ rdev = rcu_dereference(conf->disks[i].replacement);
+ if (rdev && !test_bit(Faulty, &rdev->flags) &&
+ rdev->recovery_offset >= sh->sector + STRIPE_SECTORS &&
+ !is_badblock(rdev, sh->sector, STRIPE_SECTORS,
+ &first_bad, &bad_sectors))
+ set_bit(R5_ReadRepl, &dev->flags);
+ else {
+ if (rdev)
+ set_bit(R5_NeedReplace, &dev->flags);
+ rdev = rcu_dereference(conf->disks[i].rdev);
+ clear_bit(R5_ReadRepl, &dev->flags);
+ }
+ if (rdev && test_bit(Faulty, &rdev->flags))
+ rdev = NULL;
+ if (rdev) {
+ is_bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS,
+ &first_bad, &bad_sectors);
+ if (s->blocked_rdev == NULL
+ && (test_bit(Blocked, &rdev->flags)
+ || is_bad < 0)) {
+ if (is_bad < 0)
+ set_bit(BlockedBadBlocks,
+ &rdev->flags);
+ s->blocked_rdev = rdev;
+ atomic_inc(&rdev->nr_pending);
+ }
+ }
+ clear_bit(R5_Insync, &dev->flags);
+ if (!rdev)
+ /* Not in-sync */;
+ else if (is_bad) {
+ /* also not in-sync */
+ if (!test_bit(WriteErrorSeen, &rdev->flags) &&
+ test_bit(R5_UPTODATE, &dev->flags)) {
+ /* treat as in-sync, but with a read error
+ * which we can now try to correct
+ */
+ set_bit(R5_Insync, &dev->flags);
+ set_bit(R5_ReadError, &dev->flags);
+ }
+ } else if (test_bit(In_sync, &rdev->flags))
+ set_bit(R5_Insync, &dev->flags);
+ else if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset)
+ /* in sync if before recovery_offset */
+ set_bit(R5_Insync, &dev->flags);
+ else if (test_bit(R5_UPTODATE, &dev->flags) &&
+ test_bit(R5_Expanded, &dev->flags))
+ /* If we've reshaped into here, we assume it is Insync.
+ * We will shortly update recovery_offset to make
+ * it official.
+ */
+ set_bit(R5_Insync, &dev->flags);
+
+ if (test_bit(R5_WriteError, &dev->flags)) {
+ /* This flag does not apply to '.replacement'
+ * only to .rdev, so make sure to check that*/
+ struct md_rdev *rdev2 = rcu_dereference(
+ conf->disks[i].rdev);
+ if (rdev2 == rdev)
+ clear_bit(R5_Insync, &dev->flags);
+ if (rdev2 && !test_bit(Faulty, &rdev2->flags)) {
+ s->handle_bad_blocks = 1;
+ atomic_inc(&rdev2->nr_pending);
+ } else
+ clear_bit(R5_WriteError, &dev->flags);
+ }
+ if (test_bit(R5_MadeGood, &dev->flags)) {
+ /* This flag does not apply to '.replacement'
+ * only to .rdev, so make sure to check that*/
+ struct md_rdev *rdev2 = rcu_dereference(
+ conf->disks[i].rdev);
+ if (rdev2 && !test_bit(Faulty, &rdev2->flags)) {
+ s->handle_bad_blocks = 1;
+ atomic_inc(&rdev2->nr_pending);
+ } else
+ clear_bit(R5_MadeGood, &dev->flags);
+ }
+ if (test_bit(R5_MadeGoodRepl, &dev->flags)) {
+ struct md_rdev *rdev2 = rcu_dereference(
+ conf->disks[i].replacement);
+ if (rdev2 && !test_bit(Faulty, &rdev2->flags)) {
+ s->handle_bad_blocks = 1;
+ atomic_inc(&rdev2->nr_pending);
+ } else
+ clear_bit(R5_MadeGoodRepl, &dev->flags);
+ }
+ if (!test_bit(R5_Insync, &dev->flags)) {
+ /* The ReadError flag will just be confusing now */
+ clear_bit(R5_ReadError, &dev->flags);
+ clear_bit(R5_ReWrite, &dev->flags);
+ }
+ if (test_bit(R5_ReadError, &dev->flags))
+ clear_bit(R5_Insync, &dev->flags);
+ if (!test_bit(R5_Insync, &dev->flags)) {
+ if (s->failed < 2)
+ s->failed_num[s->failed] = i;
+ s->failed++;
+ if (rdev && !test_bit(Faulty, &rdev->flags))
+ do_recovery = 1;
+ }
+ }
+ if (test_bit(STRIPE_SYNCING, &sh->state)) {
+ /* If there is a failed device being replaced,
+ * we must be recovering.
+ * else if we are after recovery_cp, we must be syncing
+ * else if MD_RECOVERY_REQUESTED is set, we also are syncing.
+ * else we can only be replacing
+ * sync and recovery both need to read all devices, and so
+ * use the same flag.
+ */
+ if (do_recovery ||
+ sh->sector >= conf->mddev->recovery_cp ||
+ test_bit(MD_RECOVERY_REQUESTED, &(conf->mddev->recovery)))
+ s->syncing = 1;
+ else
+ s->replacing = 1;
+ }
+ rcu_read_unlock();
+}
+
+static int clear_batch_ready(struct stripe_head *sh)
+{
+ /* Return '1' if this is a member of batch, or
+ * '0' if it is a lone stripe or a head which can now be
+ * handled.
+ */
+ struct stripe_head *tmp;
+ if (!test_and_clear_bit(STRIPE_BATCH_READY, &sh->state))
+ return (sh->batch_head && sh->batch_head != sh);
+ spin_lock(&sh->stripe_lock);
+ if (!sh->batch_head) {
+ spin_unlock(&sh->stripe_lock);
+ return 0;
+ }
+
+ /*
+ * this stripe could be added to a batch list before we check
+ * BATCH_READY, skips it
+ */
+ if (sh->batch_head != sh) {
+ spin_unlock(&sh->stripe_lock);
+ return 1;
+ }
+ spin_lock(&sh->batch_lock);
+ list_for_each_entry(tmp, &sh->batch_list, batch_list)
+ clear_bit(STRIPE_BATCH_READY, &tmp->state);
+ spin_unlock(&sh->batch_lock);
+ spin_unlock(&sh->stripe_lock);
+
+ /*
+ * BATCH_READY is cleared, no new stripes can be added.
+ * batch_list can be accessed without lock
+ */
+ return 0;
+}
+
+static void break_stripe_batch_list(struct stripe_head *head_sh,
+ unsigned long handle_flags)
+{
+ struct stripe_head *sh, *next;
+ int i;
+ int do_wakeup = 0;
+
+ list_for_each_entry_safe(sh, next, &head_sh->batch_list, batch_list) {
+
+ list_del_init(&sh->batch_list);
+
+ WARN_ON_ONCE(sh->state & ((1 << STRIPE_ACTIVE) |
+ (1 << STRIPE_SYNCING) |
+ (1 << STRIPE_REPLACED) |
+ (1 << STRIPE_PREREAD_ACTIVE) |
+ (1 << STRIPE_DELAYED) |
+ (1 << STRIPE_BIT_DELAY) |
+ (1 << STRIPE_FULL_WRITE) |
+ (1 << STRIPE_BIOFILL_RUN) |
+ (1 << STRIPE_COMPUTE_RUN) |
+ (1 << STRIPE_OPS_REQ_PENDING) |
+ (1 << STRIPE_DISCARD) |
+ (1 << STRIPE_BATCH_READY) |
+ (1 << STRIPE_BATCH_ERR) |
+ (1 << STRIPE_BITMAP_PENDING)));
+ WARN_ON_ONCE(head_sh->state & ((1 << STRIPE_DISCARD) |
+ (1 << STRIPE_REPLACED)));
+
+ set_mask_bits(&sh->state, ~(STRIPE_EXPAND_SYNC_FLAGS |
+ (1 << STRIPE_DEGRADED)),
+ head_sh->state & (1 << STRIPE_INSYNC));
+
+ sh->check_state = head_sh->check_state;
+ sh->reconstruct_state = head_sh->reconstruct_state;
+ for (i = 0; i < sh->disks; i++) {
+ if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
+ do_wakeup = 1;
+ sh->dev[i].flags = head_sh->dev[i].flags &
+ (~((1 << R5_WriteError) | (1 << R5_Overlap)));
+ }
+ spin_lock_irq(&sh->stripe_lock);
+ sh->batch_head = NULL;
+ spin_unlock_irq(&sh->stripe_lock);
+ if (handle_flags == 0 ||
+ sh->state & handle_flags)
+ set_bit(STRIPE_HANDLE, &sh->state);
+ release_stripe(sh);
+ }
+ spin_lock_irq(&head_sh->stripe_lock);
+ head_sh->batch_head = NULL;
+ spin_unlock_irq(&head_sh->stripe_lock);
+ for (i = 0; i < head_sh->disks; i++)
+ if (test_and_clear_bit(R5_Overlap, &head_sh->dev[i].flags))
+ do_wakeup = 1;
+ if (head_sh->state & handle_flags)
+ set_bit(STRIPE_HANDLE, &head_sh->state);
+
+ if (do_wakeup)
+ wake_up(&head_sh->raid_conf->wait_for_overlap);
+}
+
+static void handle_stripe(struct stripe_head *sh)
+{
+ struct stripe_head_state s;
+ struct r5conf *conf = sh->raid_conf;
+ int i;
+ int prexor;
+ int disks = sh->disks;
+ struct r5dev *pdev, *qdev;
+
+ clear_bit(STRIPE_HANDLE, &sh->state);
+ if (test_and_set_bit_lock(STRIPE_ACTIVE, &sh->state)) {
+ /* already being handled, ensure it gets handled
+ * again when current action finishes */
+ set_bit(STRIPE_HANDLE, &sh->state);
+ return;
+ }
+
+ if (clear_batch_ready(sh) ) {
+ clear_bit_unlock(STRIPE_ACTIVE, &sh->state);
+ return;
+ }
+
+ if (test_and_clear_bit(STRIPE_BATCH_ERR, &sh->state))
+ break_stripe_batch_list(sh, 0);
+
+ if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) && !sh->batch_head) {
+ spin_lock(&sh->stripe_lock);
+ /* Cannot process 'sync' concurrently with 'discard' */
+ if (!test_bit(STRIPE_DISCARD, &sh->state) &&
+ test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) {
+ set_bit(STRIPE_SYNCING, &sh->state);
+ clear_bit(STRIPE_INSYNC, &sh->state);
+ clear_bit(STRIPE_REPLACED, &sh->state);
+ }
+ spin_unlock(&sh->stripe_lock);
+ }
+ clear_bit(STRIPE_DELAYED, &sh->state);
+
+ pr_debug("handling stripe %llu, state=%#lx cnt=%d, "
+ "pd_idx=%d, qd_idx=%d\n, check:%d, reconstruct:%d\n",
+ (unsigned long long)sh->sector, sh->state,
+ atomic_read(&sh->count), sh->pd_idx, sh->qd_idx,
+ sh->check_state, sh->reconstruct_state);
+
+ analyse_stripe(sh, &s);
+
+ if (s.handle_bad_blocks) {
+ set_bit(STRIPE_HANDLE, &sh->state);
+ goto finish;
+ }
+
+ if (unlikely(s.blocked_rdev)) {
+ if (s.syncing || s.expanding || s.expanded ||
+ s.replacing || s.to_write || s.written) {
+ set_bit(STRIPE_HANDLE, &sh->state);
+ goto finish;
+ }
+ /* There is nothing for the blocked_rdev to block */
+ rdev_dec_pending(s.blocked_rdev, conf->mddev);
+ s.blocked_rdev = NULL;
+ }
+
+ if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) {
+ set_bit(STRIPE_OP_BIOFILL, &s.ops_request);
+ set_bit(STRIPE_BIOFILL_RUN, &sh->state);
+ }
+
+ pr_debug("locked=%d uptodate=%d to_read=%d"
+ " to_write=%d failed=%d failed_num=%d,%d\n",
+ s.locked, s.uptodate, s.to_read, s.to_write, s.failed,
+ s.failed_num[0], s.failed_num[1]);
+ /* check if the array has lost more than max_degraded devices and,
+ * if so, some requests might need to be failed.
+ */
+ if (s.failed > conf->max_degraded) {
+ sh->check_state = 0;
+ sh->reconstruct_state = 0;
+ break_stripe_batch_list(sh, 0);
+ if (s.to_read+s.to_write+s.written)
+ handle_failed_stripe(conf, sh, &s, disks, &s.return_bi);
+ if (s.syncing + s.replacing)
+ handle_failed_sync(conf, sh, &s);
+ }
+
+ /* Now we check to see if any write operations have recently
+ * completed
+ */
+ prexor = 0;
+ if (sh->reconstruct_state == reconstruct_state_prexor_drain_result)
+ prexor = 1;
+ if (sh->reconstruct_state == reconstruct_state_drain_result ||
+ sh->reconstruct_state == reconstruct_state_prexor_drain_result) {
+ sh->reconstruct_state = reconstruct_state_idle;
+
+ /* All the 'written' buffers and the parity block are ready to
+ * be written back to disk
+ */
+ BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags) &&
+ !test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags));
+ BUG_ON(sh->qd_idx >= 0 &&
+ !test_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags) &&
+ !test_bit(R5_Discard, &sh->dev[sh->qd_idx].flags));
+ for (i = disks; i--; ) {
+ struct r5dev *dev = &sh->dev[i];
+ if (test_bit(R5_LOCKED, &dev->flags) &&
+ (i == sh->pd_idx || i == sh->qd_idx ||
+ dev->written)) {
+ pr_debug("Writing block %d\n", i);
+ set_bit(R5_Wantwrite, &dev->flags);
+ if (prexor)
+ continue;
+ if (s.failed > 1)
+ continue;
+ if (!test_bit(R5_Insync, &dev->flags) ||
+ ((i == sh->pd_idx || i == sh->qd_idx) &&
+ s.failed == 0))
+ set_bit(STRIPE_INSYNC, &sh->state);
+ }
+ }
+ if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
+ s.dec_preread_active = 1;
+ }
+
+ /*
+ * might be able to return some write requests if the parity blocks
+ * are safe, or on a failed drive
+ */
+ pdev = &sh->dev[sh->pd_idx];
+ s.p_failed = (s.failed >= 1 && s.failed_num[0] == sh->pd_idx)
+ || (s.failed >= 2 && s.failed_num[1] == sh->pd_idx);
+ qdev = &sh->dev[sh->qd_idx];
+ s.q_failed = (s.failed >= 1 && s.failed_num[0] == sh->qd_idx)
+ || (s.failed >= 2 && s.failed_num[1] == sh->qd_idx)
+ || conf->level < 6;
+
+ if (s.written &&
+ (s.p_failed || ((test_bit(R5_Insync, &pdev->flags)
+ && !test_bit(R5_LOCKED, &pdev->flags)
+ && (test_bit(R5_UPTODATE, &pdev->flags) ||
+ test_bit(R5_Discard, &pdev->flags))))) &&
+ (s.q_failed || ((test_bit(R5_Insync, &qdev->flags)
+ && !test_bit(R5_LOCKED, &qdev->flags)
+ && (test_bit(R5_UPTODATE, &qdev->flags) ||
+ test_bit(R5_Discard, &qdev->flags))))))
+ handle_stripe_clean_event(conf, sh, disks, &s.return_bi);
+
+ /* Now we might consider reading some blocks, either to check/generate
+ * parity, or to satisfy requests
+ * or to load a block that is being partially written.
+ */
+ if (s.to_read || s.non_overwrite
+ || (conf->level == 6 && s.to_write && s.failed)
+ || (s.syncing && (s.uptodate + s.compute < disks))
+ || s.replacing
+ || s.expanding)
+ handle_stripe_fill(sh, &s, disks);
+
+ /* Now to consider new write requests and what else, if anything
+ * should be read. We do not handle new writes when:
+ * 1/ A 'write' operation (copy+xor) is already in flight.
+ * 2/ A 'check' operation is in flight, as it may clobber the parity
+ * block.
+ */
+ if (s.to_write && !sh->reconstruct_state && !sh->check_state)
+ handle_stripe_dirtying(conf, sh, &s, disks);
+
+ /* maybe we need to check and possibly fix the parity for this stripe
+ * Any reads will already have been scheduled, so we just see if enough
+ * data is available. The parity check is held off while parity
+ * dependent operations are in flight.
+ */
+ if (sh->check_state ||
+ (s.syncing && s.locked == 0 &&
+ !test_bit(STRIPE_COMPUTE_RUN, &sh->state) &&
+ !test_bit(STRIPE_INSYNC, &sh->state))) {
+ if (conf->level == 6)
+ handle_parity_checks6(conf, sh, &s, disks);
+ else
+ handle_parity_checks5(conf, sh, &s, disks);
+ }
+
+ if ((s.replacing || s.syncing) && s.locked == 0
+ && !test_bit(STRIPE_COMPUTE_RUN, &sh->state)
+ && !test_bit(STRIPE_REPLACED, &sh->state)) {
+ /* Write out to replacement devices where possible */
+ for (i = 0; i < conf->raid_disks; i++)
+ if (test_bit(R5_NeedReplace, &sh->dev[i].flags)) {
+ WARN_ON(!test_bit(R5_UPTODATE, &sh->dev[i].flags));
+ set_bit(R5_WantReplace, &sh->dev[i].flags);
+ set_bit(R5_LOCKED, &sh->dev[i].flags);
+ s.locked++;
+ }
+ if (s.replacing)
+ set_bit(STRIPE_INSYNC, &sh->state);
+ set_bit(STRIPE_REPLACED, &sh->state);
+ }
+ if ((s.syncing || s.replacing) && s.locked == 0 &&
+ !test_bit(STRIPE_COMPUTE_RUN, &sh->state) &&
+ test_bit(STRIPE_INSYNC, &sh->state)) {
+ md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
+ clear_bit(STRIPE_SYNCING, &sh->state);
+ if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags))
+ wake_up(&conf->wait_for_overlap);
+ }
+
+ /* If the failed drives are just a ReadError, then we might need
+ * to progress the repair/check process
+ */
+ if (s.failed <= conf->max_degraded && !conf->mddev->ro)
+ for (i = 0; i < s.failed; i++) {
+ struct r5dev *dev = &sh->dev[s.failed_num[i]];
+ if (test_bit(R5_ReadError, &dev->flags)
+ && !test_bit(R5_LOCKED, &dev->flags)
+ && test_bit(R5_UPTODATE, &dev->flags)
+ ) {
+ if (!test_bit(R5_ReWrite, &dev->flags)) {
+ set_bit(R5_Wantwrite, &dev->flags);
+ set_bit(R5_ReWrite, &dev->flags);
+ set_bit(R5_LOCKED, &dev->flags);
+ s.locked++;
+ } else {
+ /* let's read it back */
+ set_bit(R5_Wantread, &dev->flags);
+ set_bit(R5_LOCKED, &dev->flags);
+ s.locked++;
+ }
+ }
+ }
+
+ /* Finish reconstruct operations initiated by the expansion process */
+ if (sh->reconstruct_state == reconstruct_state_result) {
+ struct stripe_head *sh_src
+ = get_active_stripe(conf, sh->sector, 1, 1, 1);
+ if (sh_src && test_bit(STRIPE_EXPAND_SOURCE, &sh_src->state)) {
+ /* sh cannot be written until sh_src has been read.
+ * so arrange for sh to be delayed a little
+ */
+ set_bit(STRIPE_DELAYED, &sh->state);
+ set_bit(STRIPE_HANDLE, &sh->state);
+ if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE,
+ &sh_src->state))
+ atomic_inc(&conf->preread_active_stripes);
+ release_stripe(sh_src);
+ goto finish;
+ }
+ if (sh_src)
+ release_stripe(sh_src);
+
+ sh->reconstruct_state = reconstruct_state_idle;
+ clear_bit(STRIPE_EXPANDING, &sh->state);
+ for (i = conf->raid_disks; i--; ) {
+ set_bit(R5_Wantwrite, &sh->dev[i].flags);
+ set_bit(R5_LOCKED, &sh->dev[i].flags);
+ s.locked++;
+ }
+ }
+
+ if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) &&
+ !sh->reconstruct_state) {
+ /* Need to write out all blocks after computing parity */
+ sh->disks = conf->raid_disks;
+ stripe_set_idx(sh->sector, conf, 0, sh);
+ schedule_reconstruction(sh, &s, 1, 1);
+ } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) {
+ clear_bit(STRIPE_EXPAND_READY, &sh->state);
+ atomic_dec(&conf->reshape_stripes);
+ wake_up(&conf->wait_for_overlap);
+ md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
+ }
+
+ if (s.expanding && s.locked == 0 &&
+ !test_bit(STRIPE_COMPUTE_RUN, &sh->state))
+ handle_stripe_expansion(conf, sh);
+
+finish:
+ /* wait for this device to become unblocked */
+ if (unlikely(s.blocked_rdev)) {
+ if (conf->mddev->external)
+ md_wait_for_blocked_rdev(s.blocked_rdev,
+ conf->mddev);
+ else
+ /* Internal metadata will immediately
+ * be written by raid5d, so we don't
+ * need to wait here.
+ */
+ rdev_dec_pending(s.blocked_rdev,
+ conf->mddev);
+ }
+
+ if (s.handle_bad_blocks)
+ for (i = disks; i--; ) {
+ struct md_rdev *rdev;
+ struct r5dev *dev = &sh->dev[i];
+ if (test_and_clear_bit(R5_WriteError, &dev->flags)) {
+ /* We own a safe reference to the rdev */
+ rdev = conf->disks[i].rdev;
+ if (!rdev_set_badblocks(rdev, sh->sector,
+ STRIPE_SECTORS, 0))
+ md_error(conf->mddev, rdev);
+ rdev_dec_pending(rdev, conf->mddev);
+ }
+ if (test_and_clear_bit(R5_MadeGood, &dev->flags)) {
+ rdev = conf->disks[i].rdev;
+ rdev_clear_badblocks(rdev, sh->sector,
+ STRIPE_SECTORS, 0);
+ rdev_dec_pending(rdev, conf->mddev);
+ }
+ if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) {
+ rdev = conf->disks[i].replacement;
+ if (!rdev)
+ /* rdev have been moved down */
+ rdev = conf->disks[i].rdev;
+ rdev_clear_badblocks(rdev, sh->sector,
+ STRIPE_SECTORS, 0);
+ rdev_dec_pending(rdev, conf->mddev);
+ }
+ }
+
+ if (s.ops_request)
+ raid_run_ops(sh, s.ops_request);
+
+ ops_run_io(sh, &s);
+
+ if (s.dec_preread_active) {
+ /* We delay this until after ops_run_io so that if make_request
+ * is waiting on a flush, it won't continue until the writes
+ * have actually been submitted.
+ */
+ atomic_dec(&conf->preread_active_stripes);
+ if (atomic_read(&conf->preread_active_stripes) <
+ IO_THRESHOLD)
+ md_wakeup_thread(conf->mddev->thread);
+ }
+
+ return_io(s.return_bi);
+
+ clear_bit_unlock(STRIPE_ACTIVE, &sh->state);
+}
+
+static void raid5_activate_delayed(struct r5conf *conf)
+{
+ if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) {
+ while (!list_empty(&conf->delayed_list)) {
+ struct list_head *l = conf->delayed_list.next;
+ struct stripe_head *sh;
+ sh = list_entry(l, struct stripe_head, lru);
+ list_del_init(l);
+ clear_bit(STRIPE_DELAYED, &sh->state);
+ if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
+ atomic_inc(&conf->preread_active_stripes);
+ list_add_tail(&sh->lru, &conf->hold_list);
+ raid5_wakeup_stripe_thread(sh);
+ }
+ }
+}
+
+static void activate_bit_delay(struct r5conf *conf,
+ struct list_head *temp_inactive_list)
+{
+ /* device_lock is held */
+ struct list_head head;
+ list_add(&head, &conf->bitmap_list);
+ list_del_init(&conf->bitmap_list);
+ while (!list_empty(&head)) {
+ struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru);
+ int hash;
+ list_del_init(&sh->lru);
+ atomic_inc(&sh->count);
+ hash = sh->hash_lock_index;
+ __release_stripe(conf, sh, &temp_inactive_list[hash]);
+ }
+}
+
+static int raid5_congested(struct mddev *mddev, int bits)
+{
+ struct r5conf *conf = mddev->private;
+
+ /* No difference between reads and writes. Just check
+ * how busy the stripe_cache is
+ */
+
+ if (test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state))
+ return 1;
+ if (conf->quiesce)
+ return 1;
+ if (atomic_read(&conf->empty_inactive_list_nr))
+ return 1;
+
+ return 0;
+}
+
+/* We want read requests to align with chunks where possible,
+ * but write requests don't need to.
+ */
+static int raid5_mergeable_bvec(struct mddev *mddev,
+ struct bvec_merge_data *bvm,
+ struct bio_vec *biovec)
+{
+ sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
+ int max;
+ unsigned int chunk_sectors = mddev->chunk_sectors;
+ unsigned int bio_sectors = bvm->bi_size >> 9;
+
+ /*
+ * always allow writes to be mergeable, read as well if array
+ * is degraded as we'll go through stripe cache anyway.
+ */
+ if ((bvm->bi_rw & 1) == WRITE || mddev->degraded)
+ return biovec->bv_len;
+
+ if (mddev->new_chunk_sectors < mddev->chunk_sectors)
+ chunk_sectors = mddev->new_chunk_sectors;
+ max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9;
+ if (max < 0) max = 0;
+ if (max <= biovec->bv_len && bio_sectors == 0)
+ return biovec->bv_len;
+ else
+ return max;
+}
+
+static int in_chunk_boundary(struct mddev *mddev, struct bio *bio)
+{
+ sector_t sector = bio->bi_iter.bi_sector + get_start_sect(bio->bi_bdev);
+ unsigned int chunk_sectors = mddev->chunk_sectors;
+ unsigned int bio_sectors = bio_sectors(bio);
+
+ if (mddev->new_chunk_sectors < mddev->chunk_sectors)
+ chunk_sectors = mddev->new_chunk_sectors;
+ return chunk_sectors >=
+ ((sector & (chunk_sectors - 1)) + bio_sectors);
+}
+
+/*
+ * add bio to the retry LIFO ( in O(1) ... we are in interrupt )
+ * later sampled by raid5d.
+ */
+static void add_bio_to_retry(struct bio *bi,struct r5conf *conf)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&conf->device_lock, flags);
+
+ bi->bi_next = conf->retry_read_aligned_list;
+ conf->retry_read_aligned_list = bi;
+
+ spin_unlock_irqrestore(&conf->device_lock, flags);
+ md_wakeup_thread(conf->mddev->thread);
+}
+
+static struct bio *remove_bio_from_retry(struct r5conf *conf)
+{
+ struct bio *bi;
+
+ bi = conf->retry_read_aligned;
+ if (bi) {
+ conf->retry_read_aligned = NULL;
+ return bi;
+ }
+ bi = conf->retry_read_aligned_list;
+ if(bi) {
+ conf->retry_read_aligned_list = bi->bi_next;
+ bi->bi_next = NULL;
+ /*
+ * this sets the active strip count to 1 and the processed
+ * strip count to zero (upper 8 bits)
+ */
+ raid5_set_bi_stripes(bi, 1); /* biased count of active stripes */
+ }
+
+ return bi;
+}
+
+/*
+ * The "raid5_align_endio" should check if the read succeeded and if it
+ * did, call bio_endio on the original bio (having bio_put the new bio
+ * first).
+ * If the read failed..
+ */
+static void raid5_align_endio(struct bio *bi, int error)
+{
+ struct bio* raid_bi = bi->bi_private;
+ struct mddev *mddev;
+ struct r5conf *conf;
+ int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
+ struct md_rdev *rdev;
+
+ bio_put(bi);
+
+ rdev = (void*)raid_bi->bi_next;
+ raid_bi->bi_next = NULL;
+ mddev = rdev->mddev;
+ conf = mddev->private;
+
+ rdev_dec_pending(rdev, conf->mddev);
+
+ if (!error && uptodate) {
+ trace_block_bio_complete(bdev_get_queue(raid_bi->bi_bdev),
+ raid_bi, 0);
+ bio_endio(raid_bi, 0);
+ if (atomic_dec_and_test(&conf->active_aligned_reads))
+ wake_up(&conf->wait_for_stripe);
+ return;
+ }
+
+ pr_debug("raid5_align_endio : io error...handing IO for a retry\n");
+
+ add_bio_to_retry(raid_bi, conf);
+}
+
+static int bio_fits_rdev(struct bio *bi)
+{
+ struct request_queue *q = bdev_get_queue(bi->bi_bdev);
+
+ if (bio_sectors(bi) > queue_max_sectors(q))
+ return 0;
+ blk_recount_segments(q, bi);
+ if (bi->bi_phys_segments > queue_max_segments(q))
+ return 0;
+
+ if (q->merge_bvec_fn)
+ /* it's too hard to apply the merge_bvec_fn at this stage,
+ * just just give up
+ */
+ return 0;
+
+ return 1;
+}
+
+static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio)
+{
+ struct r5conf *conf = mddev->private;
+ int dd_idx;
+ struct bio* align_bi;
+ struct md_rdev *rdev;
+ sector_t end_sector;
+
+ if (!in_chunk_boundary(mddev, raid_bio)) {
+ pr_debug("chunk_aligned_read : non aligned\n");
+ return 0;
+ }
+ /*
+ * use bio_clone_mddev to make a copy of the bio
+ */
+ align_bi = bio_clone_mddev(raid_bio, GFP_NOIO, mddev);
+ if (!align_bi)
+ return 0;
+ /*
+ * set bi_end_io to a new function, and set bi_private to the
+ * original bio.
+ */
+ align_bi->bi_end_io = raid5_align_endio;
+ align_bi->bi_private = raid_bio;
+ /*
+ * compute position
+ */
+ align_bi->bi_iter.bi_sector =
+ raid5_compute_sector(conf, raid_bio->bi_iter.bi_sector,
+ 0, &dd_idx, NULL);
+
+ end_sector = bio_end_sector(align_bi);
+ rcu_read_lock();
+ rdev = rcu_dereference(conf->disks[dd_idx].replacement);
+ if (!rdev || test_bit(Faulty, &rdev->flags) ||
+ rdev->recovery_offset < end_sector) {
+ rdev = rcu_dereference(conf->disks[dd_idx].rdev);
+ if (rdev &&
+ (test_bit(Faulty, &rdev->flags) ||
+ !(test_bit(In_sync, &rdev->flags) ||
+ rdev->recovery_offset >= end_sector)))
+ rdev = NULL;
+ }
+ if (rdev) {
+ sector_t first_bad;
+ int bad_sectors;
+
+ atomic_inc(&rdev->nr_pending);
+ rcu_read_unlock();
+ raid_bio->bi_next = (void*)rdev;
+ align_bi->bi_bdev = rdev->bdev;
+ __clear_bit(BIO_SEG_VALID, &align_bi->bi_flags);
+
+ if (!bio_fits_rdev(align_bi) ||
+ is_badblock(rdev, align_bi->bi_iter.bi_sector,
+ bio_sectors(align_bi),
+ &first_bad, &bad_sectors)) {
+ /* too big in some way, or has a known bad block */
+ bio_put(align_bi);
+ rdev_dec_pending(rdev, mddev);
+ return 0;
+ }
+
+ /* No reshape active, so we can trust rdev->data_offset */
+ align_bi->bi_iter.bi_sector += rdev->data_offset;
+
+ spin_lock_irq(&conf->device_lock);
+ wait_event_lock_irq(conf->wait_for_stripe,
+ conf->quiesce == 0,
+ conf->device_lock);
+ atomic_inc(&conf->active_aligned_reads);
+ spin_unlock_irq(&conf->device_lock);
+
+ if (mddev->gendisk)
+ trace_block_bio_remap(bdev_get_queue(align_bi->bi_bdev),
+ align_bi, disk_devt(mddev->gendisk),
+ raid_bio->bi_iter.bi_sector);
+ generic_make_request(align_bi);
+ return 1;
+ } else {
+ rcu_read_unlock();
+ bio_put(align_bi);
+ return 0;
+ }
+}
+
+/* __get_priority_stripe - get the next stripe to process
+ *
+ * Full stripe writes are allowed to pass preread active stripes up until
+ * the bypass_threshold is exceeded. In general the bypass_count
+ * increments when the handle_list is handled before the hold_list; however, it
+ * will not be incremented when STRIPE_IO_STARTED is sampled set signifying a
+ * stripe with in flight i/o. The bypass_count will be reset when the
+ * head of the hold_list has changed, i.e. the head was promoted to the
+ * handle_list.
+ */
+static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group)
+{
+ struct stripe_head *sh = NULL, *tmp;
+ struct list_head *handle_list = NULL;
+ struct r5worker_group *wg = NULL;
+
+ if (conf->worker_cnt_per_group == 0) {
+ handle_list = &conf->handle_list;
+ } else if (group != ANY_GROUP) {
+ handle_list = &conf->worker_groups[group].handle_list;
+ wg = &conf->worker_groups[group];
+ } else {
+ int i;
+ for (i = 0; i < conf->group_cnt; i++) {
+ handle_list = &conf->worker_groups[i].handle_list;
+ wg = &conf->worker_groups[i];
+ if (!list_empty(handle_list))
+ break;
+ }
+ }
+
+ pr_debug("%s: handle: %s hold: %s full_writes: %d bypass_count: %d\n",
+ __func__,
+ list_empty(handle_list) ? "empty" : "busy",
+ list_empty(&conf->hold_list) ? "empty" : "busy",
+ atomic_read(&conf->pending_full_writes), conf->bypass_count);
+
+ if (!list_empty(handle_list)) {
+ sh = list_entry(handle_list->next, typeof(*sh), lru);
+
+ if (list_empty(&conf->hold_list))
+ conf->bypass_count = 0;
+ else if (!test_bit(STRIPE_IO_STARTED, &sh->state)) {
+ if (conf->hold_list.next == conf->last_hold)
+ conf->bypass_count++;
+ else {
+ conf->last_hold = conf->hold_list.next;
+ conf->bypass_count -= conf->bypass_threshold;
+ if (conf->bypass_count < 0)
+ conf->bypass_count = 0;
+ }
+ }
+ } else if (!list_empty(&conf->hold_list) &&
+ ((conf->bypass_threshold &&
+ conf->bypass_count > conf->bypass_threshold) ||
+ atomic_read(&conf->pending_full_writes) == 0)) {
+
+ list_for_each_entry(tmp, &conf->hold_list, lru) {
+ if (conf->worker_cnt_per_group == 0 ||
+ group == ANY_GROUP ||
+ !cpu_online(tmp->cpu) ||
+ cpu_to_group(tmp->cpu) == group) {
+ sh = tmp;
+ break;
+ }
+ }
+
+ if (sh) {
+ conf->bypass_count -= conf->bypass_threshold;
+ if (conf->bypass_count < 0)
+ conf->bypass_count = 0;
+ }
+ wg = NULL;
+ }
+
+ if (!sh)
+ return NULL;
+
+ if (wg) {
+ wg->stripes_cnt--;
+ sh->group = NULL;
+ }
+ list_del_init(&sh->lru);
+ BUG_ON(atomic_inc_return(&sh->count) != 1);
+ return sh;
+}
+
+struct raid5_plug_cb {
+ struct blk_plug_cb cb;
+ struct list_head list;
+ struct list_head temp_inactive_list[NR_STRIPE_HASH_LOCKS];
+};
+
+static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule)
+{
+ struct raid5_plug_cb *cb = container_of(
+ blk_cb, struct raid5_plug_cb, cb);
+ struct stripe_head *sh;
+ struct mddev *mddev = cb->cb.data;
+ struct r5conf *conf = mddev->private;
+ int cnt = 0;
+ int hash;
+
+ if (cb->list.next && !list_empty(&cb->list)) {
+ spin_lock_irq(&conf->device_lock);
+ while (!list_empty(&cb->list)) {
+ sh = list_first_entry(&cb->list, struct stripe_head, lru);
+ list_del_init(&sh->lru);
+ /*
+ * avoid race release_stripe_plug() sees
+ * STRIPE_ON_UNPLUG_LIST clear but the stripe
+ * is still in our list
+ */
+ smp_mb__before_atomic();
+ clear_bit(STRIPE_ON_UNPLUG_LIST, &sh->state);
+ /*
+ * STRIPE_ON_RELEASE_LIST could be set here. In that
+ * case, the count is always > 1 here
+ */
+ hash = sh->hash_lock_index;
+ __release_stripe(conf, sh, &cb->temp_inactive_list[hash]);
+ cnt++;
+ }
+ spin_unlock_irq(&conf->device_lock);
+ }
+ release_inactive_stripe_list(conf, cb->temp_inactive_list,
+ NR_STRIPE_HASH_LOCKS);
+ if (mddev->queue)
+ trace_block_unplug(mddev->queue, cnt, !from_schedule);
+ kfree(cb);
+}
+
+static void release_stripe_plug(struct mddev *mddev,
+ struct stripe_head *sh)
+{
+ struct blk_plug_cb *blk_cb = blk_check_plugged(
+ raid5_unplug, mddev,
+ sizeof(struct raid5_plug_cb));
+ struct raid5_plug_cb *cb;
+
+ if (!blk_cb) {
+ release_stripe(sh);
+ return;
+ }
+
+ cb = container_of(blk_cb, struct raid5_plug_cb, cb);
+
+ if (cb->list.next == NULL) {
+ int i;
+ INIT_LIST_HEAD(&cb->list);
+ for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
+ INIT_LIST_HEAD(cb->temp_inactive_list + i);
+ }
+
+ if (!test_and_set_bit(STRIPE_ON_UNPLUG_LIST, &sh->state))
+ list_add_tail(&sh->lru, &cb->list);
+ else
+ release_stripe(sh);
+}
+
+static void make_discard_request(struct mddev *mddev, struct bio *bi)
+{
+ struct r5conf *conf = mddev->private;
+ sector_t logical_sector, last_sector;
+ struct stripe_head *sh;
+ int remaining;
+ int stripe_sectors;
+
+ if (mddev->reshape_position != MaxSector)
+ /* Skip discard while reshape is happening */
+ return;
+
+ logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1);
+ last_sector = bi->bi_iter.bi_sector + (bi->bi_iter.bi_size>>9);
+
+ bi->bi_next = NULL;
+ bi->bi_phys_segments = 1; /* over-loaded to count active stripes */
+
+ stripe_sectors = conf->chunk_sectors *
+ (conf->raid_disks - conf->max_degraded);
+ logical_sector = DIV_ROUND_UP_SECTOR_T(logical_sector,
+ stripe_sectors);
+ sector_div(last_sector, stripe_sectors);
+
+ logical_sector *= conf->chunk_sectors;
+ last_sector *= conf->chunk_sectors;
+
+ for (; logical_sector < last_sector;
+ logical_sector += STRIPE_SECTORS) {
+ DEFINE_WAIT(w);
+ int d;
+ again:
+ sh = get_active_stripe(conf, logical_sector, 0, 0, 0);
+ prepare_to_wait(&conf->wait_for_overlap, &w,
+ TASK_UNINTERRUPTIBLE);
+ set_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags);
+ if (test_bit(STRIPE_SYNCING, &sh->state)) {
+ release_stripe(sh);
+ schedule();
+ goto again;
+ }
+ clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags);
+ spin_lock_irq(&sh->stripe_lock);
+ for (d = 0; d < conf->raid_disks; d++) {
+ if (d == sh->pd_idx || d == sh->qd_idx)
+ continue;
+ if (sh->dev[d].towrite || sh->dev[d].toread) {
+ set_bit(R5_Overlap, &sh->dev[d].flags);
+ spin_unlock_irq(&sh->stripe_lock);
+ release_stripe(sh);
+ schedule();
+ goto again;
+ }
+ }
+ set_bit(STRIPE_DISCARD, &sh->state);
+ finish_wait(&conf->wait_for_overlap, &w);
+ sh->overwrite_disks = 0;
+ for (d = 0; d < conf->raid_disks; d++) {
+ if (d == sh->pd_idx || d == sh->qd_idx)
+ continue;
+ sh->dev[d].towrite = bi;
+ set_bit(R5_OVERWRITE, &sh->dev[d].flags);
+ raid5_inc_bi_active_stripes(bi);
+ sh->overwrite_disks++;
+ }
+ spin_unlock_irq(&sh->stripe_lock);
+ if (conf->mddev->bitmap) {
+ for (d = 0;
+ d < conf->raid_disks - conf->max_degraded;
+ d++)
+ bitmap_startwrite(mddev->bitmap,
+ sh->sector,
+ STRIPE_SECTORS,
+ 0);
+ sh->bm_seq = conf->seq_flush + 1;
+ set_bit(STRIPE_BIT_DELAY, &sh->state);
+ }
+
+ set_bit(STRIPE_HANDLE, &sh->state);
+ clear_bit(STRIPE_DELAYED, &sh->state);
+ if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
+ atomic_inc(&conf->preread_active_stripes);
+ release_stripe_plug(mddev, sh);
+ }
+
+ remaining = raid5_dec_bi_active_stripes(bi);
+ if (remaining == 0) {
+ md_write_end(mddev);
+ bio_endio(bi, 0);
+ }
+}
+
+static void make_request(struct mddev *mddev, struct bio * bi)
+{
+ struct r5conf *conf = mddev->private;
+ int dd_idx;
+ sector_t new_sector;
+ sector_t logical_sector, last_sector;
+ struct stripe_head *sh;
+ const int rw = bio_data_dir(bi);
+ int remaining;
+ DEFINE_WAIT(w);
+ bool do_prepare;
+
+ if (unlikely(bi->bi_rw & REQ_FLUSH)) {
+ md_flush_request(mddev, bi);
+ return;
+ }
+
+ md_write_start(mddev, bi);
+
+ /*
+ * If array is degraded, better not do chunk aligned read because
+ * later we might have to read it again in order to reconstruct
+ * data on failed drives.
+ */
+ if (rw == READ && mddev->degraded == 0 &&
+ mddev->reshape_position == MaxSector &&
+ chunk_aligned_read(mddev,bi))
+ return;
+
+ if (unlikely(bi->bi_rw & REQ_DISCARD)) {
+ make_discard_request(mddev, bi);
+ return;
+ }
+
+ logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1);
+ last_sector = bio_end_sector(bi);
+ bi->bi_next = NULL;
+ bi->bi_phys_segments = 1; /* over-loaded to count active stripes */
+
+ prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
+ for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
+ int previous;
+ int seq;
+
+ do_prepare = false;
+ retry:
+ seq = read_seqcount_begin(&conf->gen_lock);
+ previous = 0;
+ if (do_prepare)
+ prepare_to_wait(&conf->wait_for_overlap, &w,
+ TASK_UNINTERRUPTIBLE);
+ if (unlikely(conf->reshape_progress != MaxSector)) {
+ /* spinlock is needed as reshape_progress may be
+ * 64bit on a 32bit platform, and so it might be
+ * possible to see a half-updated value
+ * Of course reshape_progress could change after
+ * the lock is dropped, so once we get a reference
+ * to the stripe that we think it is, we will have
+ * to check again.
+ */
+ spin_lock_irq(&conf->device_lock);
+ if (mddev->reshape_backwards
+ ? logical_sector < conf->reshape_progress
+ : logical_sector >= conf->reshape_progress) {
+ previous = 1;
+ } else {
+ if (mddev->reshape_backwards
+ ? logical_sector < conf->reshape_safe
+ : logical_sector >= conf->reshape_safe) {
+ spin_unlock_irq(&conf->device_lock);
+ schedule();
+ do_prepare = true;
+ goto retry;
+ }
+ }
+ spin_unlock_irq(&conf->device_lock);
+ }
+
+ new_sector = raid5_compute_sector(conf, logical_sector,
+ previous,
+ &dd_idx, NULL);
+ pr_debug("raid456: make_request, sector %llu logical %llu\n",
+ (unsigned long long)new_sector,
+ (unsigned long long)logical_sector);
+
+ sh = get_active_stripe(conf, new_sector, previous,
+ (bi->bi_rw&RWA_MASK), 0);
+ if (sh) {
+ if (unlikely(previous)) {
+ /* expansion might have moved on while waiting for a
+ * stripe, so we must do the range check again.
+ * Expansion could still move past after this
+ * test, but as we are holding a reference to
+ * 'sh', we know that if that happens,
+ * STRIPE_EXPANDING will get set and the expansion
+ * won't proceed until we finish with the stripe.
+ */
+ int must_retry = 0;
+ spin_lock_irq(&conf->device_lock);
+ if (mddev->reshape_backwards
+ ? logical_sector >= conf->reshape_progress
+ : logical_sector < conf->reshape_progress)
+ /* mismatch, need to try again */
+ must_retry = 1;
+ spin_unlock_irq(&conf->device_lock);
+ if (must_retry) {
+ release_stripe(sh);
+ schedule();
+ do_prepare = true;
+ goto retry;
+ }
+ }
+ if (read_seqcount_retry(&conf->gen_lock, seq)) {
+ /* Might have got the wrong stripe_head
+ * by accident
+ */
+ release_stripe(sh);
+ goto retry;
+ }
+
+ if (rw == WRITE &&
+ logical_sector >= mddev->suspend_lo &&
+ logical_sector < mddev->suspend_hi) {
+ release_stripe(sh);
+ /* As the suspend_* range is controlled by
+ * userspace, we want an interruptible
+ * wait.
+ */
+ flush_signals(current);
+ prepare_to_wait(&conf->wait_for_overlap,
+ &w, TASK_INTERRUPTIBLE);
+ if (logical_sector >= mddev->suspend_lo &&
+ logical_sector < mddev->suspend_hi) {
+ schedule();
+ do_prepare = true;
+ }
+ goto retry;
+ }
+
+ if (test_bit(STRIPE_EXPANDING, &sh->state) ||
+ !add_stripe_bio(sh, bi, dd_idx, rw, previous)) {
+ /* Stripe is busy expanding or
+ * add failed due to overlap. Flush everything
+ * and wait a while
+ */
+ md_wakeup_thread(mddev->thread);
+ release_stripe(sh);
+ schedule();
+ do_prepare = true;
+ goto retry;
+ }
+ set_bit(STRIPE_HANDLE, &sh->state);
+ clear_bit(STRIPE_DELAYED, &sh->state);
+ if ((!sh->batch_head || sh == sh->batch_head) &&
+ (bi->bi_rw & REQ_SYNC) &&
+ !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
+ atomic_inc(&conf->preread_active_stripes);
+ release_stripe_plug(mddev, sh);
+ } else {
+ /* cannot get stripe for read-ahead, just give-up */
+ clear_bit(BIO_UPTODATE, &bi->bi_flags);
+ break;
+ }
+ }
+ finish_wait(&conf->wait_for_overlap, &w);
+
+ remaining = raid5_dec_bi_active_stripes(bi);
+ if (remaining == 0) {
+
+ if ( rw == WRITE )
+ md_write_end(mddev);
+
+ trace_block_bio_complete(bdev_get_queue(bi->bi_bdev),
+ bi, 0);
+ bio_endio(bi, 0);
+ }
+}
+
+static sector_t raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks);
+
+static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *skipped)
+{
+ /* reshaping is quite different to recovery/resync so it is
+ * handled quite separately ... here.
+ *
+ * On each call to sync_request, we gather one chunk worth of
+ * destination stripes and flag them as expanding.
+ * Then we find all the source stripes and request reads.
+ * As the reads complete, handle_stripe will copy the data
+ * into the destination stripe and release that stripe.
+ */
+ struct r5conf *conf = mddev->private;
+ struct stripe_head *sh;
+ sector_t first_sector, last_sector;
+ int raid_disks = conf->previous_raid_disks;
+ int data_disks = raid_disks - conf->max_degraded;
+ int new_data_disks = conf->raid_disks - conf->max_degraded;
+ int i;
+ int dd_idx;
+ sector_t writepos, readpos, safepos;
+ sector_t stripe_addr;
+ int reshape_sectors;
+ struct list_head stripes;
+
+ if (sector_nr == 0) {
+ /* If restarting in the middle, skip the initial sectors */
+ if (mddev->reshape_backwards &&
+ conf->reshape_progress < raid5_size(mddev, 0, 0)) {
+ sector_nr = raid5_size(mddev, 0, 0)
+ - conf->reshape_progress;
+ } else if (!mddev->reshape_backwards &&
+ conf->reshape_progress > 0)
+ sector_nr = conf->reshape_progress;
+ sector_div(sector_nr, new_data_disks);
+ if (sector_nr) {
+ mddev->curr_resync_completed = sector_nr;
+ sysfs_notify(&mddev->kobj, NULL, "sync_completed");
+ *skipped = 1;
+ return sector_nr;
+ }
+ }
+
+ /* We need to process a full chunk at a time.
+ * If old and new chunk sizes differ, we need to process the
+ * largest of these
+ */
+ if (mddev->new_chunk_sectors > mddev->chunk_sectors)
+ reshape_sectors = mddev->new_chunk_sectors;
+ else
+ reshape_sectors = mddev->chunk_sectors;
+
+ /* We update the metadata at least every 10 seconds, or when
+ * the data about to be copied would over-write the source of
+ * the data at the front of the range. i.e. one new_stripe
+ * along from reshape_progress new_maps to after where
+ * reshape_safe old_maps to
+ */
+ writepos = conf->reshape_progress;
+ sector_div(writepos, new_data_disks);
+ readpos = conf->reshape_progress;
+ sector_div(readpos, data_disks);
+ safepos = conf->reshape_safe;
+ sector_div(safepos, data_disks);
+ if (mddev->reshape_backwards) {
+ writepos -= min_t(sector_t, reshape_sectors, writepos);
+ readpos += reshape_sectors;
+ safepos += reshape_sectors;
+ } else {
+ writepos += reshape_sectors;
+ readpos -= min_t(sector_t, reshape_sectors, readpos);
+ safepos -= min_t(sector_t, reshape_sectors, safepos);
+ }
+
+ /* Having calculated the 'writepos' possibly use it
+ * to set 'stripe_addr' which is where we will write to.
+ */
+ if (mddev->reshape_backwards) {
+ BUG_ON(conf->reshape_progress == 0);
+ stripe_addr = writepos;
+ BUG_ON((mddev->dev_sectors &
+ ~((sector_t)reshape_sectors - 1))
+ - reshape_sectors - stripe_addr
+ != sector_nr);
+ } else {
+ BUG_ON(writepos != sector_nr + reshape_sectors);
+ stripe_addr = sector_nr;
+ }
+
+ /* 'writepos' is the most advanced device address we might write.
+ * 'readpos' is the least advanced device address we might read.
+ * 'safepos' is the least address recorded in the metadata as having
+ * been reshaped.
+ * If there is a min_offset_diff, these are adjusted either by
+ * increasing the safepos/readpos if diff is negative, or
+ * increasing writepos if diff is positive.
+ * If 'readpos' is then behind 'writepos', there is no way that we can
+ * ensure safety in the face of a crash - that must be done by userspace
+ * making a backup of the data. So in that case there is no particular
+ * rush to update metadata.
+ * Otherwise if 'safepos' is behind 'writepos', then we really need to
+ * update the metadata to advance 'safepos' to match 'readpos' so that
+ * we can be safe in the event of a crash.
+ * So we insist on updating metadata if safepos is behind writepos and
+ * readpos is beyond writepos.
+ * In any case, update the metadata every 10 seconds.
+ * Maybe that number should be configurable, but I'm not sure it is
+ * worth it.... maybe it could be a multiple of safemode_delay???
+ */
+ if (conf->min_offset_diff < 0) {
+ safepos += -conf->min_offset_diff;
+ readpos += -conf->min_offset_diff;
+ } else
+ writepos += conf->min_offset_diff;
+
+ if ((mddev->reshape_backwards
+ ? (safepos > writepos && readpos < writepos)
+ : (safepos < writepos && readpos > writepos)) ||
+ time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) {
+ /* Cannot proceed until we've updated the superblock... */
+ wait_event(conf->wait_for_overlap,
+ atomic_read(&conf->reshape_stripes)==0
+ || test_bit(MD_RECOVERY_INTR, &mddev->recovery));
+ if (atomic_read(&conf->reshape_stripes) != 0)
+ return 0;
+ mddev->reshape_position = conf->reshape_progress;
+ mddev->curr_resync_completed = sector_nr;
+ conf->reshape_checkpoint = jiffies;
+ set_bit(MD_CHANGE_DEVS, &mddev->flags);
+ md_wakeup_thread(mddev->thread);
+ wait_event(mddev->sb_wait, mddev->flags == 0 ||
+ test_bit(MD_RECOVERY_INTR, &mddev->recovery));
+ if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
+ return 0;
+ spin_lock_irq(&conf->device_lock);
+ conf->reshape_safe = mddev->reshape_position;
+ spin_unlock_irq(&conf->device_lock);
+ wake_up(&conf->wait_for_overlap);
+ sysfs_notify(&mddev->kobj, NULL, "sync_completed");
+ }
+
+ INIT_LIST_HEAD(&stripes);
+ for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) {
+ int j;
+ int skipped_disk = 0;
+ sh = get_active_stripe(conf, stripe_addr+i, 0, 0, 1);
+ set_bit(STRIPE_EXPANDING, &sh->state);
+ atomic_inc(&conf->reshape_stripes);
+ /* If any of this stripe is beyond the end of the old
+ * array, then we need to zero those blocks
+ */
+ for (j=sh->disks; j--;) {
+ sector_t s;
+ if (j == sh->pd_idx)
+ continue;
+ if (conf->level == 6 &&
+ j == sh->qd_idx)
+ continue;
+ s = compute_blocknr(sh, j, 0);
+ if (s < raid5_size(mddev, 0, 0)) {
+ skipped_disk = 1;
+ continue;
+ }
+ memset(page_address(sh->dev[j].page), 0, STRIPE_SIZE);
+ set_bit(R5_Expanded, &sh->dev[j].flags);
+ set_bit(R5_UPTODATE, &sh->dev[j].flags);
+ }
+ if (!skipped_disk) {
+ set_bit(STRIPE_EXPAND_READY, &sh->state);
+ set_bit(STRIPE_HANDLE, &sh->state);
+ }
+ list_add(&sh->lru, &stripes);
+ }
+ spin_lock_irq(&conf->device_lock);
+ if (mddev->reshape_backwards)
+ conf->reshape_progress -= reshape_sectors * new_data_disks;
+ else
+ conf->reshape_progress += reshape_sectors * new_data_disks;
+ spin_unlock_irq(&conf->device_lock);
+ /* Ok, those stripe are ready. We can start scheduling
+ * reads on the source stripes.
+ * The source stripes are determined by mapping the first and last
+ * block on the destination stripes.
+ */
+ first_sector =
+ raid5_compute_sector(conf, stripe_addr*(new_data_disks),
+ 1, &dd_idx, NULL);
+ last_sector =
+ raid5_compute_sector(conf, ((stripe_addr+reshape_sectors)
+ * new_data_disks - 1),
+ 1, &dd_idx, NULL);
+ if (last_sector >= mddev->dev_sectors)
+ last_sector = mddev->dev_sectors - 1;
+ while (first_sector <= last_sector) {
+ sh = get_active_stripe(conf, first_sector, 1, 0, 1);
+ set_bit(STRIPE_EXPAND_SOURCE, &sh->state);
+ set_bit(STRIPE_HANDLE, &sh->state);
+ release_stripe(sh);
+ first_sector += STRIPE_SECTORS;
+ }
+ /* Now that the sources are clearly marked, we can release
+ * the destination stripes
+ */
+ while (!list_empty(&stripes)) {
+ sh = list_entry(stripes.next, struct stripe_head, lru);
+ list_del_init(&sh->lru);
+ release_stripe(sh);
+ }
+ /* If this takes us to the resync_max point where we have to pause,
+ * then we need to write out the superblock.
+ */
+ sector_nr += reshape_sectors;
+ if ((sector_nr - mddev->curr_resync_completed) * 2
+ >= mddev->resync_max - mddev->curr_resync_completed) {
+ /* Cannot proceed until we've updated the superblock... */
+ wait_event(conf->wait_for_overlap,
+ atomic_read(&conf->reshape_stripes) == 0
+ || test_bit(MD_RECOVERY_INTR, &mddev->recovery));
+ if (atomic_read(&conf->reshape_stripes) != 0)
+ goto ret;
+ mddev->reshape_position = conf->reshape_progress;
+ mddev->curr_resync_completed = sector_nr;
+ conf->reshape_checkpoint = jiffies;
+ set_bit(MD_CHANGE_DEVS, &mddev->flags);
+ md_wakeup_thread(mddev->thread);
+ wait_event(mddev->sb_wait,
+ !test_bit(MD_CHANGE_DEVS, &mddev->flags)
+ || test_bit(MD_RECOVERY_INTR, &mddev->recovery));
+ if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
+ goto ret;
+ spin_lock_irq(&conf->device_lock);
+ conf->reshape_safe = mddev->reshape_position;
+ spin_unlock_irq(&conf->device_lock);
+ wake_up(&conf->wait_for_overlap);
+ sysfs_notify(&mddev->kobj, NULL, "sync_completed");
+ }
+ret:
+ return reshape_sectors;
+}
+
+static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipped)
+{
+ struct r5conf *conf = mddev->private;
+ struct stripe_head *sh;
+ sector_t max_sector = mddev->dev_sectors;
+ sector_t sync_blocks;
+ int still_degraded = 0;
+ int i;
+
+ if (sector_nr >= max_sector) {
+ /* just being told to finish up .. nothing much to do */
+
+ if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
+ end_reshape(conf);
+ return 0;
+ }
+
+ if (mddev->curr_resync < max_sector) /* aborted */
+ bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
+ &sync_blocks, 1);
+ else /* completed sync */
+ conf->fullsync = 0;
+ bitmap_close_sync(mddev->bitmap);
+
+ return 0;
+ }
+
+ /* Allow raid5_quiesce to complete */
+ wait_event(conf->wait_for_overlap, conf->quiesce != 2);
+
+ if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
+ return reshape_request(mddev, sector_nr, skipped);
+
+ /* No need to check resync_max as we never do more than one
+ * stripe, and as resync_max will always be on a chunk boundary,
+ * if the check in md_do_sync didn't fire, there is no chance
+ * of overstepping resync_max here
+ */
+
+ /* if there is too many failed drives and we are trying
+ * to resync, then assert that we are finished, because there is
+ * nothing we can do.
+ */
+ if (mddev->degraded >= conf->max_degraded &&
+ test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
+ sector_t rv = mddev->dev_sectors - sector_nr;
+ *skipped = 1;
+ return rv;
+ }
+ if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
+ !conf->fullsync &&
+ !bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) &&
+ sync_blocks >= STRIPE_SECTORS) {
+ /* we can skip this block, and probably more */
+ sync_blocks /= STRIPE_SECTORS;
+ *skipped = 1;
+ return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */
+ }
+
+ bitmap_cond_end_sync(mddev->bitmap, sector_nr);
+
+ sh = get_active_stripe(conf, sector_nr, 0, 1, 0);
+ if (sh == NULL) {
+ sh = get_active_stripe(conf, sector_nr, 0, 0, 0);
+ /* make sure we don't swamp the stripe cache if someone else
+ * is trying to get access
+ */
+ schedule_timeout_uninterruptible(1);
+ }
+ /* Need to check if array will still be degraded after recovery/resync
+ * Note in case of > 1 drive failures it's possible we're rebuilding
+ * one drive while leaving another faulty drive in array.
+ */
+ rcu_read_lock();
+ for (i = 0; i < conf->raid_disks; i++) {
+ struct md_rdev *rdev = ACCESS_ONCE(conf->disks[i].rdev);
+
+ if (rdev == NULL || test_bit(Faulty, &rdev->flags))
+ still_degraded = 1;
+ }
+ rcu_read_unlock();
+
+ bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded);
+
+ set_bit(STRIPE_SYNC_REQUESTED, &sh->state);
+ set_bit(STRIPE_HANDLE, &sh->state);
+
+ release_stripe(sh);
+
+ return STRIPE_SECTORS;
+}
+
+static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
+{
+ /* We may not be able to submit a whole bio at once as there
+ * may not be enough stripe_heads available.
+ * We cannot pre-allocate enough stripe_heads as we may need
+ * more than exist in the cache (if we allow ever large chunks).
+ * So we do one stripe head at a time and record in
+ * ->bi_hw_segments how many have been done.
+ *
+ * We *know* that this entire raid_bio is in one chunk, so
+ * it will be only one 'dd_idx' and only need one call to raid5_compute_sector.
+ */
+ struct stripe_head *sh;
+ int dd_idx;
+ sector_t sector, logical_sector, last_sector;
+ int scnt = 0;
+ int remaining;
+ int handled = 0;
+
+ logical_sector = raid_bio->bi_iter.bi_sector &
+ ~((sector_t)STRIPE_SECTORS-1);
+ sector = raid5_compute_sector(conf, logical_sector,
+ 0, &dd_idx, NULL);
+ last_sector = bio_end_sector(raid_bio);
+
+ for (; logical_sector < last_sector;
+ logical_sector += STRIPE_SECTORS,
+ sector += STRIPE_SECTORS,
+ scnt++) {
+
+ if (scnt < raid5_bi_processed_stripes(raid_bio))
+ /* already done this stripe */
+ continue;
+
+ sh = get_active_stripe(conf, sector, 0, 1, 1);
+
+ if (!sh) {
+ /* failed to get a stripe - must wait */
+ raid5_set_bi_processed_stripes(raid_bio, scnt);
+ conf->retry_read_aligned = raid_bio;
+ return handled;
+ }
+
+ if (!add_stripe_bio(sh, raid_bio, dd_idx, 0, 0)) {
+ release_stripe(sh);
+ raid5_set_bi_processed_stripes(raid_bio, scnt);
+ conf->retry_read_aligned = raid_bio;
+ return handled;
+ }
+
+ set_bit(R5_ReadNoMerge, &sh->dev[dd_idx].flags);
+ handle_stripe(sh);
+ release_stripe(sh);
+ handled++;
+ }
+ remaining = raid5_dec_bi_active_stripes(raid_bio);
+ if (remaining == 0) {
+ trace_block_bio_complete(bdev_get_queue(raid_bio->bi_bdev),
+ raid_bio, 0);
+ bio_endio(raid_bio, 0);
+ }
+ if (atomic_dec_and_test(&conf->active_aligned_reads))
+ wake_up(&conf->wait_for_stripe);
+ return handled;
+}
+
+static int handle_active_stripes(struct r5conf *conf, int group,
+ struct r5worker *worker,
+ struct list_head *temp_inactive_list)
+{
+ struct stripe_head *batch[MAX_STRIPE_BATCH], *sh;
+ int i, batch_size = 0, hash;
+ bool release_inactive = false;
+
+ while (batch_size < MAX_STRIPE_BATCH &&
+ (sh = __get_priority_stripe(conf, group)) != NULL)
+ batch[batch_size++] = sh;
+
+ if (batch_size == 0) {
+ for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
+ if (!list_empty(temp_inactive_list + i))
+ break;
+ if (i == NR_STRIPE_HASH_LOCKS)
+ return batch_size;
+ release_inactive = true;
+ }
+ spin_unlock_irq(&conf->device_lock);
+
+ release_inactive_stripe_list(conf, temp_inactive_list,
+ NR_STRIPE_HASH_LOCKS);
+
+ if (release_inactive) {
+ spin_lock_irq(&conf->device_lock);
+ return 0;
+ }
+
+ for (i = 0; i < batch_size; i++)
+ handle_stripe(batch[i]);
+
+ cond_resched();
+
+ spin_lock_irq(&conf->device_lock);
+ for (i = 0; i < batch_size; i++) {
+ hash = batch[i]->hash_lock_index;
+ __release_stripe(conf, batch[i], &temp_inactive_list[hash]);
+ }
+ return batch_size;
+}
+
+static void raid5_do_work(struct work_struct *work)
+{
+ struct r5worker *worker = container_of(work, struct r5worker, work);
+ struct r5worker_group *group = worker->group;
+ struct r5conf *conf = group->conf;
+ int group_id = group - conf->worker_groups;
+ int handled;
+ struct blk_plug plug;
+
+ pr_debug("+++ raid5worker active\n");
+
+ blk_start_plug(&plug);
+ handled = 0;
+ spin_lock_irq(&conf->device_lock);
+ while (1) {
+ int batch_size, released;
+
+ released = release_stripe_list(conf, worker->temp_inactive_list);
+
+ batch_size = handle_active_stripes(conf, group_id, worker,
+ worker->temp_inactive_list);
+ worker->working = false;
+ if (!batch_size && !released)
+ break;
+ handled += batch_size;
+ }
+ pr_debug("%d stripes handled\n", handled);
+
+ spin_unlock_irq(&conf->device_lock);
+ blk_finish_plug(&plug);
+
+ pr_debug("--- raid5worker inactive\n");
+}
+
+/*
+ * This is our raid5 kernel thread.
+ *
+ * We scan the hash table for stripes which can be handled now.
+ * During the scan, completed stripes are saved for us by the interrupt
+ * handler, so that they will not have to wait for our next wakeup.
+ */
+static void raid5d(struct md_thread *thread)
+{
+ struct mddev *mddev = thread->mddev;
+ struct r5conf *conf = mddev->private;
+ int handled;
+ struct blk_plug plug;
+
+ pr_debug("+++ raid5d active\n");
+
+ md_check_recovery(mddev);
+
+ blk_start_plug(&plug);
+ handled = 0;
+ spin_lock_irq(&conf->device_lock);
+ while (1) {
+ struct bio *bio;
+ int batch_size, released;
+
+ released = release_stripe_list(conf, conf->temp_inactive_list);
+ if (released)
+ clear_bit(R5_DID_ALLOC, &conf->cache_state);
+
+ if (
+ !list_empty(&conf->bitmap_list)) {
+ /* Now is a good time to flush some bitmap updates */
+ conf->seq_flush++;
+ spin_unlock_irq(&conf->device_lock);
+ bitmap_unplug(mddev->bitmap);
+ spin_lock_irq(&conf->device_lock);
+ conf->seq_write = conf->seq_flush;
+ activate_bit_delay(conf, conf->temp_inactive_list);
+ }
+ raid5_activate_delayed(conf);
+
+ while ((bio = remove_bio_from_retry(conf))) {
+ int ok;
+ spin_unlock_irq(&conf->device_lock);
+ ok = retry_aligned_read(conf, bio);
+ spin_lock_irq(&conf->device_lock);
+ if (!ok)
+ break;
+ handled++;
+ }
+
+ batch_size = handle_active_stripes(conf, ANY_GROUP, NULL,
+ conf->temp_inactive_list);
+ if (!batch_size && !released)
+ break;
+ handled += batch_size;
+
+ if (mddev->flags & ~(1<<MD_CHANGE_PENDING)) {
+ spin_unlock_irq(&conf->device_lock);
+ md_check_recovery(mddev);
+ spin_lock_irq(&conf->device_lock);
+ }
+ }
+ pr_debug("%d stripes handled\n", handled);
+
+ spin_unlock_irq(&conf->device_lock);
+ if (test_and_clear_bit(R5_ALLOC_MORE, &conf->cache_state)) {
+ grow_one_stripe(conf, __GFP_NOWARN);
+ /* Set flag even if allocation failed. This helps
+ * slow down allocation requests when mem is short
+ */
+ set_bit(R5_DID_ALLOC, &conf->cache_state);
+ }
+
+ async_tx_issue_pending_all();
+ blk_finish_plug(&plug);
+
+ pr_debug("--- raid5d inactive\n");
+}
+
+static ssize_t
+raid5_show_stripe_cache_size(struct mddev *mddev, char *page)
+{
+ struct r5conf *conf;
+ int ret = 0;
+ spin_lock(&mddev->lock);
+ conf = mddev->private;
+ if (conf)
+ ret = sprintf(page, "%d\n", conf->min_nr_stripes);
+ spin_unlock(&mddev->lock);
+ return ret;
+}
+
+int
+raid5_set_cache_size(struct mddev *mddev, int size)
+{
+ struct r5conf *conf = mddev->private;
+ int err;
+
+ if (size <= 16 || size > 32768)
+ return -EINVAL;
+
+ conf->min_nr_stripes = size;
+ while (size < conf->max_nr_stripes &&
+ drop_one_stripe(conf))
+ ;
+
+
+ err = md_allow_write(mddev);
+ if (err)
+ return err;
+
+ while (size > conf->max_nr_stripes)
+ if (!grow_one_stripe(conf, GFP_KERNEL))
+ break;
+
+ return 0;
+}
+EXPORT_SYMBOL(raid5_set_cache_size);
+
+static ssize_t
+raid5_store_stripe_cache_size(struct mddev *mddev, const char *page, size_t len)
+{
+ struct r5conf *conf;
+ unsigned long new;
+ int err;
+
+ if (len >= PAGE_SIZE)
+ return -EINVAL;
+ if (kstrtoul(page, 10, &new))
+ return -EINVAL;
+ err = mddev_lock(mddev);
+ if (err)
+ return err;
+ conf = mddev->private;
+ if (!conf)
+ err = -ENODEV;
+ else
+ err = raid5_set_cache_size(mddev, new);
+ mddev_unlock(mddev);
+
+ return err ?: len;
+}
+
+static struct md_sysfs_entry
+raid5_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR,
+ raid5_show_stripe_cache_size,
+ raid5_store_stripe_cache_size);
+
+static ssize_t
+raid5_show_rmw_level(struct mddev *mddev, char *page)
+{
+ struct r5conf *conf = mddev->private;
+ if (conf)
+ return sprintf(page, "%d\n", conf->rmw_level);
+ else
+ return 0;
+}
+
+static ssize_t
+raid5_store_rmw_level(struct mddev *mddev, const char *page, size_t len)
+{
+ struct r5conf *conf = mddev->private;
+ unsigned long new;
+
+ if (!conf)
+ return -ENODEV;
+
+ if (len >= PAGE_SIZE)
+ return -EINVAL;
+
+ if (kstrtoul(page, 10, &new))
+ return -EINVAL;
+
+ if (new != PARITY_DISABLE_RMW && !raid6_call.xor_syndrome)
+ return -EINVAL;
+
+ if (new != PARITY_DISABLE_RMW &&
+ new != PARITY_ENABLE_RMW &&
+ new != PARITY_PREFER_RMW)
+ return -EINVAL;
+
+ conf->rmw_level = new;
+ return len;
+}
+
+static struct md_sysfs_entry
+raid5_rmw_level = __ATTR(rmw_level, S_IRUGO | S_IWUSR,
+ raid5_show_rmw_level,
+ raid5_store_rmw_level);
+
+
+static ssize_t
+raid5_show_preread_threshold(struct mddev *mddev, char *page)
+{
+ struct r5conf *conf;
+ int ret = 0;
+ spin_lock(&mddev->lock);
+ conf = mddev->private;
+ if (conf)
+ ret = sprintf(page, "%d\n", conf->bypass_threshold);
+ spin_unlock(&mddev->lock);
+ return ret;
+}
+
+static ssize_t
+raid5_store_preread_threshold(struct mddev *mddev, const char *page, size_t len)
+{
+ struct r5conf *conf;
+ unsigned long new;
+ int err;
+
+ if (len >= PAGE_SIZE)
+ return -EINVAL;
+ if (kstrtoul(page, 10, &new))
+ return -EINVAL;
+
+ err = mddev_lock(mddev);
+ if (err)
+ return err;
+ conf = mddev->private;
+ if (!conf)
+ err = -ENODEV;
+ else if (new > conf->min_nr_stripes)
+ err = -EINVAL;
+ else
+ conf->bypass_threshold = new;
+ mddev_unlock(mddev);
+ return err ?: len;
+}
+
+static struct md_sysfs_entry
+raid5_preread_bypass_threshold = __ATTR(preread_bypass_threshold,
+ S_IRUGO | S_IWUSR,
+ raid5_show_preread_threshold,
+ raid5_store_preread_threshold);
+
+static ssize_t
+raid5_show_skip_copy(struct mddev *mddev, char *page)
+{
+ struct r5conf *conf;
+ int ret = 0;
+ spin_lock(&mddev->lock);
+ conf = mddev->private;
+ if (conf)
+ ret = sprintf(page, "%d\n", conf->skip_copy);
+ spin_unlock(&mddev->lock);
+ return ret;
+}
+
+static ssize_t
+raid5_store_skip_copy(struct mddev *mddev, const char *page, size_t len)
+{
+ struct r5conf *conf;
+ unsigned long new;
+ int err;
+
+ if (len >= PAGE_SIZE)
+ return -EINVAL;
+ if (kstrtoul(page, 10, &new))
+ return -EINVAL;
+ new = !!new;
+
+ err = mddev_lock(mddev);
+ if (err)
+ return err;
+ conf = mddev->private;
+ if (!conf)
+ err = -ENODEV;
+ else if (new != conf->skip_copy) {
+ mddev_suspend(mddev);
+ conf->skip_copy = new;
+ if (new)
+ mddev->queue->backing_dev_info.capabilities |=
+ BDI_CAP_STABLE_WRITES;
+ else
+ mddev->queue->backing_dev_info.capabilities &=
+ ~BDI_CAP_STABLE_WRITES;
+ mddev_resume(mddev);
+ }
+ mddev_unlock(mddev);
+ return err ?: len;
+}
+
+static struct md_sysfs_entry
+raid5_skip_copy = __ATTR(skip_copy, S_IRUGO | S_IWUSR,
+ raid5_show_skip_copy,
+ raid5_store_skip_copy);
+
+static ssize_t
+stripe_cache_active_show(struct mddev *mddev, char *page)
+{
+ struct r5conf *conf = mddev->private;
+ if (conf)
+ return sprintf(page, "%d\n", atomic_read(&conf->active_stripes));
+ else
+ return 0;
+}
+
+static struct md_sysfs_entry
+raid5_stripecache_active = __ATTR_RO(stripe_cache_active);
+
+static ssize_t
+raid5_show_group_thread_cnt(struct mddev *mddev, char *page)
+{
+ struct r5conf *conf;
+ int ret = 0;
+ spin_lock(&mddev->lock);
+ conf = mddev->private;
+ if (conf)
+ ret = sprintf(page, "%d\n", conf->worker_cnt_per_group);
+ spin_unlock(&mddev->lock);
+ return ret;
+}
+
+static int alloc_thread_groups(struct r5conf *conf, int cnt,
+ int *group_cnt,
+ int *worker_cnt_per_group,
+ struct r5worker_group **worker_groups);
+static ssize_t
+raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len)
+{
+ struct r5conf *conf;
+ unsigned long new;
+ int err;
+ struct r5worker_group *new_groups, *old_groups;
+ int group_cnt, worker_cnt_per_group;
+
+ if (len >= PAGE_SIZE)
+ return -EINVAL;
+ if (kstrtoul(page, 10, &new))
+ return -EINVAL;
+
+ err = mddev_lock(mddev);
+ if (err)
+ return err;
+ conf = mddev->private;
+ if (!conf)
+ err = -ENODEV;
+ else if (new != conf->worker_cnt_per_group) {
+ mddev_suspend(mddev);
+
+ old_groups = conf->worker_groups;
+ if (old_groups)
+ flush_workqueue(raid5_wq);
+
+ err = alloc_thread_groups(conf, new,
+ &group_cnt, &worker_cnt_per_group,
+ &new_groups);
+ if (!err) {
+ spin_lock_irq(&conf->device_lock);
+ conf->group_cnt = group_cnt;
+ conf->worker_cnt_per_group = worker_cnt_per_group;
+ conf->worker_groups = new_groups;
+ spin_unlock_irq(&conf->device_lock);
+
+ if (old_groups)
+ kfree(old_groups[0].workers);
+ kfree(old_groups);
+ }
+ mddev_resume(mddev);
+ }
+ mddev_unlock(mddev);
+
+ return err ?: len;
+}
+
+static struct md_sysfs_entry
+raid5_group_thread_cnt = __ATTR(group_thread_cnt, S_IRUGO | S_IWUSR,
+ raid5_show_group_thread_cnt,
+ raid5_store_group_thread_cnt);
+
+static struct attribute *raid5_attrs[] = {
+ &raid5_stripecache_size.attr,
+ &raid5_stripecache_active.attr,
+ &raid5_preread_bypass_threshold.attr,
+ &raid5_group_thread_cnt.attr,
+ &raid5_skip_copy.attr,
+ &raid5_rmw_level.attr,
+ NULL,
+};
+static struct attribute_group raid5_attrs_group = {
+ .name = NULL,
+ .attrs = raid5_attrs,
+};
+
+static int alloc_thread_groups(struct r5conf *conf, int cnt,
+ int *group_cnt,
+ int *worker_cnt_per_group,
+ struct r5worker_group **worker_groups)
+{
+ int i, j, k;
+ ssize_t size;
+ struct r5worker *workers;
+
+ *worker_cnt_per_group = cnt;
+ if (cnt == 0) {
+ *group_cnt = 0;
+ *worker_groups = NULL;
+ return 0;
+ }
+ *group_cnt = num_possible_nodes();
+ size = sizeof(struct r5worker) * cnt;
+ workers = kzalloc(size * *group_cnt, GFP_NOIO);
+ *worker_groups = kzalloc(sizeof(struct r5worker_group) *
+ *group_cnt, GFP_NOIO);
+ if (!*worker_groups || !workers) {
+ kfree(workers);
+ kfree(*worker_groups);
+ return -ENOMEM;
+ }
+
+ for (i = 0; i < *group_cnt; i++) {
+ struct r5worker_group *group;
+
+ group = &(*worker_groups)[i];
+ INIT_LIST_HEAD(&group->handle_list);
+ group->conf = conf;
+ group->workers = workers + i * cnt;
+
+ for (j = 0; j < cnt; j++) {
+ struct r5worker *worker = group->workers + j;
+ worker->group = group;
+ INIT_WORK(&worker->work, raid5_do_work);
+
+ for (k = 0; k < NR_STRIPE_HASH_LOCKS; k++)
+ INIT_LIST_HEAD(worker->temp_inactive_list + k);
+ }
+ }
+
+ return 0;
+}
+
+static void free_thread_groups(struct r5conf *conf)
+{
+ if (conf->worker_groups)
+ kfree(conf->worker_groups[0].workers);
+ kfree(conf->worker_groups);
+ conf->worker_groups = NULL;
+}
+
+static sector_t
+raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks)
+{
+ struct r5conf *conf = mddev->private;
+
+ if (!sectors)
+ sectors = mddev->dev_sectors;
+ if (!raid_disks)
+ /* size is defined by the smallest of previous and new size */
+ raid_disks = min(conf->raid_disks, conf->previous_raid_disks);
+
+ sectors &= ~((sector_t)mddev->chunk_sectors - 1);
+ sectors &= ~((sector_t)mddev->new_chunk_sectors - 1);
+ return sectors * (raid_disks - conf->max_degraded);
+}
+
+static void free_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu)
+{
+ safe_put_page(percpu->spare_page);
+ if (percpu->scribble)
+ flex_array_free(percpu->scribble);
+ percpu->spare_page = NULL;
+ percpu->scribble = NULL;
+}
+
+static int alloc_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu)
+{
+ if (conf->level == 6 && !percpu->spare_page)
+ percpu->spare_page = alloc_page(GFP_KERNEL);
+ if (!percpu->scribble)
+ percpu->scribble = scribble_alloc(max(conf->raid_disks,
+ conf->previous_raid_disks),
+ max(conf->chunk_sectors,
+ conf->prev_chunk_sectors)
+ / STRIPE_SECTORS,
+ GFP_KERNEL);
+
+ if (!percpu->scribble || (conf->level == 6 && !percpu->spare_page)) {
+ free_scratch_buffer(conf, percpu);
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static void raid5_free_percpu(struct r5conf *conf)
+{
+ unsigned long cpu;
+
+ if (!conf->percpu)
+ return;
+
+#ifdef CONFIG_HOTPLUG_CPU
+ unregister_cpu_notifier(&conf->cpu_notify);
+#endif
+
+ get_online_cpus();
+ for_each_possible_cpu(cpu)
+ free_scratch_buffer(conf, per_cpu_ptr(conf->percpu, cpu));
+ put_online_cpus();
+
+ free_percpu(conf->percpu);
+}
+
+static void free_conf(struct r5conf *conf)
+{
+ if (conf->shrinker.seeks)
+ unregister_shrinker(&conf->shrinker);
+ free_thread_groups(conf);
+ shrink_stripes(conf);
+ raid5_free_percpu(conf);
+ kfree(conf->disks);
+ kfree(conf->stripe_hashtbl);
+ kfree(conf);
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action,
+ void *hcpu)
+{
+ struct r5conf *conf = container_of(nfb, struct r5conf, cpu_notify);
+ long cpu = (long)hcpu;
+ struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu);
+
+ switch (action) {
+ case CPU_UP_PREPARE:
+ case CPU_UP_PREPARE_FROZEN:
+ if (alloc_scratch_buffer(conf, percpu)) {
+ pr_err("%s: failed memory allocation for cpu%ld\n",
+ __func__, cpu);
+ return notifier_from_errno(-ENOMEM);
+ }
+ break;
+ case CPU_DEAD:
+ case CPU_DEAD_FROZEN:
+ free_scratch_buffer(conf, per_cpu_ptr(conf->percpu, cpu));
+ break;
+ default:
+ break;
+ }
+ return NOTIFY_OK;
+}
+#endif
+
+static int raid5_alloc_percpu(struct r5conf *conf)
+{
+ unsigned long cpu;
+ int err = 0;
+
+ conf->percpu = alloc_percpu(struct raid5_percpu);
+ if (!conf->percpu)
+ return -ENOMEM;
+
+#ifdef CONFIG_HOTPLUG_CPU
+ conf->cpu_notify.notifier_call = raid456_cpu_notify;
+ conf->cpu_notify.priority = 0;
+ err = register_cpu_notifier(&conf->cpu_notify);
+ if (err)
+ return err;
+#endif
+
+ get_online_cpus();
+ for_each_present_cpu(cpu) {
+ err = alloc_scratch_buffer(conf, per_cpu_ptr(conf->percpu, cpu));
+ if (err) {
+ pr_err("%s: failed memory allocation for cpu%ld\n",
+ __func__, cpu);
+ break;
+ }
+ }
+ put_online_cpus();
+
+ return err;
+}
+
+static unsigned long raid5_cache_scan(struct shrinker *shrink,
+ struct shrink_control *sc)
+{
+ struct r5conf *conf = container_of(shrink, struct r5conf, shrinker);
+ int ret = 0;
+ while (ret < sc->nr_to_scan) {
+ if (drop_one_stripe(conf) == 0)
+ return SHRINK_STOP;
+ ret++;
+ }
+ return ret;
+}
+
+static unsigned long raid5_cache_count(struct shrinker *shrink,
+ struct shrink_control *sc)
+{
+ struct r5conf *conf = container_of(shrink, struct r5conf, shrinker);
+
+ if (conf->max_nr_stripes < conf->min_nr_stripes)
+ /* unlikely, but not impossible */
+ return 0;
+ return conf->max_nr_stripes - conf->min_nr_stripes;
+}
+
+static struct r5conf *setup_conf(struct mddev *mddev)
+{
+ struct r5conf *conf;
+ int raid_disk, memory, max_disks;
+ struct md_rdev *rdev;
+ struct disk_info *disk;
+ char pers_name[6];
+ int i;
+ int group_cnt, worker_cnt_per_group;
+ struct r5worker_group *new_group;
+
+ if (mddev->new_level != 5
+ && mddev->new_level != 4
+ && mddev->new_level != 6) {
+ printk(KERN_ERR "md/raid:%s: raid level not set to 4/5/6 (%d)\n",
+ mdname(mddev), mddev->new_level);
+ return ERR_PTR(-EIO);
+ }
+ if ((mddev->new_level == 5
+ && !algorithm_valid_raid5(mddev->new_layout)) ||
+ (mddev->new_level == 6
+ && !algorithm_valid_raid6(mddev->new_layout))) {
+ printk(KERN_ERR "md/raid:%s: layout %d not supported\n",
+ mdname(mddev), mddev->new_layout);
+ return ERR_PTR(-EIO);
+ }
+ if (mddev->new_level == 6 && mddev->raid_disks < 4) {
+ printk(KERN_ERR "md/raid:%s: not enough configured devices (%d, minimum 4)\n",
+ mdname(mddev), mddev->raid_disks);
+ return ERR_PTR(-EINVAL);
+ }
+
+ if (!mddev->new_chunk_sectors ||
+ (mddev->new_chunk_sectors << 9) % PAGE_SIZE ||
+ !is_power_of_2(mddev->new_chunk_sectors)) {
+ printk(KERN_ERR "md/raid:%s: invalid chunk size %d\n",
+ mdname(mddev), mddev->new_chunk_sectors << 9);
+ return ERR_PTR(-EINVAL);
+ }
+
+ conf = kzalloc(sizeof(struct r5conf), GFP_KERNEL);
+ if (conf == NULL)
+ goto abort;
+ /* Don't enable multi-threading by default*/
+ if (!alloc_thread_groups(conf, 0, &group_cnt, &worker_cnt_per_group,
+ &new_group)) {
+ conf->group_cnt = group_cnt;
+ conf->worker_cnt_per_group = worker_cnt_per_group;
+ conf->worker_groups = new_group;
+ } else
+ goto abort;
+ spin_lock_init(&conf->device_lock);
+ seqcount_init(&conf->gen_lock);
+ init_waitqueue_head(&conf->wait_for_stripe);
+ init_waitqueue_head(&conf->wait_for_overlap);
+ INIT_LIST_HEAD(&conf->handle_list);
+ INIT_LIST_HEAD(&conf->hold_list);
+ INIT_LIST_HEAD(&conf->delayed_list);
+ INIT_LIST_HEAD(&conf->bitmap_list);
+ init_llist_head(&conf->released_stripes);
+ atomic_set(&conf->active_stripes, 0);
+ atomic_set(&conf->preread_active_stripes, 0);
+ atomic_set(&conf->active_aligned_reads, 0);
+ conf->bypass_threshold = BYPASS_THRESHOLD;
+ conf->recovery_disabled = mddev->recovery_disabled - 1;
+
+ conf->raid_disks = mddev->raid_disks;
+ if (mddev->reshape_position == MaxSector)
+ conf->previous_raid_disks = mddev->raid_disks;
+ else
+ conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks;
+ max_disks = max(conf->raid_disks, conf->previous_raid_disks);
+
+ conf->disks = kzalloc(max_disks * sizeof(struct disk_info),
+ GFP_KERNEL);
+ if (!conf->disks)
+ goto abort;
+
+ conf->mddev = mddev;
+
+ if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL)
+ goto abort;
+
+ /* We init hash_locks[0] separately to that it can be used
+ * as the reference lock in the spin_lock_nest_lock() call
+ * in lock_all_device_hash_locks_irq in order to convince
+ * lockdep that we know what we are doing.
+ */
+ spin_lock_init(conf->hash_locks);
+ for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++)
+ spin_lock_init(conf->hash_locks + i);
+
+ for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
+ INIT_LIST_HEAD(conf->inactive_list + i);
+
+ for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
+ INIT_LIST_HEAD(conf->temp_inactive_list + i);
+
+ conf->level = mddev->new_level;
+ conf->chunk_sectors = mddev->new_chunk_sectors;
+ if (raid5_alloc_percpu(conf) != 0)
+ goto abort;
+
+ pr_debug("raid456: run(%s) called.\n", mdname(mddev));
+
+ rdev_for_each(rdev, mddev) {
+ raid_disk = rdev->raid_disk;
+ if (raid_disk >= max_disks
+ || raid_disk < 0)
+ continue;
+ disk = conf->disks + raid_disk;
+
+ if (test_bit(Replacement, &rdev->flags)) {
+ if (disk->replacement)
+ goto abort;
+ disk->replacement = rdev;
+ } else {
+ if (disk->rdev)
+ goto abort;
+ disk->rdev = rdev;
+ }
+
+ if (test_bit(In_sync, &rdev->flags)) {
+ char b[BDEVNAME_SIZE];
+ printk(KERN_INFO "md/raid:%s: device %s operational as raid"
+ " disk %d\n",
+ mdname(mddev), bdevname(rdev->bdev, b), raid_disk);
+ } else if (rdev->saved_raid_disk != raid_disk)
+ /* Cannot rely on bitmap to complete recovery */
+ conf->fullsync = 1;
+ }
+
+ conf->level = mddev->new_level;
+ if (conf->level == 6) {
+ conf->max_degraded = 2;
+ if (raid6_call.xor_syndrome)
+ conf->rmw_level = PARITY_ENABLE_RMW;
+ else
+ conf->rmw_level = PARITY_DISABLE_RMW;
+ } else {
+ conf->max_degraded = 1;
+ conf->rmw_level = PARITY_ENABLE_RMW;
+ }
+ conf->algorithm = mddev->new_layout;
+ conf->reshape_progress = mddev->reshape_position;
+ if (conf->reshape_progress != MaxSector) {
+ conf->prev_chunk_sectors = mddev->chunk_sectors;
+ conf->prev_algo = mddev->layout;
+ }
+
+ conf->min_nr_stripes = NR_STRIPES;
+ memory = conf->min_nr_stripes * (sizeof(struct stripe_head) +
+ max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024;
+ atomic_set(&conf->empty_inactive_list_nr, NR_STRIPE_HASH_LOCKS);
+ if (grow_stripes(conf, conf->min_nr_stripes)) {
+ printk(KERN_ERR
+ "md/raid:%s: couldn't allocate %dkB for buffers\n",
+ mdname(mddev), memory);
+ goto abort;
+ } else
+ printk(KERN_INFO "md/raid:%s: allocated %dkB\n",
+ mdname(mddev), memory);
+ /*
+ * Losing a stripe head costs more than the time to refill it,
+ * it reduces the queue depth and so can hurt throughput.
+ * So set it rather large, scaled by number of devices.
+ */
+ conf->shrinker.seeks = DEFAULT_SEEKS * conf->raid_disks * 4;
+ conf->shrinker.scan_objects = raid5_cache_scan;
+ conf->shrinker.count_objects = raid5_cache_count;
+ conf->shrinker.batch = 128;
+ conf->shrinker.flags = 0;
+ register_shrinker(&conf->shrinker);
+
+ sprintf(pers_name, "raid%d", mddev->new_level);
+ conf->thread = md_register_thread(raid5d, mddev, pers_name);
+ if (!conf->thread) {
+ printk(KERN_ERR
+ "md/raid:%s: couldn't allocate thread.\n",
+ mdname(mddev));
+ goto abort;
+ }
+
+ return conf;
+
+ abort:
+ if (conf) {
+ free_conf(conf);
+ return ERR_PTR(-EIO);
+ } else
+ return ERR_PTR(-ENOMEM);
+}
+
+static int only_parity(int raid_disk, int algo, int raid_disks, int max_degraded)
+{
+ switch (algo) {
+ case ALGORITHM_PARITY_0:
+ if (raid_disk < max_degraded)
+ return 1;
+ break;
+ case ALGORITHM_PARITY_N:
+ if (raid_disk >= raid_disks - max_degraded)
+ return 1;
+ break;
+ case ALGORITHM_PARITY_0_6:
+ if (raid_disk == 0 ||
+ raid_disk == raid_disks - 1)
+ return 1;
+ break;
+ case ALGORITHM_LEFT_ASYMMETRIC_6:
+ case ALGORITHM_RIGHT_ASYMMETRIC_6:
+ case ALGORITHM_LEFT_SYMMETRIC_6:
+ case ALGORITHM_RIGHT_SYMMETRIC_6:
+ if (raid_disk == raid_disks - 1)
+ return 1;
+ }
+ return 0;
+}
+
+static int run(struct mddev *mddev)
+{
+ struct r5conf *conf;
+ int working_disks = 0;
+ int dirty_parity_disks = 0;
+ struct md_rdev *rdev;
+ sector_t reshape_offset = 0;
+ int i;
+ long long min_offset_diff = 0;
+ int first = 1;
+
+ if (mddev->recovery_cp != MaxSector)
+ printk(KERN_NOTICE "md/raid:%s: not clean"
+ " -- starting background reconstruction\n",
+ mdname(mddev));
+
+ rdev_for_each(rdev, mddev) {
+ long long diff;
+ if (rdev->raid_disk < 0)
+ continue;
+ diff = (rdev->new_data_offset - rdev->data_offset);
+ if (first) {
+ min_offset_diff = diff;
+ first = 0;
+ } else if (mddev->reshape_backwards &&
+ diff < min_offset_diff)
+ min_offset_diff = diff;
+ else if (!mddev->reshape_backwards &&
+ diff > min_offset_diff)
+ min_offset_diff = diff;
+ }
+
+ if (mddev->reshape_position != MaxSector) {
+ /* Check that we can continue the reshape.
+ * Difficulties arise if the stripe we would write to
+ * next is at or after the stripe we would read from next.
+ * For a reshape that changes the number of devices, this
+ * is only possible for a very short time, and mdadm makes
+ * sure that time appears to have past before assembling
+ * the array. So we fail if that time hasn't passed.
+ * For a reshape that keeps the number of devices the same
+ * mdadm must be monitoring the reshape can keeping the
+ * critical areas read-only and backed up. It will start
+ * the array in read-only mode, so we check for that.
+ */
+ sector_t here_new, here_old;
+ int old_disks;
+ int max_degraded = (mddev->level == 6 ? 2 : 1);
+
+ if (mddev->new_level != mddev->level) {
+ printk(KERN_ERR "md/raid:%s: unsupported reshape "
+ "required - aborting.\n",
+ mdname(mddev));
+ return -EINVAL;
+ }
+ old_disks = mddev->raid_disks - mddev->delta_disks;
+ /* reshape_position must be on a new-stripe boundary, and one
+ * further up in new geometry must map after here in old
+ * geometry.
+ */
+ here_new = mddev->reshape_position;
+ if (sector_div(here_new, mddev->new_chunk_sectors *
+ (mddev->raid_disks - max_degraded))) {
+ printk(KERN_ERR "md/raid:%s: reshape_position not "
+ "on a stripe boundary\n", mdname(mddev));
+ return -EINVAL;
+ }
+ reshape_offset = here_new * mddev->new_chunk_sectors;
+ /* here_new is the stripe we will write to */
+ here_old = mddev->reshape_position;
+ sector_div(here_old, mddev->chunk_sectors *
+ (old_disks-max_degraded));
+ /* here_old is the first stripe that we might need to read
+ * from */
+ if (mddev->delta_disks == 0) {
+ if ((here_new * mddev->new_chunk_sectors !=
+ here_old * mddev->chunk_sectors)) {
+ printk(KERN_ERR "md/raid:%s: reshape position is"
+ " confused - aborting\n", mdname(mddev));
+ return -EINVAL;
+ }
+ /* We cannot be sure it is safe to start an in-place
+ * reshape. It is only safe if user-space is monitoring
+ * and taking constant backups.
+ * mdadm always starts a situation like this in
+ * readonly mode so it can take control before
+ * allowing any writes. So just check for that.
+ */
+ if (abs(min_offset_diff) >= mddev->chunk_sectors &&
+ abs(min_offset_diff) >= mddev->new_chunk_sectors)
+ /* not really in-place - so OK */;
+ else if (mddev->ro == 0) {
+ printk(KERN_ERR "md/raid:%s: in-place reshape "
+ "must be started in read-only mode "
+ "- aborting\n",
+ mdname(mddev));
+ return -EINVAL;
+ }
+ } else if (mddev->reshape_backwards
+ ? (here_new * mddev->new_chunk_sectors + min_offset_diff <=
+ here_old * mddev->chunk_sectors)
+ : (here_new * mddev->new_chunk_sectors >=
+ here_old * mddev->chunk_sectors + (-min_offset_diff))) {
+ /* Reading from the same stripe as writing to - bad */
+ printk(KERN_ERR "md/raid:%s: reshape_position too early for "
+ "auto-recovery - aborting.\n",
+ mdname(mddev));
+ return -EINVAL;
+ }
+ printk(KERN_INFO "md/raid:%s: reshape will continue\n",
+ mdname(mddev));
+ /* OK, we should be able to continue; */
+ } else {
+ BUG_ON(mddev->level != mddev->new_level);
+ BUG_ON(mddev->layout != mddev->new_layout);
+ BUG_ON(mddev->chunk_sectors != mddev->new_chunk_sectors);
+ BUG_ON(mddev->delta_disks != 0);
+ }
+
+ if (mddev->private == NULL)
+ conf = setup_conf(mddev);
+ else
+ conf = mddev->private;
+
+ if (IS_ERR(conf))
+ return PTR_ERR(conf);
+
+ conf->min_offset_diff = min_offset_diff;
+ mddev->thread = conf->thread;
+ conf->thread = NULL;
+ mddev->private = conf;
+
+ for (i = 0; i < conf->raid_disks && conf->previous_raid_disks;
+ i++) {
+ rdev = conf->disks[i].rdev;
+ if (!rdev && conf->disks[i].replacement) {
+ /* The replacement is all we have yet */
+ rdev = conf->disks[i].replacement;
+ conf->disks[i].replacement = NULL;
+ clear_bit(Replacement, &rdev->flags);
+ conf->disks[i].rdev = rdev;
+ }
+ if (!rdev)
+ continue;
+ if (conf->disks[i].replacement &&
+ conf->reshape_progress != MaxSector) {
+ /* replacements and reshape simply do not mix. */
+ printk(KERN_ERR "md: cannot handle concurrent "
+ "replacement and reshape.\n");
+ goto abort;
+ }
+ if (test_bit(In_sync, &rdev->flags)) {
+ working_disks++;
+ continue;
+ }
+ /* This disc is not fully in-sync. However if it
+ * just stored parity (beyond the recovery_offset),
+ * when we don't need to be concerned about the
+ * array being dirty.
+ * When reshape goes 'backwards', we never have
+ * partially completed devices, so we only need
+ * to worry about reshape going forwards.
+ */
+ /* Hack because v0.91 doesn't store recovery_offset properly. */
+ if (mddev->major_version == 0 &&
+ mddev->minor_version > 90)
+ rdev->recovery_offset = reshape_offset;
+
+ if (rdev->recovery_offset < reshape_offset) {
+ /* We need to check old and new layout */
+ if (!only_parity(rdev->raid_disk,
+ conf->algorithm,
+ conf->raid_disks,
+ conf->max_degraded))
+ continue;
+ }
+ if (!only_parity(rdev->raid_disk,
+ conf->prev_algo,
+ conf->previous_raid_disks,
+ conf->max_degraded))
+ continue;
+ dirty_parity_disks++;
+ }
+
+ /*
+ * 0 for a fully functional array, 1 or 2 for a degraded array.
+ */
+ mddev->degraded = calc_degraded(conf);
+
+ if (has_failed(conf)) {
+ printk(KERN_ERR "md/raid:%s: not enough operational devices"
+ " (%d/%d failed)\n",
+ mdname(mddev), mddev->degraded, conf->raid_disks);
+ goto abort;
+ }
+
+ /* device size must be a multiple of chunk size */
+ mddev->dev_sectors &= ~(mddev->chunk_sectors - 1);
+ mddev->resync_max_sectors = mddev->dev_sectors;
+
+ if (mddev->degraded > dirty_parity_disks &&
+ mddev->recovery_cp != MaxSector) {
+ if (mddev->ok_start_degraded)
+ printk(KERN_WARNING
+ "md/raid:%s: starting dirty degraded array"
+ " - data corruption possible.\n",
+ mdname(mddev));
+ else {
+ printk(KERN_ERR
+ "md/raid:%s: cannot start dirty degraded array.\n",
+ mdname(mddev));
+ goto abort;
+ }
+ }
+
+ if (mddev->degraded == 0)
+ printk(KERN_INFO "md/raid:%s: raid level %d active with %d out of %d"
+ " devices, algorithm %d\n", mdname(mddev), conf->level,
+ mddev->raid_disks-mddev->degraded, mddev->raid_disks,
+ mddev->new_layout);
+ else
+ printk(KERN_ALERT "md/raid:%s: raid level %d active with %d"
+ " out of %d devices, algorithm %d\n",
+ mdname(mddev), conf->level,
+ mddev->raid_disks - mddev->degraded,
+ mddev->raid_disks, mddev->new_layout);
+
+ print_raid5_conf(conf);
+
+ if (conf->reshape_progress != MaxSector) {
+ conf->reshape_safe = conf->reshape_progress;
+ atomic_set(&conf->reshape_stripes, 0);
+ clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
+ clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
+ set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
+ set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
+ mddev->sync_thread = md_register_thread(md_do_sync, mddev,
+ "reshape");
+ }
+
+ /* Ok, everything is just fine now */
+ if (mddev->to_remove == &raid5_attrs_group)
+ mddev->to_remove = NULL;
+ else if (mddev->kobj.sd &&
+ sysfs_create_group(&mddev->kobj, &raid5_attrs_group))
+ printk(KERN_WARNING
+ "raid5: failed to create sysfs attributes for %s\n",
+ mdname(mddev));
+ md_set_array_sectors(mddev, raid5_size(mddev, 0, 0));
+
+ if (mddev->queue) {
+ int chunk_size;
+ bool discard_supported = true;
+ /* read-ahead size must cover two whole stripes, which
+ * is 2 * (datadisks) * chunksize where 'n' is the
+ * number of raid devices
+ */
+ int data_disks = conf->previous_raid_disks - conf->max_degraded;
+ int stripe = data_disks *
+ ((mddev->chunk_sectors << 9) / PAGE_SIZE);
+ if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
+ mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
+
+ chunk_size = mddev->chunk_sectors << 9;
+ blk_queue_io_min(mddev->queue, chunk_size);
+ blk_queue_io_opt(mddev->queue, chunk_size *
+ (conf->raid_disks - conf->max_degraded));
+ mddev->queue->limits.raid_partial_stripes_expensive = 1;
+ /*
+ * We can only discard a whole stripe. It doesn't make sense to
+ * discard data disk but write parity disk
+ */
+ stripe = stripe * PAGE_SIZE;
+ /* Round up to power of 2, as discard handling
+ * currently assumes that */
+ while ((stripe-1) & stripe)
+ stripe = (stripe | (stripe-1)) + 1;
+ mddev->queue->limits.discard_alignment = stripe;
+ mddev->queue->limits.discard_granularity = stripe;
+ /*
+ * unaligned part of discard request will be ignored, so can't
+ * guarantee discard_zeroes_data
+ */
+ mddev->queue->limits.discard_zeroes_data = 0;
+
+ blk_queue_max_write_same_sectors(mddev->queue, 0);
+
+ rdev_for_each(rdev, mddev) {
+ disk_stack_limits(mddev->gendisk, rdev->bdev,
+ rdev->data_offset << 9);
+ disk_stack_limits(mddev->gendisk, rdev->bdev,
+ rdev->new_data_offset << 9);
+ /*
+ * discard_zeroes_data is required, otherwise data
+ * could be lost. Consider a scenario: discard a stripe
+ * (the stripe could be inconsistent if
+ * discard_zeroes_data is 0); write one disk of the
+ * stripe (the stripe could be inconsistent again
+ * depending on which disks are used to calculate
+ * parity); the disk is broken; The stripe data of this
+ * disk is lost.
+ */
+ if (!blk_queue_discard(bdev_get_queue(rdev->bdev)) ||
+ !bdev_get_queue(rdev->bdev)->
+ limits.discard_zeroes_data)
+ discard_supported = false;
+ /* Unfortunately, discard_zeroes_data is not currently
+ * a guarantee - just a hint. So we only allow DISCARD
+ * if the sysadmin has confirmed that only safe devices
+ * are in use by setting a module parameter.
+ */
+ if (!devices_handle_discard_safely) {
+ if (discard_supported) {
+ pr_info("md/raid456: discard support disabled due to uncertainty.\n");
+ pr_info("Set raid456.devices_handle_discard_safely=Y to override.\n");
+ }
+ discard_supported = false;
+ }
+ }
+
+ if (discard_supported &&
+ mddev->queue->limits.max_discard_sectors >= stripe &&
+ mddev->queue->limits.discard_granularity >= stripe)
+ queue_flag_set_unlocked(QUEUE_FLAG_DISCARD,
+ mddev->queue);
+ else
+ queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD,
+ mddev->queue);
+ }
+
+ return 0;
+abort:
+ md_unregister_thread(&mddev->thread);
+ print_raid5_conf(conf);
+ free_conf(conf);
+ mddev->private = NULL;
+ printk(KERN_ALERT "md/raid:%s: failed to run raid set.\n", mdname(mddev));
+ return -EIO;
+}
+
+static void raid5_free(struct mddev *mddev, void *priv)
+{
+ struct r5conf *conf = priv;
+
+ free_conf(conf);
+ mddev->to_remove = &raid5_attrs_group;
+}
+
+static void status(struct seq_file *seq, struct mddev *mddev)
+{
+ struct r5conf *conf = mddev->private;
+ int i;
+
+ seq_printf(seq, " level %d, %dk chunk, algorithm %d", mddev->level,
+ mddev->chunk_sectors / 2, mddev->layout);
+ seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded);
+ for (i = 0; i < conf->raid_disks; i++)
+ seq_printf (seq, "%s",
+ conf->disks[i].rdev &&
+ test_bit(In_sync, &conf->disks[i].rdev->flags) ? "U" : "_");
+ seq_printf (seq, "]");
+}
+
+static void print_raid5_conf (struct r5conf *conf)
+{
+ int i;
+ struct disk_info *tmp;
+
+ printk(KERN_DEBUG "RAID conf printout:\n");
+ if (!conf) {
+ printk("(conf==NULL)\n");
+ return;
+ }
+ printk(KERN_DEBUG " --- level:%d rd:%d wd:%d\n", conf->level,
+ conf->raid_disks,
+ conf->raid_disks - conf->mddev->degraded);
+
+ for (i = 0; i < conf->raid_disks; i++) {
+ char b[BDEVNAME_SIZE];
+ tmp = conf->disks + i;
+ if (tmp->rdev)
+ printk(KERN_DEBUG " disk %d, o:%d, dev:%s\n",
+ i, !test_bit(Faulty, &tmp->rdev->flags),
+ bdevname(tmp->rdev->bdev, b));
+ }
+}
+
+static int raid5_spare_active(struct mddev *mddev)
+{
+ int i;
+ struct r5conf *conf = mddev->private;
+ struct disk_info *tmp;
+ int count = 0;
+ unsigned long flags;
+
+ for (i = 0; i < conf->raid_disks; i++) {
+ tmp = conf->disks + i;
+ if (tmp->replacement
+ && tmp->replacement->recovery_offset == MaxSector
+ && !test_bit(Faulty, &tmp->replacement->flags)
+ && !test_and_set_bit(In_sync, &tmp->replacement->flags)) {
+ /* Replacement has just become active. */
+ if (!tmp->rdev
+ || !test_and_clear_bit(In_sync, &tmp->rdev->flags))
+ count++;
+ if (tmp->rdev) {
+ /* Replaced device not technically faulty,
+ * but we need to be sure it gets removed
+ * and never re-added.
+ */
+ set_bit(Faulty, &tmp->rdev->flags);
+ sysfs_notify_dirent_safe(
+ tmp->rdev->sysfs_state);
+ }
+ sysfs_notify_dirent_safe(tmp->replacement->sysfs_state);
+ } else if (tmp->rdev
+ && tmp->rdev->recovery_offset == MaxSector
+ && !test_bit(Faulty, &tmp->rdev->flags)
+ && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
+ count++;
+ sysfs_notify_dirent_safe(tmp->rdev->sysfs_state);
+ }
+ }
+ spin_lock_irqsave(&conf->device_lock, flags);
+ mddev->degraded = calc_degraded(conf);
+ spin_unlock_irqrestore(&conf->device_lock, flags);
+ print_raid5_conf(conf);
+ return count;
+}
+
+static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
+{
+ struct r5conf *conf = mddev->private;
+ int err = 0;
+ int number = rdev->raid_disk;
+ struct md_rdev **rdevp;
+ struct disk_info *p = conf->disks + number;
+
+ print_raid5_conf(conf);
+ if (rdev == p->rdev)
+ rdevp = &p->rdev;
+ else if (rdev == p->replacement)
+ rdevp = &p->replacement;
+ else
+ return 0;
+
+ if (number >= conf->raid_disks &&
+ conf->reshape_progress == MaxSector)
+ clear_bit(In_sync, &rdev->flags);
+
+ if (test_bit(In_sync, &rdev->flags) ||
+ atomic_read(&rdev->nr_pending)) {
+ err = -EBUSY;
+ goto abort;
+ }
+ /* Only remove non-faulty devices if recovery
+ * isn't possible.
+ */
+ if (!test_bit(Faulty, &rdev->flags) &&
+ mddev->recovery_disabled != conf->recovery_disabled &&
+ !has_failed(conf) &&
+ (!p->replacement || p->replacement == rdev) &&
+ number < conf->raid_disks) {
+ err = -EBUSY;
+ goto abort;
+ }
+ *rdevp = NULL;
+ synchronize_rcu();
+ if (atomic_read(&rdev->nr_pending)) {
+ /* lost the race, try later */
+ err = -EBUSY;
+ *rdevp = rdev;
+ } else if (p->replacement) {
+ /* We must have just cleared 'rdev' */
+ p->rdev = p->replacement;
+ clear_bit(Replacement, &p->replacement->flags);
+ smp_mb(); /* Make sure other CPUs may see both as identical
+ * but will never see neither - if they are careful
+ */
+ p->replacement = NULL;
+ clear_bit(WantReplacement, &rdev->flags);
+ } else
+ /* We might have just removed the Replacement as faulty-
+ * clear the bit just in case
+ */
+ clear_bit(WantReplacement, &rdev->flags);
+abort:
+
+ print_raid5_conf(conf);
+ return err;
+}
+
+static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
+{
+ struct r5conf *conf = mddev->private;
+ int err = -EEXIST;
+ int disk;
+ struct disk_info *p;
+ int first = 0;
+ int last = conf->raid_disks - 1;
+
+ if (mddev->recovery_disabled == conf->recovery_disabled)
+ return -EBUSY;
+
+ if (rdev->saved_raid_disk < 0 && has_failed(conf))
+ /* no point adding a device */
+ return -EINVAL;
+
+ if (rdev->raid_disk >= 0)
+ first = last = rdev->raid_disk;
+
+ /*
+ * find the disk ... but prefer rdev->saved_raid_disk
+ * if possible.
+ */
+ if (rdev->saved_raid_disk >= 0 &&
+ rdev->saved_raid_disk >= first &&
+ conf->disks[rdev->saved_raid_disk].rdev == NULL)
+ first = rdev->saved_raid_disk;
+
+ for (disk = first; disk <= last; disk++) {
+ p = conf->disks + disk;
+ if (p->rdev == NULL) {
+ clear_bit(In_sync, &rdev->flags);
+ rdev->raid_disk = disk;
+ err = 0;
+ if (rdev->saved_raid_disk != disk)
+ conf->fullsync = 1;
+ rcu_assign_pointer(p->rdev, rdev);
+ goto out;
+ }
+ }
+ for (disk = first; disk <= last; disk++) {
+ p = conf->disks + disk;
+ if (test_bit(WantReplacement, &p->rdev->flags) &&
+ p->replacement == NULL) {
+ clear_bit(In_sync, &rdev->flags);
+ set_bit(Replacement, &rdev->flags);
+ rdev->raid_disk = disk;
+ err = 0;
+ conf->fullsync = 1;
+ rcu_assign_pointer(p->replacement, rdev);
+ break;
+ }
+ }
+out:
+ print_raid5_conf(conf);
+ return err;
+}
+
+static int raid5_resize(struct mddev *mddev, sector_t sectors)
+{
+ /* no resync is happening, and there is enough space
+ * on all devices, so we can resize.
+ * We need to make sure resync covers any new space.
+ * If the array is shrinking we should possibly wait until
+ * any io in the removed space completes, but it hardly seems
+ * worth it.
+ */
+ sector_t newsize;
+ sectors &= ~((sector_t)mddev->chunk_sectors - 1);
+ newsize = raid5_size(mddev, sectors, mddev->raid_disks);
+ if (mddev->external_size &&
+ mddev->array_sectors > newsize)
+ return -EINVAL;
+ if (mddev->bitmap) {
+ int ret = bitmap_resize(mddev->bitmap, sectors, 0, 0);
+ if (ret)
+ return ret;
+ }
+ md_set_array_sectors(mddev, newsize);
+ set_capacity(mddev->gendisk, mddev->array_sectors);
+ revalidate_disk(mddev->gendisk);
+ if (sectors > mddev->dev_sectors &&
+ mddev->recovery_cp > mddev->dev_sectors) {
+ mddev->recovery_cp = mddev->dev_sectors;
+ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+ }
+ mddev->dev_sectors = sectors;
+ mddev->resync_max_sectors = sectors;
+ return 0;
+}
+
+static int check_stripe_cache(struct mddev *mddev)
+{
+ /* Can only proceed if there are plenty of stripe_heads.
+ * We need a minimum of one full stripe,, and for sensible progress
+ * it is best to have about 4 times that.
+ * If we require 4 times, then the default 256 4K stripe_heads will
+ * allow for chunk sizes up to 256K, which is probably OK.
+ * If the chunk size is greater, user-space should request more
+ * stripe_heads first.
+ */
+ struct r5conf *conf = mddev->private;
+ if (((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4
+ > conf->min_nr_stripes ||
+ ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4
+ > conf->min_nr_stripes) {
+ printk(KERN_WARNING "md/raid:%s: reshape: not enough stripes. Needed %lu\n",
+ mdname(mddev),
+ ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9)
+ / STRIPE_SIZE)*4);
+ return 0;
+ }
+ return 1;
+}
+
+static int check_reshape(struct mddev *mddev)
+{
+ struct r5conf *conf = mddev->private;
+
+ if (mddev->delta_disks == 0 &&
+ mddev->new_layout == mddev->layout &&
+ mddev->new_chunk_sectors == mddev->chunk_sectors)
+ return 0; /* nothing to do */
+ if (has_failed(conf))
+ return -EINVAL;
+ if (mddev->delta_disks < 0 && mddev->reshape_position == MaxSector) {
+ /* We might be able to shrink, but the devices must
+ * be made bigger first.
+ * For raid6, 4 is the minimum size.
+ * Otherwise 2 is the minimum
+ */
+ int min = 2;
+ if (mddev->level == 6)
+ min = 4;
+ if (mddev->raid_disks + mddev->delta_disks < min)
+ return -EINVAL;
+ }
+
+ if (!check_stripe_cache(mddev))
+ return -ENOSPC;
+
+ if (mddev->new_chunk_sectors > mddev->chunk_sectors ||
+ mddev->delta_disks > 0)
+ if (resize_chunks(conf,
+ conf->previous_raid_disks
+ + max(0, mddev->delta_disks),
+ max(mddev->new_chunk_sectors,
+ mddev->chunk_sectors)
+ ) < 0)
+ return -ENOMEM;
+ return resize_stripes(conf, (conf->previous_raid_disks
+ + mddev->delta_disks));
+}
+
+static int raid5_start_reshape(struct mddev *mddev)
+{
+ struct r5conf *conf = mddev->private;
+ struct md_rdev *rdev;
+ int spares = 0;
+ unsigned long flags;
+
+ if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
+ return -EBUSY;
+
+ if (!check_stripe_cache(mddev))
+ return -ENOSPC;
+
+ if (has_failed(conf))
+ return -EINVAL;
+
+ rdev_for_each(rdev, mddev) {
+ if (!test_bit(In_sync, &rdev->flags)
+ && !test_bit(Faulty, &rdev->flags))
+ spares++;
+ }
+
+ if (spares - mddev->degraded < mddev->delta_disks - conf->max_degraded)
+ /* Not enough devices even to make a degraded array
+ * of that size
+ */
+ return -EINVAL;
+
+ /* Refuse to reduce size of the array. Any reductions in
+ * array size must be through explicit setting of array_size
+ * attribute.
+ */
+ if (raid5_size(mddev, 0, conf->raid_disks + mddev->delta_disks)
+ < mddev->array_sectors) {
+ printk(KERN_ERR "md/raid:%s: array size must be reduced "
+ "before number of disks\n", mdname(mddev));
+ return -EINVAL;
+ }
+
+ atomic_set(&conf->reshape_stripes, 0);
+ spin_lock_irq(&conf->device_lock);
+ write_seqcount_begin(&conf->gen_lock);
+ conf->previous_raid_disks = conf->raid_disks;
+ conf->raid_disks += mddev->delta_disks;
+ conf->prev_chunk_sectors = conf->chunk_sectors;
+ conf->chunk_sectors = mddev->new_chunk_sectors;
+ conf->prev_algo = conf->algorithm;
+ conf->algorithm = mddev->new_layout;
+ conf->generation++;
+ /* Code that selects data_offset needs to see the generation update
+ * if reshape_progress has been set - so a memory barrier needed.
+ */
+ smp_mb();
+ if (mddev->reshape_backwards)
+ conf->reshape_progress = raid5_size(mddev, 0, 0);
+ else
+ conf->reshape_progress = 0;
+ conf->reshape_safe = conf->reshape_progress;
+ write_seqcount_end(&conf->gen_lock);
+ spin_unlock_irq(&conf->device_lock);
+
+ /* Now make sure any requests that proceeded on the assumption
+ * the reshape wasn't running - like Discard or Read - have
+ * completed.
+ */
+ mddev_suspend(mddev);
+ mddev_resume(mddev);
+
+ /* Add some new drives, as many as will fit.
+ * We know there are enough to make the newly sized array work.
+ * Don't add devices if we are reducing the number of
+ * devices in the array. This is because it is not possible
+ * to correctly record the "partially reconstructed" state of
+ * such devices during the reshape and confusion could result.
+ */
+ if (mddev->delta_disks >= 0) {
+ rdev_for_each(rdev, mddev)
+ if (rdev->raid_disk < 0 &&
+ !test_bit(Faulty, &rdev->flags)) {
+ if (raid5_add_disk(mddev, rdev) == 0) {
+ if (rdev->raid_disk
+ >= conf->previous_raid_disks)
+ set_bit(In_sync, &rdev->flags);
+ else
+ rdev->recovery_offset = 0;
+
+ if (sysfs_link_rdev(mddev, rdev))
+ /* Failure here is OK */;
+ }
+ } else if (rdev->raid_disk >= conf->previous_raid_disks
+ && !test_bit(Faulty, &rdev->flags)) {
+ /* This is a spare that was manually added */
+ set_bit(In_sync, &rdev->flags);
+ }
+
+ /* When a reshape changes the number of devices,
+ * ->degraded is measured against the larger of the
+ * pre and post number of devices.
+ */
+ spin_lock_irqsave(&conf->device_lock, flags);
+ mddev->degraded = calc_degraded(conf);
+ spin_unlock_irqrestore(&conf->device_lock, flags);
+ }
+ mddev->raid_disks = conf->raid_disks;
+ mddev->reshape_position = conf->reshape_progress;
+ set_bit(MD_CHANGE_DEVS, &mddev->flags);
+
+ clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
+ clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
+ clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
+ set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
+ set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
+ mddev->sync_thread = md_register_thread(md_do_sync, mddev,
+ "reshape");
+ if (!mddev->sync_thread) {
+ mddev->recovery = 0;
+ spin_lock_irq(&conf->device_lock);
+ write_seqcount_begin(&conf->gen_lock);
+ mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks;
+ mddev->new_chunk_sectors =
+ conf->chunk_sectors = conf->prev_chunk_sectors;
+ mddev->new_layout = conf->algorithm = conf->prev_algo;
+ rdev_for_each(rdev, mddev)
+ rdev->new_data_offset = rdev->data_offset;
+ smp_wmb();
+ conf->generation --;
+ conf->reshape_progress = MaxSector;
+ mddev->reshape_position = MaxSector;
+ write_seqcount_end(&conf->gen_lock);
+ spin_unlock_irq(&conf->device_lock);
+ return -EAGAIN;
+ }
+ conf->reshape_checkpoint = jiffies;
+ md_wakeup_thread(mddev->sync_thread);
+ md_new_event(mddev);
+ return 0;
+}
+
+/* This is called from the reshape thread and should make any
+ * changes needed in 'conf'
+ */
+static void end_reshape(struct r5conf *conf)
+{
+
+ if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) {
+ struct md_rdev *rdev;
+
+ spin_lock_irq(&conf->device_lock);
+ conf->previous_raid_disks = conf->raid_disks;
+ rdev_for_each(rdev, conf->mddev)
+ rdev->data_offset = rdev->new_data_offset;
+ smp_wmb();
+ conf->reshape_progress = MaxSector;
+ spin_unlock_irq(&conf->device_lock);
+ wake_up(&conf->wait_for_overlap);
+
+ /* read-ahead size must cover two whole stripes, which is
+ * 2 * (datadisks) * chunksize where 'n' is the number of raid devices
+ */
+ if (conf->mddev->queue) {
+ int data_disks = conf->raid_disks - conf->max_degraded;
+ int stripe = data_disks * ((conf->chunk_sectors << 9)
+ / PAGE_SIZE);
+ if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
+ conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
+ }
+ }
+}
+
+/* This is called from the raid5d thread with mddev_lock held.
+ * It makes config changes to the device.
+ */
+static void raid5_finish_reshape(struct mddev *mddev)
+{
+ struct r5conf *conf = mddev->private;
+
+ if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
+
+ if (mddev->delta_disks > 0) {
+ md_set_array_sectors(mddev, raid5_size(mddev, 0, 0));
+ set_capacity(mddev->gendisk, mddev->array_sectors);
+ revalidate_disk(mddev->gendisk);
+ } else {
+ int d;
+ spin_lock_irq(&conf->device_lock);
+ mddev->degraded = calc_degraded(conf);
+ spin_unlock_irq(&conf->device_lock);
+ for (d = conf->raid_disks ;
+ d < conf->raid_disks - mddev->delta_disks;
+ d++) {
+ struct md_rdev *rdev = conf->disks[d].rdev;
+ if (rdev)
+ clear_bit(In_sync, &rdev->flags);
+ rdev = conf->disks[d].replacement;
+ if (rdev)
+ clear_bit(In_sync, &rdev->flags);
+ }
+ }
+ mddev->layout = conf->algorithm;
+ mddev->chunk_sectors = conf->chunk_sectors;
+ mddev->reshape_position = MaxSector;
+ mddev->delta_disks = 0;
+ mddev->reshape_backwards = 0;
+ }
+}
+
+static void raid5_quiesce(struct mddev *mddev, int state)
+{
+ struct r5conf *conf = mddev->private;
+
+ switch(state) {
+ case 2: /* resume for a suspend */
+ wake_up(&conf->wait_for_overlap);
+ break;
+
+ case 1: /* stop all writes */
+ lock_all_device_hash_locks_irq(conf);
+ /* '2' tells resync/reshape to pause so that all
+ * active stripes can drain
+ */
+ conf->quiesce = 2;
+ wait_event_cmd(conf->wait_for_stripe,
+ atomic_read(&conf->active_stripes) == 0 &&
+ atomic_read(&conf->active_aligned_reads) == 0,
+ unlock_all_device_hash_locks_irq(conf),
+ lock_all_device_hash_locks_irq(conf));
+ conf->quiesce = 1;
+ unlock_all_device_hash_locks_irq(conf);
+ /* allow reshape to continue */
+ wake_up(&conf->wait_for_overlap);
+ break;
+
+ case 0: /* re-enable writes */
+ lock_all_device_hash_locks_irq(conf);
+ conf->quiesce = 0;
+ wake_up(&conf->wait_for_stripe);
+ wake_up(&conf->wait_for_overlap);
+ unlock_all_device_hash_locks_irq(conf);
+ break;
+ }
+}
+
+static void *raid45_takeover_raid0(struct mddev *mddev, int level)
+{
+ struct r0conf *raid0_conf = mddev->private;
+ sector_t sectors;
+
+ /* for raid0 takeover only one zone is supported */
+ if (raid0_conf->nr_strip_zones > 1) {
+ printk(KERN_ERR "md/raid:%s: cannot takeover raid0 with more than one zone.\n",
+ mdname(mddev));
+ return ERR_PTR(-EINVAL);
+ }
+
+ sectors = raid0_conf->strip_zone[0].zone_end;
+ sector_div(sectors, raid0_conf->strip_zone[0].nb_dev);
+ mddev->dev_sectors = sectors;
+ mddev->new_level = level;
+ mddev->new_layout = ALGORITHM_PARITY_N;
+ mddev->new_chunk_sectors = mddev->chunk_sectors;
+ mddev->raid_disks += 1;
+ mddev->delta_disks = 1;
+ /* make sure it will be not marked as dirty */
+ mddev->recovery_cp = MaxSector;
+
+ return setup_conf(mddev);
+}
+
+static void *raid5_takeover_raid1(struct mddev *mddev)
+{
+ int chunksect;
+
+ if (mddev->raid_disks != 2 ||
+ mddev->degraded > 1)
+ return ERR_PTR(-EINVAL);
+
+ /* Should check if there are write-behind devices? */
+
+ chunksect = 64*2; /* 64K by default */
+
+ /* The array must be an exact multiple of chunksize */
+ while (chunksect && (mddev->array_sectors & (chunksect-1)))
+ chunksect >>= 1;
+
+ if ((chunksect<<9) < STRIPE_SIZE)
+ /* array size does not allow a suitable chunk size */
+ return ERR_PTR(-EINVAL);
+
+ mddev->new_level = 5;
+ mddev->new_layout = ALGORITHM_LEFT_SYMMETRIC;
+ mddev->new_chunk_sectors = chunksect;
+
+ return setup_conf(mddev);
+}
+
+static void *raid5_takeover_raid6(struct mddev *mddev)
+{
+ int new_layout;
+
+ switch (mddev->layout) {
+ case ALGORITHM_LEFT_ASYMMETRIC_6:
+ new_layout = ALGORITHM_LEFT_ASYMMETRIC;
+ break;
+ case ALGORITHM_RIGHT_ASYMMETRIC_6:
+ new_layout = ALGORITHM_RIGHT_ASYMMETRIC;
+ break;
+ case ALGORITHM_LEFT_SYMMETRIC_6:
+ new_layout = ALGORITHM_LEFT_SYMMETRIC;
+ break;
+ case ALGORITHM_RIGHT_SYMMETRIC_6:
+ new_layout = ALGORITHM_RIGHT_SYMMETRIC;
+ break;
+ case ALGORITHM_PARITY_0_6:
+ new_layout = ALGORITHM_PARITY_0;
+ break;
+ case ALGORITHM_PARITY_N:
+ new_layout = ALGORITHM_PARITY_N;
+ break;
+ default:
+ return ERR_PTR(-EINVAL);
+ }
+ mddev->new_level = 5;
+ mddev->new_layout = new_layout;
+ mddev->delta_disks = -1;
+ mddev->raid_disks -= 1;
+ return setup_conf(mddev);
+}
+
+static int raid5_check_reshape(struct mddev *mddev)
+{
+ /* For a 2-drive array, the layout and chunk size can be changed
+ * immediately as not restriping is needed.
+ * For larger arrays we record the new value - after validation
+ * to be used by a reshape pass.
+ */
+ struct r5conf *conf = mddev->private;
+ int new_chunk = mddev->new_chunk_sectors;
+
+ if (mddev->new_layout >= 0 && !algorithm_valid_raid5(mddev->new_layout))
+ return -EINVAL;
+ if (new_chunk > 0) {
+ if (!is_power_of_2(new_chunk))
+ return -EINVAL;
+ if (new_chunk < (PAGE_SIZE>>9))
+ return -EINVAL;
+ if (mddev->array_sectors & (new_chunk-1))
+ /* not factor of array size */
+ return -EINVAL;
+ }
+
+ /* They look valid */
+
+ if (mddev->raid_disks == 2) {
+ /* can make the change immediately */
+ if (mddev->new_layout >= 0) {
+ conf->algorithm = mddev->new_layout;
+ mddev->layout = mddev->new_layout;
+ }
+ if (new_chunk > 0) {
+ conf->chunk_sectors = new_chunk ;
+ mddev->chunk_sectors = new_chunk;
+ }
+ set_bit(MD_CHANGE_DEVS, &mddev->flags);
+ md_wakeup_thread(mddev->thread);
+ }
+ return check_reshape(mddev);
+}
+
+static int raid6_check_reshape(struct mddev *mddev)
+{
+ int new_chunk = mddev->new_chunk_sectors;
+
+ if (mddev->new_layout >= 0 && !algorithm_valid_raid6(mddev->new_layout))
+ return -EINVAL;
+ if (new_chunk > 0) {
+ if (!is_power_of_2(new_chunk))
+ return -EINVAL;
+ if (new_chunk < (PAGE_SIZE >> 9))
+ return -EINVAL;
+ if (mddev->array_sectors & (new_chunk-1))
+ /* not factor of array size */
+ return -EINVAL;
+ }
+
+ /* They look valid */
+ return check_reshape(mddev);
+}
+
+static void *raid5_takeover(struct mddev *mddev)
+{
+ /* raid5 can take over:
+ * raid0 - if there is only one strip zone - make it a raid4 layout
+ * raid1 - if there are two drives. We need to know the chunk size
+ * raid4 - trivial - just use a raid4 layout.
+ * raid6 - Providing it is a *_6 layout
+ */
+ if (mddev->level == 0)
+ return raid45_takeover_raid0(mddev, 5);
+ if (mddev->level == 1)
+ return raid5_takeover_raid1(mddev);
+ if (mddev->level == 4) {
+ mddev->new_layout = ALGORITHM_PARITY_N;
+ mddev->new_level = 5;
+ return setup_conf(mddev);
+ }
+ if (mddev->level == 6)
+ return raid5_takeover_raid6(mddev);
+
+ return ERR_PTR(-EINVAL);
+}
+
+static void *raid4_takeover(struct mddev *mddev)
+{
+ /* raid4 can take over:
+ * raid0 - if there is only one strip zone
+ * raid5 - if layout is right
+ */
+ if (mddev->level == 0)
+ return raid45_takeover_raid0(mddev, 4);
+ if (mddev->level == 5 &&
+ mddev->layout == ALGORITHM_PARITY_N) {
+ mddev->new_layout = 0;
+ mddev->new_level = 4;
+ return setup_conf(mddev);
+ }
+ return ERR_PTR(-EINVAL);
+}
+
+static struct md_personality raid5_personality;
+
+static void *raid6_takeover(struct mddev *mddev)
+{
+ /* Currently can only take over a raid5. We map the
+ * personality to an equivalent raid6 personality
+ * with the Q block at the end.
+ */
+ int new_layout;
+
+ if (mddev->pers != &raid5_personality)
+ return ERR_PTR(-EINVAL);
+ if (mddev->degraded > 1)
+ return ERR_PTR(-EINVAL);
+ if (mddev->raid_disks > 253)
+ return ERR_PTR(-EINVAL);
+ if (mddev->raid_disks < 3)
+ return ERR_PTR(-EINVAL);
+
+ switch (mddev->layout) {
+ case ALGORITHM_LEFT_ASYMMETRIC:
+ new_layout = ALGORITHM_LEFT_ASYMMETRIC_6;
+ break;
+ case ALGORITHM_RIGHT_ASYMMETRIC:
+ new_layout = ALGORITHM_RIGHT_ASYMMETRIC_6;
+ break;
+ case ALGORITHM_LEFT_SYMMETRIC:
+ new_layout = ALGORITHM_LEFT_SYMMETRIC_6;
+ break;
+ case ALGORITHM_RIGHT_SYMMETRIC:
+ new_layout = ALGORITHM_RIGHT_SYMMETRIC_6;
+ break;
+ case ALGORITHM_PARITY_0:
+ new_layout = ALGORITHM_PARITY_0_6;
+ break;
+ case ALGORITHM_PARITY_N:
+ new_layout = ALGORITHM_PARITY_N;
+ break;
+ default:
+ return ERR_PTR(-EINVAL);
+ }
+ mddev->new_level = 6;
+ mddev->new_layout = new_layout;
+ mddev->delta_disks = 1;
+ mddev->raid_disks += 1;
+ return setup_conf(mddev);
+}
+
+static struct md_personality raid6_personality =
+{
+ .name = "raid6",
+ .level = 6,
+ .owner = THIS_MODULE,
+ .make_request = make_request,
+ .run = run,
+ .free = raid5_free,
+ .status = status,
+ .error_handler = error,
+ .hot_add_disk = raid5_add_disk,
+ .hot_remove_disk= raid5_remove_disk,
+ .spare_active = raid5_spare_active,
+ .sync_request = sync_request,
+ .resize = raid5_resize,
+ .size = raid5_size,
+ .check_reshape = raid6_check_reshape,
+ .start_reshape = raid5_start_reshape,
+ .finish_reshape = raid5_finish_reshape,
+ .quiesce = raid5_quiesce,
+ .takeover = raid6_takeover,
+ .congested = raid5_congested,
+ .mergeable_bvec = raid5_mergeable_bvec,
+};
+static struct md_personality raid5_personality =
+{
+ .name = "raid5",
+ .level = 5,
+ .owner = THIS_MODULE,
+ .make_request = make_request,
+ .run = run,
+ .free = raid5_free,
+ .status = status,
+ .error_handler = error,
+ .hot_add_disk = raid5_add_disk,
+ .hot_remove_disk= raid5_remove_disk,
+ .spare_active = raid5_spare_active,
+ .sync_request = sync_request,
+ .resize = raid5_resize,
+ .size = raid5_size,
+ .check_reshape = raid5_check_reshape,
+ .start_reshape = raid5_start_reshape,
+ .finish_reshape = raid5_finish_reshape,
+ .quiesce = raid5_quiesce,
+ .takeover = raid5_takeover,
+ .congested = raid5_congested,
+ .mergeable_bvec = raid5_mergeable_bvec,
+};
+
+static struct md_personality raid4_personality =
+{
+ .name = "raid4",
+ .level = 4,
+ .owner = THIS_MODULE,
+ .make_request = make_request,
+ .run = run,
+ .free = raid5_free,
+ .status = status,
+ .error_handler = error,
+ .hot_add_disk = raid5_add_disk,
+ .hot_remove_disk= raid5_remove_disk,
+ .spare_active = raid5_spare_active,
+ .sync_request = sync_request,
+ .resize = raid5_resize,
+ .size = raid5_size,
+ .check_reshape = raid5_check_reshape,
+ .start_reshape = raid5_start_reshape,
+ .finish_reshape = raid5_finish_reshape,
+ .quiesce = raid5_quiesce,
+ .takeover = raid4_takeover,
+ .congested = raid5_congested,
+ .mergeable_bvec = raid5_mergeable_bvec,
+};
+
+static int __init raid5_init(void)
+{
+ raid5_wq = alloc_workqueue("raid5wq",
+ WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE|WQ_SYSFS, 0);
+ if (!raid5_wq)
+ return -ENOMEM;
+ register_md_personality(&raid6_personality);
+ register_md_personality(&raid5_personality);
+ register_md_personality(&raid4_personality);
+ return 0;
+}
+
+static void raid5_exit(void)
+{
+ unregister_md_personality(&raid6_personality);
+ unregister_md_personality(&raid5_personality);
+ unregister_md_personality(&raid4_personality);
+ destroy_workqueue(raid5_wq);
+}
+
+module_init(raid5_init);
+module_exit(raid5_exit);
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("RAID4/5/6 (striping with parity) personality for MD");
+MODULE_ALIAS("md-personality-4"); /* RAID5 */
+MODULE_ALIAS("md-raid5");
+MODULE_ALIAS("md-raid4");
+MODULE_ALIAS("md-level-5");
+MODULE_ALIAS("md-level-4");
+MODULE_ALIAS("md-personality-8"); /* RAID6 */
+MODULE_ALIAS("md-raid6");
+MODULE_ALIAS("md-level-6");
+
+/* This used to be two separate modules, they were: */
+MODULE_ALIAS("raid5");
+MODULE_ALIAS("raid6");
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
new file mode 100644
index 000000000..896d603ad
--- /dev/null
+++ b/drivers/md/raid5.h
@@ -0,0 +1,607 @@
+#ifndef _RAID5_H
+#define _RAID5_H
+
+#include <linux/raid/xor.h>
+#include <linux/dmaengine.h>
+
+/*
+ *
+ * Each stripe contains one buffer per device. Each buffer can be in
+ * one of a number of states stored in "flags". Changes between
+ * these states happen *almost* exclusively under the protection of the
+ * STRIPE_ACTIVE flag. Some very specific changes can happen in bi_end_io, and
+ * these are not protected by STRIPE_ACTIVE.
+ *
+ * The flag bits that are used to represent these states are:
+ * R5_UPTODATE and R5_LOCKED
+ *
+ * State Empty == !UPTODATE, !LOCK
+ * We have no data, and there is no active request
+ * State Want == !UPTODATE, LOCK
+ * A read request is being submitted for this block
+ * State Dirty == UPTODATE, LOCK
+ * Some new data is in this buffer, and it is being written out
+ * State Clean == UPTODATE, !LOCK
+ * We have valid data which is the same as on disc
+ *
+ * The possible state transitions are:
+ *
+ * Empty -> Want - on read or write to get old data for parity calc
+ * Empty -> Dirty - on compute_parity to satisfy write/sync request.
+ * Empty -> Clean - on compute_block when computing a block for failed drive
+ * Want -> Empty - on failed read
+ * Want -> Clean - on successful completion of read request
+ * Dirty -> Clean - on successful completion of write request
+ * Dirty -> Clean - on failed write
+ * Clean -> Dirty - on compute_parity to satisfy write/sync (RECONSTRUCT or RMW)
+ *
+ * The Want->Empty, Want->Clean, Dirty->Clean, transitions
+ * all happen in b_end_io at interrupt time.
+ * Each sets the Uptodate bit before releasing the Lock bit.
+ * This leaves one multi-stage transition:
+ * Want->Dirty->Clean
+ * This is safe because thinking that a Clean buffer is actually dirty
+ * will at worst delay some action, and the stripe will be scheduled
+ * for attention after the transition is complete.
+ *
+ * There is one possibility that is not covered by these states. That
+ * is if one drive has failed and there is a spare being rebuilt. We
+ * can't distinguish between a clean block that has been generated
+ * from parity calculations, and a clean block that has been
+ * successfully written to the spare ( or to parity when resyncing).
+ * To distinguish these states we have a stripe bit STRIPE_INSYNC that
+ * is set whenever a write is scheduled to the spare, or to the parity
+ * disc if there is no spare. A sync request clears this bit, and
+ * when we find it set with no buffers locked, we know the sync is
+ * complete.
+ *
+ * Buffers for the md device that arrive via make_request are attached
+ * to the appropriate stripe in one of two lists linked on b_reqnext.
+ * One list (bh_read) for read requests, one (bh_write) for write.
+ * There should never be more than one buffer on the two lists
+ * together, but we are not guaranteed of that so we allow for more.
+ *
+ * If a buffer is on the read list when the associated cache buffer is
+ * Uptodate, the data is copied into the read buffer and it's b_end_io
+ * routine is called. This may happen in the end_request routine only
+ * if the buffer has just successfully been read. end_request should
+ * remove the buffers from the list and then set the Uptodate bit on
+ * the buffer. Other threads may do this only if they first check
+ * that the Uptodate bit is set. Once they have checked that they may
+ * take buffers off the read queue.
+ *
+ * When a buffer on the write list is committed for write it is copied
+ * into the cache buffer, which is then marked dirty, and moved onto a
+ * third list, the written list (bh_written). Once both the parity
+ * block and the cached buffer are successfully written, any buffer on
+ * a written list can be returned with b_end_io.
+ *
+ * The write list and read list both act as fifos. The read list,
+ * write list and written list are protected by the device_lock.
+ * The device_lock is only for list manipulations and will only be
+ * held for a very short time. It can be claimed from interrupts.
+ *
+ *
+ * Stripes in the stripe cache can be on one of two lists (or on
+ * neither). The "inactive_list" contains stripes which are not
+ * currently being used for any request. They can freely be reused
+ * for another stripe. The "handle_list" contains stripes that need
+ * to be handled in some way. Both of these are fifo queues. Each
+ * stripe is also (potentially) linked to a hash bucket in the hash
+ * table so that it can be found by sector number. Stripes that are
+ * not hashed must be on the inactive_list, and will normally be at
+ * the front. All stripes start life this way.
+ *
+ * The inactive_list, handle_list and hash bucket lists are all protected by the
+ * device_lock.
+ * - stripes have a reference counter. If count==0, they are on a list.
+ * - If a stripe might need handling, STRIPE_HANDLE is set.
+ * - When refcount reaches zero, then if STRIPE_HANDLE it is put on
+ * handle_list else inactive_list
+ *
+ * This, combined with the fact that STRIPE_HANDLE is only ever
+ * cleared while a stripe has a non-zero count means that if the
+ * refcount is 0 and STRIPE_HANDLE is set, then it is on the
+ * handle_list and if recount is 0 and STRIPE_HANDLE is not set, then
+ * the stripe is on inactive_list.
+ *
+ * The possible transitions are:
+ * activate an unhashed/inactive stripe (get_active_stripe())
+ * lockdev check-hash unlink-stripe cnt++ clean-stripe hash-stripe unlockdev
+ * activate a hashed, possibly active stripe (get_active_stripe())
+ * lockdev check-hash if(!cnt++)unlink-stripe unlockdev
+ * attach a request to an active stripe (add_stripe_bh())
+ * lockdev attach-buffer unlockdev
+ * handle a stripe (handle_stripe())
+ * setSTRIPE_ACTIVE, clrSTRIPE_HANDLE ...
+ * (lockdev check-buffers unlockdev) ..
+ * change-state ..
+ * record io/ops needed clearSTRIPE_ACTIVE schedule io/ops
+ * release an active stripe (release_stripe())
+ * lockdev if (!--cnt) { if STRIPE_HANDLE, add to handle_list else add to inactive-list } unlockdev
+ *
+ * The refcount counts each thread that have activated the stripe,
+ * plus raid5d if it is handling it, plus one for each active request
+ * on a cached buffer, and plus one if the stripe is undergoing stripe
+ * operations.
+ *
+ * The stripe operations are:
+ * -copying data between the stripe cache and user application buffers
+ * -computing blocks to save a disk access, or to recover a missing block
+ * -updating the parity on a write operation (reconstruct write and
+ * read-modify-write)
+ * -checking parity correctness
+ * -running i/o to disk
+ * These operations are carried out by raid5_run_ops which uses the async_tx
+ * api to (optionally) offload operations to dedicated hardware engines.
+ * When requesting an operation handle_stripe sets the pending bit for the
+ * operation and increments the count. raid5_run_ops is then run whenever
+ * the count is non-zero.
+ * There are some critical dependencies between the operations that prevent some
+ * from being requested while another is in flight.
+ * 1/ Parity check operations destroy the in cache version of the parity block,
+ * so we prevent parity dependent operations like writes and compute_blocks
+ * from starting while a check is in progress. Some dma engines can perform
+ * the check without damaging the parity block, in these cases the parity
+ * block is re-marked up to date (assuming the check was successful) and is
+ * not re-read from disk.
+ * 2/ When a write operation is requested we immediately lock the affected
+ * blocks, and mark them as not up to date. This causes new read requests
+ * to be held off, as well as parity checks and compute block operations.
+ * 3/ Once a compute block operation has been requested handle_stripe treats
+ * that block as if it is up to date. raid5_run_ops guaruntees that any
+ * operation that is dependent on the compute block result is initiated after
+ * the compute block completes.
+ */
+
+/*
+ * Operations state - intermediate states that are visible outside of
+ * STRIPE_ACTIVE.
+ * In general _idle indicates nothing is running, _run indicates a data
+ * processing operation is active, and _result means the data processing result
+ * is stable and can be acted upon. For simple operations like biofill and
+ * compute that only have an _idle and _run state they are indicated with
+ * sh->state flags (STRIPE_BIOFILL_RUN and STRIPE_COMPUTE_RUN)
+ */
+/**
+ * enum check_states - handles syncing / repairing a stripe
+ * @check_state_idle - check operations are quiesced
+ * @check_state_run - check operation is running
+ * @check_state_result - set outside lock when check result is valid
+ * @check_state_compute_run - check failed and we are repairing
+ * @check_state_compute_result - set outside lock when compute result is valid
+ */
+enum check_states {
+ check_state_idle = 0,
+ check_state_run, /* xor parity check */
+ check_state_run_q, /* q-parity check */
+ check_state_run_pq, /* pq dual parity check */
+ check_state_check_result,
+ check_state_compute_run, /* parity repair */
+ check_state_compute_result,
+};
+
+/**
+ * enum reconstruct_states - handles writing or expanding a stripe
+ */
+enum reconstruct_states {
+ reconstruct_state_idle = 0,
+ reconstruct_state_prexor_drain_run, /* prexor-write */
+ reconstruct_state_drain_run, /* write */
+ reconstruct_state_run, /* expand */
+ reconstruct_state_prexor_drain_result,
+ reconstruct_state_drain_result,
+ reconstruct_state_result,
+};
+
+struct stripe_head {
+ struct hlist_node hash;
+ struct list_head lru; /* inactive_list or handle_list */
+ struct llist_node release_list;
+ struct r5conf *raid_conf;
+ short generation; /* increments with every
+ * reshape */
+ sector_t sector; /* sector of this row */
+ short pd_idx; /* parity disk index */
+ short qd_idx; /* 'Q' disk index for raid6 */
+ short ddf_layout;/* use DDF ordering to calculate Q */
+ short hash_lock_index;
+ unsigned long state; /* state flags */
+ atomic_t count; /* nr of active thread/requests */
+ int bm_seq; /* sequence number for bitmap flushes */
+ int disks; /* disks in stripe */
+ int overwrite_disks; /* total overwrite disks in stripe,
+ * this is only checked when stripe
+ * has STRIPE_BATCH_READY
+ */
+ enum check_states check_state;
+ enum reconstruct_states reconstruct_state;
+ spinlock_t stripe_lock;
+ int cpu;
+ struct r5worker_group *group;
+
+ struct stripe_head *batch_head; /* protected by stripe lock */
+ spinlock_t batch_lock; /* only header's lock is useful */
+ struct list_head batch_list; /* protected by head's batch lock*/
+ /**
+ * struct stripe_operations
+ * @target - STRIPE_OP_COMPUTE_BLK target
+ * @target2 - 2nd compute target in the raid6 case
+ * @zero_sum_result - P and Q verification flags
+ * @request - async service request flags for raid_run_ops
+ */
+ struct stripe_operations {
+ int target, target2;
+ enum sum_check_flags zero_sum_result;
+ } ops;
+ struct r5dev {
+ /* rreq and rvec are used for the replacement device when
+ * writing data to both devices.
+ */
+ struct bio req, rreq;
+ struct bio_vec vec, rvec;
+ struct page *page, *orig_page;
+ struct bio *toread, *read, *towrite, *written;
+ sector_t sector; /* sector of this page */
+ unsigned long flags;
+ } dev[1]; /* allocated with extra space depending of RAID geometry */
+};
+
+/* stripe_head_state - collects and tracks the dynamic state of a stripe_head
+ * for handle_stripe.
+ */
+struct stripe_head_state {
+ /* 'syncing' means that we need to read all devices, either
+ * to check/correct parity, or to reconstruct a missing device.
+ * 'replacing' means we are replacing one or more drives and
+ * the source is valid at this point so we don't need to
+ * read all devices, just the replacement targets.
+ */
+ int syncing, expanding, expanded, replacing;
+ int locked, uptodate, to_read, to_write, failed, written;
+ int to_fill, compute, req_compute, non_overwrite;
+ int failed_num[2];
+ int p_failed, q_failed;
+ int dec_preread_active;
+ unsigned long ops_request;
+
+ struct bio *return_bi;
+ struct md_rdev *blocked_rdev;
+ int handle_bad_blocks;
+};
+
+/* Flags for struct r5dev.flags */
+enum r5dev_flags {
+ R5_UPTODATE, /* page contains current data */
+ R5_LOCKED, /* IO has been submitted on "req" */
+ R5_DOUBLE_LOCKED,/* Cannot clear R5_LOCKED until 2 writes complete */
+ R5_OVERWRITE, /* towrite covers whole page */
+/* and some that are internal to handle_stripe */
+ R5_Insync, /* rdev && rdev->in_sync at start */
+ R5_Wantread, /* want to schedule a read */
+ R5_Wantwrite,
+ R5_Overlap, /* There is a pending overlapping request
+ * on this block */
+ R5_ReadNoMerge, /* prevent bio from merging in block-layer */
+ R5_ReadError, /* seen a read error here recently */
+ R5_ReWrite, /* have tried to over-write the readerror */
+
+ R5_Expanded, /* This block now has post-expand data */
+ R5_Wantcompute, /* compute_block in progress treat as
+ * uptodate
+ */
+ R5_Wantfill, /* dev->toread contains a bio that needs
+ * filling
+ */
+ R5_Wantdrain, /* dev->towrite needs to be drained */
+ R5_WantFUA, /* Write should be FUA */
+ R5_SyncIO, /* The IO is sync */
+ R5_WriteError, /* got a write error - need to record it */
+ R5_MadeGood, /* A bad block has been fixed by writing to it */
+ R5_ReadRepl, /* Will/did read from replacement rather than orig */
+ R5_MadeGoodRepl,/* A bad block on the replacement device has been
+ * fixed by writing to it */
+ R5_NeedReplace, /* This device has a replacement which is not
+ * up-to-date at this stripe. */
+ R5_WantReplace, /* We need to update the replacement, we have read
+ * data in, and now is a good time to write it out.
+ */
+ R5_Discard, /* Discard the stripe */
+ R5_SkipCopy, /* Don't copy data from bio to stripe cache */
+};
+
+/*
+ * Stripe state
+ */
+enum {
+ STRIPE_ACTIVE,
+ STRIPE_HANDLE,
+ STRIPE_SYNC_REQUESTED,
+ STRIPE_SYNCING,
+ STRIPE_INSYNC,
+ STRIPE_REPLACED,
+ STRIPE_PREREAD_ACTIVE,
+ STRIPE_DELAYED,
+ STRIPE_DEGRADED,
+ STRIPE_BIT_DELAY,
+ STRIPE_EXPANDING,
+ STRIPE_EXPAND_SOURCE,
+ STRIPE_EXPAND_READY,
+ STRIPE_IO_STARTED, /* do not count towards 'bypass_count' */
+ STRIPE_FULL_WRITE, /* all blocks are set to be overwritten */
+ STRIPE_BIOFILL_RUN,
+ STRIPE_COMPUTE_RUN,
+ STRIPE_OPS_REQ_PENDING,
+ STRIPE_ON_UNPLUG_LIST,
+ STRIPE_DISCARD,
+ STRIPE_ON_RELEASE_LIST,
+ STRIPE_BATCH_READY,
+ STRIPE_BATCH_ERR,
+ STRIPE_BITMAP_PENDING, /* Being added to bitmap, don't add
+ * to batch yet.
+ */
+};
+
+#define STRIPE_EXPAND_SYNC_FLAGS \
+ ((1 << STRIPE_EXPAND_SOURCE) |\
+ (1 << STRIPE_EXPAND_READY) |\
+ (1 << STRIPE_EXPANDING) |\
+ (1 << STRIPE_SYNC_REQUESTED))
+/*
+ * Operation request flags
+ */
+enum {
+ STRIPE_OP_BIOFILL,
+ STRIPE_OP_COMPUTE_BLK,
+ STRIPE_OP_PREXOR,
+ STRIPE_OP_BIODRAIN,
+ STRIPE_OP_RECONSTRUCT,
+ STRIPE_OP_CHECK,
+};
+
+/*
+ * RAID parity calculation preferences
+ */
+enum {
+ PARITY_DISABLE_RMW = 0,
+ PARITY_ENABLE_RMW,
+ PARITY_PREFER_RMW,
+};
+
+/*
+ * Pages requested from set_syndrome_sources()
+ */
+enum {
+ SYNDROME_SRC_ALL,
+ SYNDROME_SRC_WANT_DRAIN,
+ SYNDROME_SRC_WRITTEN,
+};
+/*
+ * Plugging:
+ *
+ * To improve write throughput, we need to delay the handling of some
+ * stripes until there has been a chance that several write requests
+ * for the one stripe have all been collected.
+ * In particular, any write request that would require pre-reading
+ * is put on a "delayed" queue until there are no stripes currently
+ * in a pre-read phase. Further, if the "delayed" queue is empty when
+ * a stripe is put on it then we "plug" the queue and do not process it
+ * until an unplug call is made. (the unplug_io_fn() is called).
+ *
+ * When preread is initiated on a stripe, we set PREREAD_ACTIVE and add
+ * it to the count of prereading stripes.
+ * When write is initiated, or the stripe refcnt == 0 (just in case) we
+ * clear the PREREAD_ACTIVE flag and decrement the count
+ * Whenever the 'handle' queue is empty and the device is not plugged, we
+ * move any strips from delayed to handle and clear the DELAYED flag and set
+ * PREREAD_ACTIVE.
+ * In stripe_handle, if we find pre-reading is necessary, we do it if
+ * PREREAD_ACTIVE is set, else we set DELAYED which will send it to the delayed queue.
+ * HANDLE gets cleared if stripe_handle leaves nothing locked.
+ */
+
+struct disk_info {
+ struct md_rdev *rdev, *replacement;
+};
+
+/* NOTE NR_STRIPE_HASH_LOCKS must remain below 64.
+ * This is because we sometimes take all the spinlocks
+ * and creating that much locking depth can cause
+ * problems.
+ */
+#define NR_STRIPE_HASH_LOCKS 8
+#define STRIPE_HASH_LOCKS_MASK (NR_STRIPE_HASH_LOCKS - 1)
+
+struct r5worker {
+ struct work_struct work;
+ struct r5worker_group *group;
+ struct list_head temp_inactive_list[NR_STRIPE_HASH_LOCKS];
+ bool working;
+};
+
+struct r5worker_group {
+ struct list_head handle_list;
+ struct r5conf *conf;
+ struct r5worker *workers;
+ int stripes_cnt;
+};
+
+struct r5conf {
+ struct hlist_head *stripe_hashtbl;
+ /* only protect corresponding hash list and inactive_list */
+ spinlock_t hash_locks[NR_STRIPE_HASH_LOCKS];
+ struct mddev *mddev;
+ int chunk_sectors;
+ int level, algorithm, rmw_level;
+ int max_degraded;
+ int raid_disks;
+ int max_nr_stripes;
+ int min_nr_stripes;
+
+ /* reshape_progress is the leading edge of a 'reshape'
+ * It has value MaxSector when no reshape is happening
+ * If delta_disks < 0, it is the last sector we started work on,
+ * else is it the next sector to work on.
+ */
+ sector_t reshape_progress;
+ /* reshape_safe is the trailing edge of a reshape. We know that
+ * before (or after) this address, all reshape has completed.
+ */
+ sector_t reshape_safe;
+ int previous_raid_disks;
+ int prev_chunk_sectors;
+ int prev_algo;
+ short generation; /* increments with every reshape */
+ seqcount_t gen_lock; /* lock against generation changes */
+ unsigned long reshape_checkpoint; /* Time we last updated
+ * metadata */
+ long long min_offset_diff; /* minimum difference between
+ * data_offset and
+ * new_data_offset across all
+ * devices. May be negative,
+ * but is closest to zero.
+ */
+
+ struct list_head handle_list; /* stripes needing handling */
+ struct list_head hold_list; /* preread ready stripes */
+ struct list_head delayed_list; /* stripes that have plugged requests */
+ struct list_head bitmap_list; /* stripes delaying awaiting bitmap update */
+ struct bio *retry_read_aligned; /* currently retrying aligned bios */
+ struct bio *retry_read_aligned_list; /* aligned bios retry list */
+ atomic_t preread_active_stripes; /* stripes with scheduled io */
+ atomic_t active_aligned_reads;
+ atomic_t pending_full_writes; /* full write backlog */
+ int bypass_count; /* bypassed prereads */
+ int bypass_threshold; /* preread nice */
+ int skip_copy; /* Don't copy data from bio to stripe cache */
+ struct list_head *last_hold; /* detect hold_list promotions */
+
+ atomic_t reshape_stripes; /* stripes with pending writes for reshape */
+ /* unfortunately we need two cache names as we temporarily have
+ * two caches.
+ */
+ int active_name;
+ char cache_name[2][32];
+ struct kmem_cache *slab_cache; /* for allocating stripes */
+
+ int seq_flush, seq_write;
+ int quiesce;
+
+ int fullsync; /* set to 1 if a full sync is needed,
+ * (fresh device added).
+ * Cleared when a sync completes.
+ */
+ int recovery_disabled;
+ /* per cpu variables */
+ struct raid5_percpu {
+ struct page *spare_page; /* Used when checking P/Q in raid6 */
+ struct flex_array *scribble; /* space for constructing buffer
+ * lists and performing address
+ * conversions
+ */
+ } __percpu *percpu;
+#ifdef CONFIG_HOTPLUG_CPU
+ struct notifier_block cpu_notify;
+#endif
+
+ /*
+ * Free stripes pool
+ */
+ atomic_t active_stripes;
+ struct list_head inactive_list[NR_STRIPE_HASH_LOCKS];
+ atomic_t empty_inactive_list_nr;
+ struct llist_head released_stripes;
+ wait_queue_head_t wait_for_stripe;
+ wait_queue_head_t wait_for_overlap;
+ unsigned long cache_state;
+#define R5_INACTIVE_BLOCKED 1 /* release of inactive stripes blocked,
+ * waiting for 25% to be free
+ */
+#define R5_ALLOC_MORE 2 /* It might help to allocate another
+ * stripe.
+ */
+#define R5_DID_ALLOC 4 /* A stripe was allocated, don't allocate
+ * more until at least one has been
+ * released. This avoids flooding
+ * the cache.
+ */
+ struct shrinker shrinker;
+ int pool_size; /* number of disks in stripeheads in pool */
+ spinlock_t device_lock;
+ struct disk_info *disks;
+
+ /* When taking over an array from a different personality, we store
+ * the new thread here until we fully activate the array.
+ */
+ struct md_thread *thread;
+ struct list_head temp_inactive_list[NR_STRIPE_HASH_LOCKS];
+ struct r5worker_group *worker_groups;
+ int group_cnt;
+ int worker_cnt_per_group;
+};
+
+
+/*
+ * Our supported algorithms
+ */
+#define ALGORITHM_LEFT_ASYMMETRIC 0 /* Rotating Parity N with Data Restart */
+#define ALGORITHM_RIGHT_ASYMMETRIC 1 /* Rotating Parity 0 with Data Restart */
+#define ALGORITHM_LEFT_SYMMETRIC 2 /* Rotating Parity N with Data Continuation */
+#define ALGORITHM_RIGHT_SYMMETRIC 3 /* Rotating Parity 0 with Data Continuation */
+
+/* Define non-rotating (raid4) algorithms. These allow
+ * conversion of raid4 to raid5.
+ */
+#define ALGORITHM_PARITY_0 4 /* P or P,Q are initial devices */
+#define ALGORITHM_PARITY_N 5 /* P or P,Q are final devices. */
+
+/* DDF RAID6 layouts differ from md/raid6 layouts in two ways.
+ * Firstly, the exact positioning of the parity block is slightly
+ * different between the 'LEFT_*' modes of md and the "_N_*" modes
+ * of DDF.
+ * Secondly, or order of datablocks over which the Q syndrome is computed
+ * is different.
+ * Consequently we have different layouts for DDF/raid6 than md/raid6.
+ * These layouts are from the DDFv1.2 spec.
+ * Interestingly DDFv1.2-Errata-A does not specify N_CONTINUE but
+ * leaves RLQ=3 as 'Vendor Specific'
+ */
+
+#define ALGORITHM_ROTATING_ZERO_RESTART 8 /* DDF PRL=6 RLQ=1 */
+#define ALGORITHM_ROTATING_N_RESTART 9 /* DDF PRL=6 RLQ=2 */
+#define ALGORITHM_ROTATING_N_CONTINUE 10 /*DDF PRL=6 RLQ=3 */
+
+/* For every RAID5 algorithm we define a RAID6 algorithm
+ * with exactly the same layout for data and parity, and
+ * with the Q block always on the last device (N-1).
+ * This allows trivial conversion from RAID5 to RAID6
+ */
+#define ALGORITHM_LEFT_ASYMMETRIC_6 16
+#define ALGORITHM_RIGHT_ASYMMETRIC_6 17
+#define ALGORITHM_LEFT_SYMMETRIC_6 18
+#define ALGORITHM_RIGHT_SYMMETRIC_6 19
+#define ALGORITHM_PARITY_0_6 20
+#define ALGORITHM_PARITY_N_6 ALGORITHM_PARITY_N
+
+static inline int algorithm_valid_raid5(int layout)
+{
+ return (layout >= 0) &&
+ (layout <= 5);
+}
+static inline int algorithm_valid_raid6(int layout)
+{
+ return (layout >= 0 && layout <= 5)
+ ||
+ (layout >= 8 && layout <= 10)
+ ||
+ (layout >= 16 && layout <= 20);
+}
+
+static inline int algorithm_is_DDF(int layout)
+{
+ return layout >= 8 && layout <= 10;
+}
+
+extern void md_raid5_kick_device(struct r5conf *conf);
+extern int raid5_set_cache_size(struct mddev *mddev, int size);
+#endif